Looks good to me. Just one nit below:
@@ -150,6 +161,7 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ struct rb_node **leftmost,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
@@ -157,6 +169,9 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
struct rb_node *parent, *rebalance;
unsigned long pc;
+ if (leftmost && node == *leftmost)
+ *leftmost = rb_next(node);
+
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
Why do you propagate e.g. 'leftmost' down to __rb_erase_augmented() when
you could just handle everything within rb_erase_augmented_cached?
Similarly for other functions like __rb_insert()... It would seem like less
churn and I don't see downside to it...