[PATCH 2/2] swap: use separate priority list for available swap_infos
From: Dan Streetman
Date: Sat Apr 12 2014 - 17:04:18 EST
Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
both of which were intended to point to the highest priority active swap
target that was not full. The previous patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; it starts scanning at the highest
priority entry each time, even if the entry is full.
Add a new list, also priority ordered, to track only swap_info_structs
that are available, i.e. active and not full. Use a new spinlock so that
entries can be added/removed outside of get_swap_page; that wasn't possible
previously because the main list is protected by swap_lock, which can't be
taken when holding a swap_info_struct->lock because of locking order.
The get_swap_page() logic now does not need to hold the swap_lock, and it
iterates only through swap_info_structs that are available.
Signed-off-by: Dan Streetman <ddstreet@xxxxxxxx>
---
include/linux/swap.h | 1 +
mm/swapfile.c | 128 ++++++++++++++++++++++++++++++++++-----------------
2 files changed, 87 insertions(+), 42 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 96662d8..d9263db 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,6 +214,7 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
+ struct list_head prio_list; /* entry in priority list */
struct list_head list; /* entry in swap list */
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b958645..3c38461 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -57,9 +57,13 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-/* all active swap_info */
+/* all active swap_info; protected with swap_lock */
LIST_HEAD(swap_list_head);
+/* all available (active, not full) swap_info, priority ordered */
+static LIST_HEAD(prio_head);
+static DEFINE_SPINLOCK(prio_lock);
+
struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -73,6 +77,27 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
}
+/*
+ * insert swap_info (p) via its list_head member (le) into list (lh), before
+ * the first entry with prio <= (p)->prio, else at the tail; (p)->le must be
+ * unlinked (BUG otherwise). list-generic since two lists are prio ordered:
+ * swap_list_head, for swapoff to adjust lower negative prio swap_infos
+ * prio_head, for get_swap_page to scan highest prio swap_info first
+ */
+#define swap_info_list_add(p, lh, le) do { \
+ struct swap_info_struct *_si; \
+ BUG_ON(!list_empty(&(p)->le)); \
+ list_for_each_entry(_si, (lh), le) { \
+ if ((p)->prio >= _si->prio) { \
+ list_add_tail(&(p)->le, &_si->le); \
+ break; \
+ } \
+ } \
+ /* lh empty, or p lowest prio */ \
+ if (list_empty(&(p)->le)) \
+ list_add_tail(&(p)->le, (lh)); \
+} while (0)
+
/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
@@ -591,6 +616,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&prio_lock);
+ list_del_init(&si->prio_list);
+ spin_unlock(&prio_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -642,53 +670,70 @@ swp_entry_t get_swap_page(void)
{
struct swap_info_struct *si, *next;
pgoff_t offset;
- struct list_head *tmp;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- list_for_each(tmp, &swap_list_head) {
- si = list_entry(tmp, typeof(*si), list);
- spin_lock(&si->lock);
- if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
- spin_unlock(&si->lock);
- continue;
- }
-
+ spin_lock(&prio_lock);
+start_over:
+ list_for_each_entry_safe(si, next, &prio_head, prio_list) {
/*
- * rotate the current swap_info that we're going to use
+ * rotate the current swap_info that we're checking
* to after any other swap_info that have the same prio,
* so that all equal-priority swap_info get used equally
*/
- next = si;
- list_for_each_entry_continue(next, &swap_list_head, list) {
- if (si->prio != next->prio)
+ struct swap_info_struct *eq_prio = si;
+ list_for_each_entry_continue(eq_prio, &prio_head, prio_list) {
+ if (si->prio != eq_prio->prio)
break;
- list_rotate_left(&si->list);
- next = si;
+ list_rotate_left(&si->prio_list);
+ eq_prio = si;
+ }
+ spin_unlock(&prio_lock);
+ spin_lock(&si->lock);
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&prio_lock);
+ if (list_empty(&si->prio_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ list_del_init(&si->prio_list);
+ spin_unlock(&si->lock);
+ goto nextsi;
}
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
return swp_entry(si->type, offset);
- spin_lock(&swap_lock);
+ printk(KERN_DEBUG "scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&prio_lock);
+nextsi:
/*
- * shouldn't really have got here, but for some reason the
- * scan_swap_map came back empty for this swap_info.
- * Since we dropped the swap_lock, there may now be
- * non-full higher prio swap_infos; let's start over.
+ * shouldn't really have got here. either si was
+ * in the prio_head list but was full or !writeok, or
+ * scan_swap_map came back empty. Since we dropped
+ * the prio_lock, the prio_head list may have been
+ * modified; so if next is still in the prio_head
+ * list then try it, otherwise start over.
*/
- tmp = &swap_list_head;
+ if (list_empty(&next->prio_list))
+ goto start_over;
}
+ /* every path that leaves the loop normally still holds prio_lock */
+ spin_unlock(&prio_lock);
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -791,8 +834,17 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&prio_lock);
+ if (list_empty(&p->prio_list))
+ swap_info_list_add(p, &prio_head,
+ prio_list);
+ spin_unlock(&prio_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1727,8 +1779,6 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- struct swap_info_struct *si;
-
if (prio >= 0)
p->prio = prio;
else
@@ -1740,20 +1790,10 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
total_swap_pages += p->pages;
assert_spin_locked(&swap_lock);
- BUG_ON(!list_empty(&p->list));
- /* insert into swap list: */
- list_for_each_entry(si, &swap_list_head, list) {
- if (p->prio >= si->prio) {
- list_add_tail(&p->list, &si->list);
- return;
- }
- }
- /*
- * this covers two cases:
- * 1) p->prio is less than all existing prio
- * 2) the swap list is empty
- */
- list_add_tail(&p->list, &swap_list_head);
+ swap_info_list_add(p, &swap_list_head, list);
+ spin_lock(&prio_lock);
+ swap_info_list_add(p, &prio_head, prio_list);
+ spin_unlock(&prio_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1827,6 +1867,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
+ spin_lock(&prio_lock);
+ list_del_init(&p->prio_list);
+ spin_unlock(&prio_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
@@ -2101,6 +2144,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->prio_list);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/