[PATCH 2/3] IPVS: add wlib & wlip schedulers

From: Chris Caputo
Date: Tue Jan 20 2015 - 18:21:34 EST


On Tue, 20 Jan 2015, Julian Anastasov wrote:
> > + (u64)dr * (u64)lwgt < (u64)lr * (u64)dwgt ||
[...]
> > + (dr == lr && dwgt > lwgt)) {
>
> Above check is redundant.

I accepted your feedback and applied it to the below, except for this
item. I believe if dr and lr are zero (no traffic), we still want to
choose the higher weight, thus a separate comparison is needed.

Thanks,
Chris

From: Chris Caputo <ccaputo@xxxxxxx>

IPVS wlib (Weighted Least Incoming Byterate) and wlip (Weighted Least Incoming
Packetrate) schedulers, updated for 3.19-rc5.

Signed-off-by: Chris Caputo <ccaputo@xxxxxxx>
---
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig linux-3.19-rc5/net/netfilter/ipvs/Kconfig
--- linux-3.19-rc5-stock/net/netfilter/ipvs/Kconfig 2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Kconfig 2015-01-20 08:08:28.883080285 +0000
@@ -240,6 +240,26 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.

+config IP_VS_WLIB
+ tristate "weighted least incoming byterate scheduling"
+ ---help---
+ The weighted least incoming byterate scheduling algorithm directs
+ network connections to the server with the least incoming byterate
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WLIP
+ tristate "weighted least incoming packetrate scheduling"
+ ---help---
+ The weighted least incoming packetrate scheduling algorithm directs
+ network connections to the server with the least incoming packetrate
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
comment 'IPVS SH scheduler'

config IP_VS_SH_TAB_BITS
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile linux-3.19-rc5/net/netfilter/ipvs/Makefile
--- linux-3.19-rc5-stock/net/netfilter/ipvs/Makefile 2015-01-18 06:02:20.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/Makefile 2015-01-20 08:08:28.883080285 +0000
@@ -33,6 +33,8 @@ obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_WLIB) += ip_vs_wlib.o
+obj-$(CONFIG_IP_VS_WLIP) += ip_vs_wlip.o

# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlib.c 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlib.c 2015-01-20 08:09:00.177816054 +0000
@@ -0,0 +1,166 @@
+/* IPVS: Weighted Least Incoming Byterate Scheduling module
+ *
+ * Authors: Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ * Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ * Peter Kese <peter.kese@xxxxxx>
+ * Julian Anastasov <ja@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIB algorithm uses the results of the estimator's inbps
+ * calculations to determine which real server has the lowest incoming
+ * byterate.
+ *
+ * Real server weight is factored into the calculation. For example, if
+ * you have one server that can handle 100 Mbps of input and another that
+ * can handle 1 Gbps, you could set their weights to 100 and 1000
+ * respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlib_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations; /* scan position: list head */
+ return 0;
+}
+
+static int
+ip_vs_wlib_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ /* If the saved position is this dest: it is already unlinked, so
+ * p->prev is not valid but p->next is; use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+/* WLIB scheduling: pick the dest with the lowest inbps/weight, or NULL */
+static struct ip_vs_dest *
+ip_vs_wlib_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last, *least = NULL;
+ int pass = 0;
+ u64 dr, lr = -1; /* rate of current dest, rate of least */
+ u32 dwgt, lwgt = 0; /* weight of current dest, weight of least */
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* We calculate the load of each dest server as follows:
+ * (dest inbps rate) / dest->weight
+ *
+ * Comparing dr * lwgt < lr * dwgt is equivalent to comparing
+ * dr / dwgt < lr / lwgt when both weights are larger than zero.
+ *
+ * A server with weight=0 is quiesced and will not receive any
+ * new connections.
+ *
+ * When both rates are zero (inactivity) the higher weight wins. If
+ * that is still a tie, round robin applies: the scan starts after the
+ * dest chosen last time (svc->sched_data) and the first found wins.
+ */
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ do {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ dwgt = (u32)atomic_read(&dest->weight);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ dwgt > 0) {
+ spin_lock(&dest->stats.lock);
+ /* estimator's scaling doesn't matter */
+ dr = dest->stats.est.inbps;
+ spin_unlock(&dest->stats.lock);
+
+ if (!least ||
+ dr * lwgt < lr * dwgt ||
+ (!dr && !lr && dwgt > lwgt)) {
+ least = dest;
+ lr = dr;
+ lwgt = dwgt;
+ }
+ }
+
+ if (dest == last) /* wrapped around to the starting entry */
+ goto stop;
+ }
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ if (least)
+ svc->sched_data = &least->n_list;
+
+ spin_unlock_bh(&svc->sched_lock);
+
+ if (least) {
+ IP_VS_DBG_BUF(6,
+ "WLIB: server %s:%u activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight));
+ } else {
+ ip_vs_scheduler_err(svc, "no destination available");
+ }
+
+ return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlib_scheduler = {
+ .name = "wlib",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wlib_scheduler.n_list),
+ .init_service = ip_vs_wlib_init_svc,
+ .add_dest = NULL, /* no hook: nothing is done when a dest is added */
+ .del_dest = ip_vs_wlib_del_dest,
+ .schedule = ip_vs_wlib_schedule,
+};
+
+static int __init ip_vs_wlib_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+}
+
+static void __exit ip_vs_wlib_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+ synchronize_rcu(); /* wait out an RCU grace period before unload */
+}
+
+module_init(ip_vs_wlib_init);
+module_exit(ip_vs_wlib_cleanup);
+MODULE_LICENSE("GPL");
diff -uprN linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c
--- linux-3.19-rc5-stock/net/netfilter/ipvs/ip_vs_wlip.c 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.19-rc5/net/netfilter/ipvs/ip_vs_wlip.c 2015-01-20 08:09:07.456126624 +0000
@@ -0,0 +1,166 @@
+/* IPVS: Weighted Least Incoming Packetrate Scheduling module
+ *
+ * Authors: Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ * Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ * Peter Kese <peter.kese@xxxxxx>
+ * Julian Anastasov <ja@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Chris Caputo: Based code on ip_vs_wlc.c and ip_vs_rr.c.
+ *
+ */
+
+/* The WLIP algorithm uses the results of the estimator's inpps
+ * calculations to determine which real server has the lowest incoming
+ * packetrate.
+ *
+ * Real server weight is factored into the calculation. For example, if
+ * you have one server that can handle 10 Kpps of input and another that
+ * can handle 100 Kpps, you could set their weights to 10 and 100
+ * respectively.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+static int
+ip_vs_wlip_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations; /* scan position: list head */
+ return 0;
+}
+
+static int
+ip_vs_wlip_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ /* If the saved position is this dest: it is already unlinked, so
+ * p->prev is not valid but p->next is; use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+/* WLIP scheduling: pick the dest with the lowest inpps/weight, or NULL */
+static struct ip_vs_dest *
+ip_vs_wlip_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last, *least = NULL;
+ int pass = 0;
+ u32 dr, lr = -1; /* rates; widened to u64 when multiplied */
+ u32 dwgt, lwgt = 0; /* weight of current dest, weight of least */
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* We calculate the load of each dest server as follows:
+ * (dest inpps rate) / dest->weight
+ *
+ * Comparing (u64)dr * lwgt < (u64)lr * dwgt is equivalent to comparing
+ * dr / dwgt < lr / lwgt when both weights are larger than zero.
+ *
+ * A server with weight=0 is quiesced and will not receive any
+ * new connections.
+ *
+ * When both rates are zero (inactivity) the higher weight wins. If
+ * that is still a tie, round robin applies: the scan starts after the
+ * dest chosen last time (svc->sched_data) and the first found wins.
+ */
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ do {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ dwgt = (u32)atomic_read(&dest->weight);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ dwgt > 0) {
+ spin_lock(&dest->stats.lock);
+ /* estimator's scaling doesn't matter */
+ dr = dest->stats.est.inpps;
+ spin_unlock(&dest->stats.lock);
+
+ if (!least ||
+ (u64)dr * lwgt < (u64)lr * dwgt ||
+ (!dr && !lr && dwgt > lwgt)) {
+ least = dest;
+ lr = dr;
+ lwgt = dwgt;
+ }
+ }
+
+ if (dest == last) /* wrapped around to the starting entry */
+ goto stop;
+ }
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ if (least)
+ svc->sched_data = &least->n_list;
+
+ spin_unlock_bh(&svc->sched_lock);
+
+ if (least) {
+ IP_VS_DBG_BUF(6,
+ "WLIP: server %s:%u activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight));
+ } else {
+ ip_vs_scheduler_err(svc, "no destination available");
+ }
+
+ return least;
+}
+
+static struct ip_vs_scheduler ip_vs_wlip_scheduler = {
+ .name = "wlip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wlip_scheduler.n_list),
+ .init_service = ip_vs_wlip_init_svc,
+ .add_dest = NULL, /* no hook: nothing is done when a dest is added */
+ .del_dest = ip_vs_wlip_del_dest,
+ .schedule = ip_vs_wlip_schedule,
+};
+
+static int __init ip_vs_wlip_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+}
+
+static void __exit ip_vs_wlip_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+ synchronize_rcu(); /* wait out an RCU grace period before unload */
+}
+
+module_init(ip_vs_wlip_init);
+module_exit(ip_vs_wlip_cleanup);
+MODULE_LICENSE("GPL");
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/