[tip:x86/urgent] x86/UV2: Work around BAU bug

From: tip-bot for Cliff Wickman
Date: Tue Jan 17 2012 - 12:22:09 EST


Commit-ID: c5d35d399e685acccc85a675e8765c26b2a9813a
Gitweb: http://git.kernel.org/tip/c5d35d399e685acccc85a675e8765c26b2a9813a
Author: Cliff Wickman <cpw@xxxxxxx>
AuthorDate: Mon, 16 Jan 2012 15:19:47 -0600
Committer: Ingo Molnar <mingo@xxxxxxx>
CommitDate: Tue, 17 Jan 2012 09:09:54 +0100

x86/UV2: Work around BAU bug

This patch implements a workaround for a UV2 hardware bug.
The bug is a non-atomic update of a memory-mapped register. When
hardware message delivery and software message acknowledge occur
simultaneously the pending message acknowledge for the arriving
message may be lost. This causes the sender's message status to
stay busy.

Part of the workaround is to not acknowledge a completed message
until it is verified that no other message is actually using the
resource that is mistakenly recorded in the completed message.

Part of the workaround is to test for long elapsed time in such
a busy condition, then handle it by using a spare sending
descriptor. The stay-busy condition is eventually timed out by
hardware, and then the original sending descriptor can be
re-used. Most of that logic change is in keeping track of the
current descriptor and the state of the spares.

The occurrences of the workaround are added to the BAU
statistics.

Signed-off-by: Cliff Wickman <cpw@xxxxxxx>
Link: http://lkml.kernel.org/r/20120116211947.GC5767@xxxxxxx
Cc: <stable@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
---
arch/x86/include/asm/uv/uv_bau.h | 13 ++-
arch/x86/platform/uv/tlb_uv.c | 274 +++++++++++++++++++++++++++++++++-----
2 files changed, 254 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 4a46b27..1b82f7e 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -167,6 +167,7 @@
#define FLUSH_RETRY_TIMEOUT 2
#define FLUSH_GIVEUP 3
#define FLUSH_COMPLETE 4
+#define FLUSH_RETRY_BUSYBUG 5

/*
* tuning the action when the numalink network is extremely delayed
@@ -463,7 +464,6 @@ struct bau_pq_entry {
struct msg_desc {
struct bau_pq_entry *msg;
int msg_slot;
- int swack_slot;
struct bau_pq_entry *queue_first;
struct bau_pq_entry *queue_last;
};
@@ -517,6 +517,9 @@ struct ptc_stats {
unsigned long s_retry_messages; /* retry broadcasts */
unsigned long s_bau_reenabled; /* for bau enable/disable */
unsigned long s_bau_disabled; /* for bau enable/disable */
+ unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
+ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
+ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
/* destination statistics */
unsigned long d_alltlb; /* times all tlb's on this
cpu were flushed */
@@ -593,6 +596,8 @@ struct bau_control {
short cpus_in_socket;
short cpus_in_uvhub;
short partition_base_pnode;
+ short using_desc; /* an index, like uvhub_cpu */
+ unsigned int inuse_map;
unsigned short message_number;
unsigned short uvhub_quiesce;
short socket_acknowledge_count[DEST_Q_SIZE];
@@ -610,6 +615,7 @@ struct bau_control {
int cong_response_us;
int cong_reps;
int cong_period;
+ unsigned long clocks_per_100_usec;
cycles_t period_time;
long period_requests;
struct hub_and_pnode *thp;
@@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr)
uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
}

+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
static inline unsigned long read_mmr_sw_ack(void)
{
return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c425ff1..9010ca7 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub)
* clear of the Timeout bit (as well) will free the resource. No reply will
* be sent (the hardware will only do one reply per message).
*/
-static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+ int do_acknowledge)
{
unsigned long dw;
struct bau_pq_entry *msg;

msg = mdp->msg;
- if (!msg->canceled) {
+ if (!msg->canceled && do_acknowledge) {
dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
write_mmr_sw_ack(dw);
}
@@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
unsigned long mr;
/*
- * is the resource timed out?
- * make everyone ignore the cancelled message.
+ * Is the resource timed out?
+ * Make everyone ignore the cancelled message.
*/
msg2->canceled = 1;
stat->d_canceled++;
@@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
* Do all the things a cpu should do for a TLB shootdown message.
* Other cpu's may come here at the same time for this message.
*/
-static void bau_process_message(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+ int do_acknowledge)
{
short socket_ack_count = 0;
short *sp;
@@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp,
if (msg_ack_count == bcp->cpus_in_uvhub) {
/*
* All cpus in uvhub saw it; reply
+ * (unless we are in the UV2 workaround)
*/
- reply_to_message(mdp, bcp);
+ reply_to_message(mdp, bcp, do_acknowledge);
}
}

@@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
/*
* UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
*/
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
{
unsigned long descriptor_status;
unsigned long descriptor_status2;

descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
- descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+ descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
descriptor_status = (descriptor_status << 1) | descriptor_status2;
return descriptor_status;
}

+/*
+ * Return whether the status of the descriptor that is normally used for this
+ * cpu (the one indexed by its hub-relative cpu number) is busy.
+ * The status of the original 32 descriptors is always reflected in the 64
+ * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
+ * The bit provided by the activation_status_2 register is irrelevant to
+ * the status if it is only being tested for busy or not busy.
+ */
+int normal_busy(struct bau_control *bcp)
+{
+ int cpu = bcp->uvhub_cpu;
+ int mmr_offset;
+ int right_shift;
+
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ right_shift = cpu * UV_ACT_STATUS_SIZE;
+ return (((((read_lmmr(mmr_offset) >> right_shift) &
+ UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Workaround the bug.
+ */
+int handle_uv2_busy(struct bau_control *bcp)
+{
+ int busy_one = bcp->using_desc;
+ int normal = bcp->uvhub_cpu;
+ int selected = -1;
+ int i;
+ unsigned long descriptor_status;
+ unsigned long status;
+ int mmr_offset;
+ struct bau_desc *bau_desc_old;
+ struct bau_desc *bau_desc_new;
+ struct bau_control *hmaster = bcp->uvhub_master;
+ struct ptc_stats *stat = bcp->statp;
+ cycles_t ttm;
+
+ stat->s_uv2_wars++;
+ spin_lock(&hmaster->uvhub_lock);
+ /* try for the original first */
+ if (busy_one != normal) {
+ if (!normal_busy(bcp))
+ selected = normal;
+ }
+ if (selected < 0) {
+ /* can't use the normal, select an alternate */
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ descriptor_status = read_lmmr(mmr_offset);
+
+ /* scan available descriptors 32-63 */
+ for (i = 0; i < UV_CPUS_PER_AS; i++) {
+ if ((hmaster->inuse_map & (1 << i)) == 0) {
+ status = ((descriptor_status >>
+ (i * UV_ACT_STATUS_SIZE)) &
+ UV_ACT_STATUS_MASK) << 1;
+ if (status != UV2H_DESC_BUSY) {
+ selected = i + UV_CPUS_PER_AS;
+ break;
+ }
+ }
+ }
+ }
+
+ if (busy_one != normal)
+ /* mark the busy alternate as not in-use */
+ hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
+
+ if (selected >= 0) {
+ /* switch to the selected descriptor */
+ if (selected != normal) {
+ /* set the selected alternate as in-use */
+ hmaster->inuse_map |=
+ (1 << (selected - UV_CPUS_PER_AS));
+ if (selected > stat->s_uv2_wars_hw)
+ stat->s_uv2_wars_hw = selected;
+ }
+ bau_desc_old = bcp->descriptor_base;
+ bau_desc_old += (ITEMS_PER_DESC * busy_one);
+ bcp->using_desc = selected;
+ bau_desc_new = bcp->descriptor_base;
+ bau_desc_new += (ITEMS_PER_DESC * selected);
+ *bau_desc_new = *bau_desc_old;
+ } else {
+ /*
+ * All are busy. Wait for the normal one for this cpu to
+ * free up.
+ */
+ stat->s_uv2_war_waits++;
+ spin_unlock(&hmaster->uvhub_lock);
+ ttm = get_cycles();
+ do {
+ cpu_relax();
+ } while (normal_busy(bcp));
+ spin_lock(&hmaster->uvhub_lock);
+ /* switch to the original descriptor */
+ bcp->using_desc = normal;
+ bau_desc_old = bcp->descriptor_base;
+ bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
+ bcp->using_desc = (ITEMS_PER_DESC * normal);
+ bau_desc_new = bcp->descriptor_base;
+ bau_desc_new += (ITEMS_PER_DESC * normal);
+ *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
+ }
+ spin_unlock(&hmaster->uvhub_lock);
+ return FLUSH_RETRY_BUSYBUG;
+}
+
static int uv2_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
unsigned long descriptor_stat;
cycles_t ttm;
- int cpu = bcp->uvhub_cpu;
+ int desc = bcp->using_desc;
+ long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;

- descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);

/* spin on the status MMR, waiting for it to go idle */
while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
bcp->conseccompletes = 0;
return FLUSH_RETRY_TIMEOUT;
} else {
+ busy_reps++;
+ if (busy_reps > 1000000) {
+ /* not to hammer on the clock */
+ busy_reps = 0;
+ ttm = get_cycles();
+ if ((ttm - bcp->send_message) >
+ (bcp->clocks_per_100_usec)) {
+ return handle_uv2_busy(bcp);
+ }
+ }
/*
* descriptor_stat is still BUSY
*/
cpu_relax();
}
- descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift,
+ desc);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
@@ -563,14 +687,14 @@ static int wait_completion(struct bau_desc *bau_desc,
{
int right_shift;
unsigned long mmr_offset;
- int cpu = bcp->uvhub_cpu;
+ int desc = bcp->using_desc;

- if (cpu < UV_CPUS_PER_AS) {
+ if (desc < UV_CPUS_PER_AS) {
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = cpu * UV_ACT_STATUS_SIZE;
+ right_shift = desc * UV_ACT_STATUS_SIZE;
} else {
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+ right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
}

if (bcp->uvhub_version == 1)
@@ -752,8 +876,7 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
* Returns 1 if it gives up entirely and the original cpu mask is to be
* returned to the kernel.
*/
-int uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
{
int seq_number = 0;
int completion_stat = 0;
@@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
struct bau_control *hmaster = bcp->uvhub_master;
struct uv1_bau_msg_header *uv1_hdr = NULL;
struct uv2_bau_msg_header *uv2_hdr = NULL;
+ struct bau_desc *bau_desc;

- if (bcp->uvhub_version == 1) {
- uv1 = 1;
+ if (bcp->uvhub_version == 1)
uv1_throttle(hmaster, stat);
- uv1_hdr = &bau_desc->header.uv1_hdr;
- } else
- uv2_hdr = &bau_desc->header.uv2_hdr;

while (hmaster->uvhub_quiesce)
cpu_relax();

time1 = get_cycles();
do {
- if (try == 0) {
+ bau_desc = bcp->descriptor_base;
+ bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+ if (bcp->uvhub_version == 1) {
+ uv1 = 1;
+ uv1_hdr = &bau_desc->header.uv1_hdr;
+ } else
+ uv2_hdr = &bau_desc->header.uv2_hdr;
+ if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
if (uv1)
uv1_hdr->msg_type = MSG_REGULAR;
else
@@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
uv1_hdr->sequence = seq_number;
else
uv2_hdr->sequence = seq_number;
- index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+ index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
bcp->send_message = get_cycles();

write_mmr_activation(index);

try++;
completion_stat = wait_completion(bau_desc, bcp, try);
+ /* UV2: wait_completion() may change the bcp->using_desc */

handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);

@@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
}
cpu_relax();
} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+ (completion_stat == FLUSH_RETRY_BUSYBUG) ||
(completion_stat == FLUSH_RETRY_TIMEOUT));

time2 = get_cycles();
@@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
record_send_stats(time1, time2, bcp, stat, completion_stat, try);

if (completion_stat == FLUSH_GIVEUP)
+ /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
return 1;
return 0;
}
@@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
stat->s_ntargself++;

bau_desc = bcp->descriptor_base;
- bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
+ bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
return NULL;
@@ -996,13 +1126,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
* or 1 if it gave up and the original cpumask should be returned.
*/
- if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+ if (!uv_flush_send_and_wait(flush_mask, bcp))
return NULL;
else
return cpumask;
}

/*
+ * Search the message queue for any 'other' message with the same software
+ * acknowledge resource bit vector.
+ */
+struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+ struct bau_control *bcp, unsigned char swack_vec)
+{
+ struct bau_pq_entry *msg_next = msg + 1;
+
+ if (msg_next > bcp->queue_last)
+ msg_next = bcp->queue_first;
+ while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
+ if (msg_next->swack_vec == swack_vec)
+ return msg_next;
+ msg_next++;
+ if (msg_next > bcp->queue_last)
+ msg_next = bcp->queue_first;
+ }
+ return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * Such a message must be ignored.
+ */
+void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+ unsigned long mmr_image;
+ unsigned char swack_vec;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_pq_entry *other_msg;
+
+ mmr_image = read_mmr_sw_ack();
+ swack_vec = msg->swack_vec;
+
+ if ((swack_vec & mmr_image) == 0) {
+ /*
+ * This message was assigned a swack resource, but no
+ * reserved acknowlegment is pending.
+ * The bug has prevented this message from setting the MMR.
+ * And no other message has used the same sw_ack resource.
+ * Do the requested shootdown but do not reply to the msg.
+ * (the 0 means make no acknowledge)
+ */
+ bau_process_message(mdp, bcp, 0);
+ return;
+ }
+
+ /*
+ * Some message has set the MMR 'pending' bit; it might have been
+ * another message. Look for that message.
+ */
+ other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
+ if (other_msg) {
+ /* There is another. Do not ack the current one. */
+ bau_process_message(mdp, bcp, 0);
+ /*
+ * Let the natural processing of that message acknowledge
+ * it. Don't get the processing of sw_ack's out of order.
+ */
+ return;
+ }
+
+ /*
+ * There is no other message using this sw_ack, so it is safe to
+ * acknowledge it.
+ */
+ bau_process_message(mdp, bcp, 1);
+
+ return;
+}
+
+/*
* The BAU message interrupt comes here. (registered by set_intr_gate)
* See entry_64.S
*
@@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
count++;

msgdesc.msg_slot = msg - msgdesc.queue_first;
- msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
msgdesc.msg = msg;
- bau_process_message(&msgdesc, bcp);
+ if (bcp->uvhub_version == 2)
+ process_uv2_message(&msgdesc, bcp);
+ else
+ bau_process_message(&msgdesc, bcp, 1);

msg++;
if (msg > msgdesc.queue_last)
@@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file,
"all one mult none retry canc nocan reset rcan ");
seq_printf(file,
- "disable enable\n");
+ "disable enable wars warshw warwaits\n");
}
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
@@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
stat->d_nomsg, stat->d_retries, stat->d_canceled,
stat->d_nocanceled, stat->d_resets,
stat->d_rcanceled);
- seq_printf(file, "%ld %ld\n",
- stat->s_bau_disabled, stat->s_bau_reenabled);
+ seq_printf(file, "%ld %ld %ld %ld %ld\n",
+ stat->s_bau_disabled, stat->s_bau_reenabled,
+ stat->s_uv2_wars, stat->s_uv2_wars_hw,
+ stat->s_uv2_war_waits);
}
return 0;
}
@@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode)
write_mmr_payload_first(pnode, pn_first);
write_mmr_payload_tail(pnode, first);
write_mmr_payload_last(pnode, last);
+ write_gmmr_sw_ack(pnode, 0xffffUL);

/* in effect, all msg_type's are set to MSG_NOOP */
memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables(void)
bcp->cong_response_us = congested_respns_us;
bcp->cong_reps = congested_reps;
bcp->cong_period = congested_period;
+ bcp->clocks_per_100_usec = usec_2_cycles(100);
}
}

@@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
}
bcp->uvhub_master = *hmasterp;
bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ bcp->using_desc = bcp->uvhub_cpu;
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
printk(KERN_EMERG "%d cpus per uvhub invalid\n",
bcp->uvhub_cpu);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/