[patch 18/29] xen: deal with negative stolen time

From: Jeremy Fitzhardinge
Date: Fri May 04 2007 - 19:53:30 EST


Stolen time should never be negative; if it ever is, it probably
indicates some other bug. However, if it does happen, then it's better
to just clamp it at zero, rather than trying to account for it as a
huge positive number.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>
Acked-by: Chris Wright <chrisw@xxxxxxxxxxxx>

---
arch/i386/xen/smp.c | 4 +
arch/i386/xen/time.c | 112 ++++++++++++++++++++++++++++++++---------------
arch/i386/xen/xen-ops.h | 3 -
3 files changed, 83 insertions(+), 36 deletions(-)

===================================================================
--- a/arch/i386/xen/smp.c
+++ b/arch/i386/xen/smp.c
@@ -72,10 +72,11 @@ static __cpuinit void cpu_bringup_and_id
int cpu = smp_processor_id();

cpu_init();
- xen_setup_timer();

preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+ xen_setup_cpu_clockevents();

/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -263,6 +264,7 @@ int __cpuinit xen_cpu_up(unsigned int cp
per_cpu(current_task, cpu) = idle;
xen_vcpu_setup(cpu);
irq_ctx_init(cpu);
+ xen_setup_timer(cpu);

/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
===================================================================
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -49,6 +49,35 @@ static DEFINE_PER_CPU(u64, residual_stol
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

+/* return a consistent snapshot of a 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+ u64 ret;
+
+ if (BITS_PER_LONG < 64) {
+ u32 *p32 = (u32 *)p;
+ u32 h, l;
+
+ /*
+ * Read high then low, and then make sure high is
+ * still the same; this will only loop if low wraps
+ * and carries into high.
+ * XXX some clean way to make this endian-proof?
+ */
+ do {
+ h = p32[1];
+ barrier();
+ l = p32[0];
+ barrier();
+ } while (p32[1] != h);
+
+ ret = (((u64)h) << 32) | l;
+ } else
+ ret = *p;
+
+ return ret;
+}
+
/*
* Runstate accounting
*/
@@ -67,31 +96,29 @@ static void get_runstate_snapshot(struct
* stronger than a compiler barrier when fetching it.
*/
do {
- state_time = state->state_entry_time;
+ state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
- } while(state->state_entry_time != state_time);
-}
-
-static void setup_runstate_info(void)
+ } while(get64(&state->state_entry_time) != state_time);
+}
+
+static void setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;

- area.addr.v = &__get_cpu_var(runstate);
+ area.addr.v = &per_cpu(runstate, cpu);

if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
- smp_processor_id(), &area))
+ cpu, &area))
BUG();
-
- get_runstate_snapshot(&__get_cpu_var(runstate_snapshot));
}

static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
- u64 blocked, runnable, offline, stolen;
+ s64 blocked, runnable, offline, stolen;
cputime_t ticks;

get_runstate_snapshot(&state);
@@ -111,6 +138,10 @@ static void do_stolen_accounting(void)
including any left-overs from last time. Passing NULL to
account_steal_time accounts the time as stolen. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+ if (stolen < 0)
+ stolen = 0;
+
ticks = 0;
while(stolen >= NS_PER_TICK) {
ticks++;
@@ -123,6 +154,10 @@ static void do_stolen_accounting(void)
including any left-overs from last time. Passing idle to
account_steal_time accounts the time as idle/wait. */
blocked += __get_cpu_var(residual_blocked);
+
+ if (blocked < 0)
+ blocked = 0;
+
ticks = 0;
while(blocked >= NS_PER_TICK) {
ticks++;
@@ -141,7 +176,8 @@ unsigned long long xen_sched_clock(void)
{
struct vcpu_runstate_info state;
cycle_t now;
- unsigned long long ret;
+ u64 ret;
+ s64 offset;

/*
* Ideally sched_clock should be called on a per-cpu basis
@@ -156,9 +192,13 @@ unsigned long long xen_sched_clock(void)

WARN_ON(state.state != RUNSTATE_running);

+ offset = now - state.state_entry_time;
+ if (offset < 0)
+ offset = 0;
+
ret = state.time[RUNSTATE_blocked] +
state.time[RUNSTATE_running] +
- (now - state.state_entry_time);
+ offset;

preempt_enable();

@@ -186,12 +226,10 @@ unsigned long xen_cpu_khz(void)
* Reads a consistent set of time-base values from Xen, into a shadow data
* area.
*/
-static void get_time_values_from_xen(void)
+static unsigned get_time_values_from_xen(void)
{
struct vcpu_time_info *src;
struct shadow_time_info *dst;
-
- preempt_disable();

src = &__get_cpu_var(xen_vcpu)->time;
dst = &__get_cpu_var(shadow_time);
@@ -206,7 +244,7 @@ static void get_time_values_from_xen(voi
rmb();
} while ((src->version & 1) | (dst->version ^ src->version));

- preempt_enable();
+ return dst->version;
}

/*
@@ -250,7 +288,7 @@ static u64 get_nsec_offset(struct shadow
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
u64 now, delta;
- rdtscll(now);
+ now = native_read_tsc();
delta = now - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
@@ -259,10 +297,14 @@ static cycle_t xen_clocksource_read(void
{
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
cycle_t ret;
-
- get_time_values_from_xen();
-
- ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ unsigned version;
+
+ do {
+ version = get_time_values_from_xen();
+ barrier();
+ ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ barrier();
+ } while(version != __get_cpu_var(xen_vcpu)->time.version);

put_cpu_var(shadow_time);

@@ -484,9 +526,8 @@ static irqreturn_t xen_timer_interrupt(i
return ret;
}

-void xen_setup_timer(void)
-{
- int cpu = smp_processor_id();
+void xen_setup_timer(int cpu)
+{
const char *name;
struct clock_event_device *evt;
int irq;
@@ -501,23 +542,25 @@ void xen_setup_timer(void)
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name, NULL);

- evt = &get_cpu_var(xen_clock_events);
+ evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));

evt->cpumask = cpumask_of_cpu(cpu);
evt->irq = irq;
- clockevents_register_device(evt);
-
- setup_runstate_info();
-
- put_cpu_var(xen_clock_events);
+
+ setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+ BUG_ON(preemptible());
+
+ clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

__init void xen_time_init(void)
{
int cpu = smp_processor_id();
-
- get_time_values_from_xen();

clocksource_register(&xen_clocksource);

@@ -535,5 +578,6 @@ __init void xen_time_init(void)

tsc_disable = 0;

- xen_setup_timer();
-}
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
===================================================================
--- a/arch/i386/xen/xen-ops.h
+++ b/arch/i386/xen/xen-ops.h
@@ -25,7 +25,8 @@ unsigned long xen_get_wallclock(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
-void xen_setup_timer(void);
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);

void xen_mark_init_mm_pinned(void);


--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/