Re: Linux 2.6.21.7

From: Greg KH
Date: Sat Aug 04 2007 - 12:34:45 EST


diff --git a/Makefile b/Makefile
index 384ad33..673c4dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 21
-EXTRAVERSION = .6
+EXTRAVERSION = .7
NAME = Nocturnal Monster Puppy

# *DOCUMENTATION*
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 18bddcb..cb1f16c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -371,10 +371,6 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testl $TF_MASK,PT_EFLAGS(%esp)
- jz no_singlestep
- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
# system call tracing in operation / emulation
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -389,6 +385,10 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
+ testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ jz no_singlestep
+ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f72e8e8..a84304e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
*/
discard_lazy_cpu_state();

+ /*
+ * Force reload of FP/VEC.
+ * This has to be done before copying stuff into current->thread.fpr/vr
+ * for the reasons explained in the previous comment.
+ */
+ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);

#ifdef CONFIG_ALTIVEC
@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
current->thread.vrsave = 0;
#endif /* CONFIG_ALTIVEC */

- /* Force reload of FP/VEC */
- regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
-
return err;
}

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 6fd126a..df35d6d 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,6 +72,8 @@ void show_mem(void)

for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ if (!pfn_valid(pgdat->node_start_pfn + i))
+ continue;
page = pfn_to_page(pgdat->node_start_pfn + i);
total++;
if (PageReserved(page))
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index cf9d344..14de1e8 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -1,5 +1,5 @@
/*
- * linux/drivers/ide/pci/hpt366.c Version 1.03 May 4, 2007
+ * linux/drivers/ide/pci/hpt366.c Version 1.04 Jun 4, 2007
*
* Copyright (C) 1999-2003 Andre Hedrick <andre@xxxxxxxxxxxxx>
* Portions Copyright (C) 2001 Sun Microsystems, Inc.
@@ -106,7 +106,8 @@
* switch to calculating PCI clock frequency based on the chip's base DPLL
* frequency
* - switch to using the DPLL clock and enable UltraATA/133 mode by default on
- * anything newer than HPT370/A
+ * anything newer than HPT370/A (except HPT374 that is not capable of this
+ * mode according to the manual)
* - fold PCI clock detection and DPLL setup code into init_chipset_hpt366(),
* also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips;
* unify HPT36x/37x timing setup code and the speedproc handlers by joining
@@ -365,7 +366,6 @@ static u32 sixty_six_base_hpt37x[] = {
};

#define HPT366_DEBUG_DRIVE_INFO 0
-#define HPT374_ALLOW_ATA133_6 1
#define HPT371_ALLOW_ATA133_6 1
#define HPT302_ALLOW_ATA133_6 1
#define HPT372_ALLOW_ATA133_6 1
@@ -450,7 +450,7 @@ static struct hpt_info hpt370a __devinitdata = {

static struct hpt_info hpt374 __devinitdata = {
.chip_type = HPT374,
- .max_mode = HPT374_ALLOW_ATA133_6 ? 4 : 3,
+ .max_mode = 3,
.dpll_clk = 48,
.settings = hpt37x_settings
};
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471e..b9ff4e3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
struct crypt_io {
struct dm_target *target;
struct bio *base_bio;
- struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
@@ -107,6 +106,8 @@ struct crypt_config {

static struct kmem_cache *_crypt_io_pool;

+static void clone_init(struct crypt_io *, struct bio *);
+
/*
* Different IV generation algorithms:
*
@@ -378,25 +379,20 @@ static int crypt_convert(struct crypt_config *cc,
* This should never violate the device limitations
* May return a smaller bio when running out of pages
*/
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
- struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size,
+ unsigned int *bio_vec_idx)
{
+ struct crypt_config *cc = io->target->private;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;

- if (base_bio) {
- clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
- __bio_clone(clone, base_bio);
- } else
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
return NULL;

- clone->bi_destructor = dm_crypt_bio_destructor;
+ clone_init(io, clone);

/* if the last bio was not complete, continue where that one ended */
clone->bi_idx = *bio_vec_idx;
@@ -495,9 +491,6 @@ static void dec_pending(struct crypt_io *io, int error)
if (!atomic_dec_and_test(&io->pending))
return;

- if (io->first_clone)
- bio_put(io->first_clone);
-
bio_endio(io->base_bio, io->base_bio->bi_size, io->error);

mempool_free(io, cc->io_pool);
@@ -562,6 +555,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
+ clone->bi_destructor = dm_crypt_bio_destructor;
}

static void process_read(struct crypt_io *io)
@@ -585,7 +579,6 @@ static void process_read(struct crypt_io *io)
}

clone_init(io, clone);
- clone->bi_destructor = dm_crypt_bio_destructor;
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
clone->bi_size = base_bio->bi_size;
@@ -615,8 +608,7 @@ static void process_write(struct crypt_io *io)
* so repeat the whole process until all the data can be handled.
*/
while (remaining) {
- clone = crypt_alloc_buffer(cc, base_bio->bi_size,
- io->first_clone, &bvec_idx);
+ clone = crypt_alloc_buffer(io, base_bio->bi_size, &bvec_idx);
if (unlikely(!clone)) {
dec_pending(io, -ENOMEM);
return;
@@ -631,31 +623,23 @@ static void process_write(struct crypt_io *io)
return;
}

- clone_init(io, clone);
clone->bi_sector = cc->start + sector;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
-
remaining -= clone->bi_size;
sector += bio_sectors(clone);

- /* prevent bio_put of first_clone */
+ /* Grab another reference to the io struct
+ * before we kick off the request */
if (remaining)
atomic_inc(&io->pending);

generic_make_request(clone);

+ /* Do not reference clone after this - it
+ * may be gone already. */
+
/* out of memory -> run queues */
if (remaining)
- congestion_wait(bio_data_dir(clone), HZ/100);
+ congestion_wait(WRITE, HZ/100);
}
}

@@ -954,10 +938,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
struct crypt_config *cc = ti->private;
struct crypt_io *io;

+ if (bio_barrier(bio))
+ return -EOPNOTSUPP;
+
io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
io->base_bio = bio;
- io->first_clone = NULL;
io->error = io->post_process = 0;
atomic_set(&io->pending, 0);
kcryptd_queue_io(io);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3a95cc5..46677d7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1240,17 +1240,24 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
}
r1_bio->read_disk = primary;
for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+ if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- for (j = vcnt; j-- ; )
- if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
- page_address(sbio->bi_io_vec[j].bv_page),
- PAGE_SIZE))
- break;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
+ }
+ } else
+ j = 0;
if (j >= 0)
mddev->resync_mismatches += r1_bio->sectors;
if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 82249a6..9eb66c1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1867,6 +1867,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int d = r10_bio->devs[i].devnum;
bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL;
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
continue;
@@ -2037,6 +2038,11 @@ static int run(mddev_t *mddev)
/* 'size' is now the number of chunks in the array */
/* calculate "used chunks per device" in 'stride' */
stride = size * conf->copies;
+
+ /* We need to round up when dividing by raid_disks to
+ * get the stride size.
+ */
+ stride += conf->raid_disks - 1;
sector_div(stride, conf->raid_disks);
mddev->size = stride << (conf->chunk_shift-1);

diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c
index 5720b77..d4bef35 100644
--- a/drivers/media/video/bt8xx/bttv-driver.c
+++ b/drivers/media/video/bt8xx/bttv-driver.c
@@ -1313,7 +1313,7 @@ set_tvnorm(struct bttv *btv, unsigned int norm)

/* Call with btv->lock down. */
static void
-set_input(struct bttv *btv, unsigned int input)
+set_input(struct bttv *btv, unsigned int input, unsigned int norm)
{
unsigned long flags;

@@ -1332,7 +1332,7 @@ set_input(struct bttv *btv, unsigned int input)
}
audio_input(btv,(input == bttv_tvcards[btv->c.type].tuner ?
TVAUDIO_INPUT_TUNER : TVAUDIO_INPUT_EXTERN));
- set_tvnorm(btv,btv->tvnorm);
+ set_tvnorm(btv, norm);
i2c_vidiocschan(btv);
}

@@ -1423,7 +1423,7 @@ static void bttv_reinit_bt848(struct bttv *btv)

init_bt848(btv);
btv->pll.pll_current = -1;
- set_input(btv,btv->input);
+ set_input(btv, btv->input, btv->tvnorm);
}

static int get_control(struct bttv *btv, struct v4l2_control *c)
@@ -1993,8 +1993,7 @@ static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg)
return 0;
}

- btv->tvnorm = v->norm;
- set_input(btv,v->channel);
+ set_input(btv, v->channel, v->norm);
mutex_unlock(&btv->lock);
return 0;
}
@@ -2130,7 +2129,7 @@ static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg)
if (*i > bttv_tvcards[btv->c.type].video_inputs)
return -EINVAL;
mutex_lock(&btv->lock);
- set_input(btv,*i);
+ set_input(btv, *i, btv->tvnorm);
mutex_unlock(&btv->lock);
return 0;
}
@@ -4762,7 +4761,7 @@ static int __devinit bttv_probe(struct pci_dev *dev,
bt848_hue(btv,32768);
bt848_sat(btv,32768);
audio_mute(btv, 1);
- set_input(btv,0);
+ set_input(btv, 0, btv->tvnorm);
bttv_crop_reset(&btv->crop[0], btv->tvnorm);
btv->crop[1] = btv->crop[0]; /* current = default */
disclaim_vbi_lines(btv);
diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c
index b0466b8..a80b1cb 100644
--- a/drivers/media/video/cx88/cx88-blackbird.c
+++ b/drivers/media/video/cx88/cx88-blackbird.c
@@ -1034,6 +1034,8 @@ static int vidioc_g_tuner (struct file *file, void *priv,

if (unlikely(UNSET == core->tuner_type))
return -EINVAL;
+ if (0 != t->index)
+ return -EINVAL;

strcpy(t->name, "Television");
t->type = V4L2_TUNER_ANALOG_TV;
diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index dd759d6..36b3fa3 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -1006,7 +1006,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev)
int saa7134_tvaudio_fini(struct saa7134_dev *dev)
{
/* shutdown tvaudio thread */
- if (dev->thread.pid >= 0) {
+ if (dev->thread.pid > 0) {
dev->thread.shutdown = 1;
wake_up_interruptible(&dev->thread.wq);
wait_for_completion(&dev->thread.exit);
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 5006c67..1137291 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -54,8 +54,8 @@

#define DRV_MODULE_NAME "bnx2"
#define PFX DRV_MODULE_NAME ": "
-#define DRV_MODULE_VERSION "1.5.8.1"
-#define DRV_MODULE_RELDATE "May 7, 2007"
+#define DRV_MODULE_VERSION "1.5.8.2"
+#define DRV_MODULE_RELDATE "June 5, 2007"

#define RUN_AT(x) (jiffies + (x))

@@ -1550,6 +1550,7 @@ bnx2_init_context(struct bnx2 *bp)
vcid = 96;
while (vcid) {
u32 vcid_addr, pcid_addr, offset;
+ int i;

vcid--;

@@ -1570,16 +1571,20 @@ bnx2_init_context(struct bnx2 *bp)
pcid_addr = vcid_addr;
}

- REG_WR(bp, BNX2_CTX_VIRT_ADDR, 0x00);
- REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr);
+ for (i = 0; i < (CTX_SIZE / PHY_CTX_SIZE); i++) {
+ vcid_addr += (i << PHY_CTX_SHIFT);
+ pcid_addr += (i << PHY_CTX_SHIFT);

- /* Zero out the context. */
- for (offset = 0; offset < PHY_CTX_SIZE; offset += 4) {
- CTX_WR(bp, 0x00, offset, 0);
- }
+ REG_WR(bp, BNX2_CTX_VIRT_ADDR, 0x00);
+ REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr);

- REG_WR(bp, BNX2_CTX_VIRT_ADDR, vcid_addr);
- REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr);
+ /* Zero out the context. */
+ for (offset = 0; offset < PHY_CTX_SIZE; offset += 4)
+ CTX_WR(bp, 0x00, offset, 0);
+
+ REG_WR(bp, BNX2_CTX_VIRT_ADDR, vcid_addr);
+ REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr);
+ }
}
}

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index b6b444b..e525a5b 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -95,7 +95,7 @@ static int disable_msi = 0;
module_param(disable_msi, int, 0);
MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");

-static int idle_timeout = 0;
+static int idle_timeout = 100;
module_param(idle_timeout, int, 0);
MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)");

@@ -2433,6 +2433,13 @@ static int sky2_poll(struct net_device *dev0, int *budget)

work_done = sky2_status_intr(hw, work_limit);
if (work_done < work_limit) {
+ /* Bug/Errata workaround?
+ * Need to kick the TX irq moderation timer.
+ */
+ if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
+ }
netif_rx_complete(dev0);

sky2_read32(hw, B0_Y2_SP_LISR);
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 3d2fcc5..64ed5ef 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -502,7 +502,8 @@ mpsc_sdma_intr_ack(struct mpsc_port_info *pi)

if (pi->mirror_regs)
pi->shared_regs->SDMA_INTR_CAUSE_m = 0;
- writel(0, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE);
+ writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE +
+ pi->port.line);
return;
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe299..8cf1d7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1138,6 +1138,7 @@ static inline void put_task_struct(struct task_struct *t)
/* Not implemented yet, only for 486*/
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
+#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
diff --git a/ipc/shm.c b/ipc/shm.c
index 4fefbad..8d2672d 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -254,8 +254,10 @@ struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr)

if (sfd->vm_ops->get_policy)
pol = sfd->vm_ops->get_policy(vma, addr);
- else
+ else if (vma->vm_policy)
pol = vma->vm_policy;
+ else
+ pol = current->mempolicy;
return pol;
}
#endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3749193..2b8311b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -905,7 +905,7 @@ static void audit_update_watch(struct audit_parent *parent,

/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
- if (invalidating &&
+ if (invalidating && current->audit_context &&
audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
audit_set_auditable(current->audit_context);

diff --git a/kernel/exit.c b/kernel/exit.c
index b55ed4c..7debf34 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -884,13 +884,29 @@ fastcall NORET_TYPE void do_exit(long code)
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
+ /*
+ * We can do this unlocked here. The futex code uses
+ * this flag just to verify whether the pi state
+ * cleanup has been done or not. In the worst case it
+ * loops once more. We pretend that the cleanup was
+ * done as there is no way to return. Either the
+ * OWNER_DIED bit is set by now or we push the blocked
+ * task into the wait for ever nirwana as well.
+ */
+ tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context();
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}

+ /*
+ * tsk->flags are checked in the futex code to protect against
+ * an exiting task cleaning up the robust pi futexes.
+ */
+ spin_lock_irq(&tsk->pi_lock);
tsk->flags |= PF_EXITING;
+ spin_unlock_irq(&tsk->pi_lock);

if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -957,6 +973,12 @@ fastcall NORET_TYPE void do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held(tsk);
+ /*
+ * We can do this unlocked here. The futex code uses this flag
+ * just to verify whether the pi state cleanup has been done
+ * or not. In the worst case it loops once more.
+ */
+ tsk->flags |= PF_EXITPIDONE;

if (tsk->io_context)
exit_io_context();
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5..4809436 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -390,18 +390,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)

rcu_read_lock();
p = find_task_by_pid(pid);
- if (!p)
- goto out_unlock;
- if ((current->euid != p->euid) && (current->euid != p->uid)) {
- p = NULL;
- goto out_unlock;
- }
- if (p->exit_state != 0) {
- p = NULL;
- goto out_unlock;
- }
- get_task_struct(p);
-out_unlock:
+
+ if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+ p = ERR_PTR(-ESRCH);
+ else
+ get_task_struct(p);
+
rcu_read_unlock();

return p;
@@ -467,7 +461,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
struct futex_q *this, *next;
struct list_head *head;
struct task_struct *p;
- pid_t pid;
+ pid_t pid = uval & FUTEX_TID_MASK;

head = &hb->chain;

@@ -485,6 +479,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
return -EINVAL;

WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(pid && pi_state->owner &&
+ pi_state->owner->pid != pid);

atomic_inc(&pi_state->refcount);
me->pi_state = pi_state;
@@ -495,15 +491,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)

/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when the owner died bit is set
- * and TID = 0:
+ * the new pi_state to it, but bail out when TID = 0
*/
- pid = uval & FUTEX_TID_MASK;
- if (!pid && (uval & FUTEX_OWNER_DIED))
+ if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (!p)
- return -ESRCH;
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ /*
+ * We need to look at the task state flags to figure out,
+ * whether the task is exiting. To protect against the do_exit
+ * change of the task flags, we do this protected by
+ * p->pi_lock:
+ */
+ spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->flags & PF_EXITING)) {
+ /*
+ * The task is on the way out. When PF_EXITPIDONE is
+ * set, we know that the task has finished the
+ * cleanup:
+ */
+ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+ spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+ return ret;
+ }

pi_state = alloc_pi_state();

@@ -516,7 +530,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
/* Store the key for possible exit cleanups: */
pi_state->key = me->key;

- spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
pi_state->owner = p;
@@ -583,15 +596,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
* preserve the owner died bit.)
*/
if (!(uval & FUTEX_OWNER_DIED)) {
+ int ret = 0;
+
newval = FUTEX_WAITERS | new_owner->pid;

pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
pagefault_enable();
+
if (curval == -EFAULT)
- return -EFAULT;
+ ret = -EFAULT;
if (curval != uval)
- return -EINVAL;
+ ret = -EINVAL;
+ if (ret) {
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
+ }
}

spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1149,6 +1169,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely(ret != 0))
goto out_release_sem;

+ retry_unlocked:
hb = queue_lock(&q, -1, NULL);

retry_locked:
@@ -1200,34 +1221,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = lookup_pi_state(uval, hb, &q);

if (unlikely(ret)) {
- /*
- * There were no waiters and the owner task lookup
- * failed. When the OWNER_DIED bit is set, then we
- * know that this is a robust futex and we actually
- * take the lock. This is safe as we are protected by
- * the hash bucket lock. We also set the waiters bit
- * unconditionally here, to simplify glibc handling of
- * multiple tasks racing to acquire the lock and
- * cleanup the problems which were left by the dead
- * owner.
- */
- if (curval & FUTEX_OWNER_DIED) {
- uval = newval;
- newval = current->pid |
- FUTEX_OWNER_DIED | FUTEX_WAITERS;
+ switch (ret) {

- pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- pagefault_enable();
+ case -EAGAIN:
+ /*
+ * Task is exiting and we just wait for the
+ * exit to complete.
+ */
+ queue_unlock(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ cond_resched();
+ goto retry;

- if (unlikely(curval == -EFAULT))
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
goto uaddr_faulted;
- if (unlikely(curval != uval))
- goto retry_locked;
- ret = 0;
+
+ /*
+ * There were no waiters and the owner task lookup
+ * failed. When the OWNER_DIED bit is set, then we
+ * know that this is a robust futex and we actually
+ * take the lock. This is safe as we are protected by
+ * the hash bucket lock. We also set the waiters bit
+ * unconditionally here, to simplify glibc handling of
+ * multiple tasks racing to acquire the lock and
+ * cleanup the problems which were left by the dead
+ * owner.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ uval = newval;
+ newval = current->pid |
+ FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval,
+ newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+ ret = 0;
+ }
+ default:
+ goto out_unlock_release_sem;
}
- goto out_unlock_release_sem;
}

/*
@@ -1279,39 +1324,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
list_add(&q.pi_state->list, &current->pi_state_list);
spin_unlock_irq(&current->pi_lock);

- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
/*
* We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
+ * TID. This must be atomic as we have to preserve the
* owner died bit here.
*/
- ret = get_user(uval, uaddr);
+ ret = get_futex_value_locked(&uval, uaddr);
while (!ret) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr,
uval, newval);
+ pagefault_enable();
+
if (curval == -EFAULT)
ret = -EFAULT;
if (curval == uval)
break;
uval = curval;
}
- } else {
+ } else if (ret) {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
* the hash bucket.
*/
- if (ret && q.pi_state->owner == curr) {
- if (rt_mutex_trylock(&q.pi_state->pi_mutex))
- ret = 0;
+ if (q.pi_state->owner == curr &&
+ rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+ ret = 0;
+ } else {
+ /*
+ * Paranoia check. If we did not take the lock
+ * in the trylock above, then we should not be
+ * the owner of the rtmutex, neither the real
+ * nor the pending one:
+ */
+ if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+ printk(KERN_ERR "futex_lock_pi: ret = %d "
+ "pi-mutex: %p pi-state %p\n", ret,
+ q.pi_state->pi_mutex.owner,
+ q.pi_state->owner);
}
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
}
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);

if (!detect && ret == -EDEADLK && 0)
force_sig(SIGKILL, current);
@@ -1331,16 +1389,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
*/
+ queue_unlock(&q, hb);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock_release_sem;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out_release_sem;
+ goto retry_unlocked;
}

- queue_unlock(&q, hb);
up_read(&curr->mm->mmap_sem);

ret = get_user(uval, uaddr);
@@ -1382,9 +1442,9 @@ retry:
goto out;

hb = hash_futex(&key);
+retry_unlocked:
spin_lock(&hb->lock);

-retry_locked:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
@@ -1446,16 +1506,17 @@ pi_faulted:
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
*/
+ spin_unlock(&hb->lock);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out;
+ goto retry_unlocked;
}
-
- spin_unlock(&hb->lock);
up_read(&current->mm->mmap_sem);

ret = get_user(uval, uaddr);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca..9577ac8 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -354,9 +354,40 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* it should be restarted.
*/
if (timr->it.real.interval.tv64 != 0) {
+ ktime_t now = hrtimer_cb_get_time(timer);
+
+ /*
+ * FIXME: What we really want, is to stop this
+ * timer completely and restart it in case the
+ * SIG_IGN is removed. This is a non trivial
+ * change which involves sighand locking
+ * (sigh !), which we don't want to do late in
+ * the release cycle.
+ *
+ * For now we just let timers with an interval
+ * less than a jiffie expire every jiffie to
+ * avoid softirq starvation in case of SIG_IGN
+ * and a very small interval, which would put
+ * the timer right back on the softirq pending
+ * list. By moving now ahead of time we trick
+ * hrtimer_forward() to expire the timer
+ * later, while we still maintain the overrun
+ * accuracy, but have some inconsistency in
+ * the timer_gettime() case. This is at least
+ * better than a starved softirq. A more
+ * complex fix which solves also another related
+ * inconsistency is already in the pipeline.
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+ {
+ ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
+
+ if (timr->it.real.interval.tv64 < kj.tv64)
+ now = ktime_add(now, kj);
+ }
+#endif
timr->it_overrun +=
- hrtimer_forward(timer,
- hrtimer_cb_get_time(timer),
+ hrtimer_forward(timer, now,
timr->it.real.interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978c..17d28ce 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -212,6 +212,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (!waiter || !waiter->task)
goto out_unlock_pi;

+ /*
+ * Check the orig_waiter state. After we dropped the locks,
+ * the previous owner of the lock might have released the lock
+ * and made us the pending owner:
+ */
+ if (orig_waiter && !orig_waiter->task)
+ goto out_unlock_pi;
+
+ /*
+ * Drop out, when the task has no waiters. Note,
+ * top_waiter can be NULL, when we are in the deboosting
+ * mode!
+ */
if (top_waiter && (!task_has_pi_waiters(task) ||
top_waiter != task_top_pi_waiter(task)))
goto out_unlock_pi;
@@ -659,9 +672,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
* all over without going into schedule to try
* to get the lock now:
*/
- if (unlikely(!waiter.task))
+ if (unlikely(!waiter.task)) {
+ /*
+ * Reset the return value. We might
+ * have returned with -EDEADLK and the
+ * owner released the lock while we
+ * were walking the pi chain.
+ */
+ ret = 0;
continue;
-
+ }
if (unlikely(ret))
break;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index a3993b9..f745a44 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2831,17 +2831,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
unsigned long next_balance = jiffies + 60 * HZ;

for_each_domain(this_cpu, sd) {
- if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu,
- this_rq, sd);
- if (time_after(next_balance,
- sd->last_balance + sd->balance_interval))
- next_balance = sd->last_balance
- + sd->balance_interval;
- if (pulled_task)
- break;
- }
+ this_rq, sd);
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+ if (pulled_task)
+ break;
}
if (!pulled_task)
/*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cb25649..c6b6f35 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -120,7 +120,6 @@ void second_overflow(void)
*/
time_interpolator_update(-NSEC_PER_SEC);
time_state = TIME_OOP;
- clock_was_set();
printk(KERN_NOTICE "Clock: inserting leap second "
"23:59:60 UTC\n");
}
@@ -135,7 +134,6 @@ void second_overflow(void)
*/
time_interpolator_update(NSEC_PER_SEC);
time_state = TIME_WAIT;
- clock_was_set();
printk(KERN_NOTICE "Clock: deleting leap second "
"23:59:59 UTC\n");
}
diff --git a/mm/rmap.c b/mm/rmap.c
index b82146e..6e35d11 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@

struct kmem_cache *anon_vma_cachep;

-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
- struct anon_vma *anon_vma = find_vma->anon_vma;
- struct vm_area_struct *vma;
- unsigned int mapcount = 0;
- int found = 0;
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- mapcount++;
- BUG_ON(mapcount > 100000);
- if (vma == find_vma)
- found = 1;
- }
- BUG_ON(!found);
-#endif
-}
-
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;

- if (anon_vma) {
+ if (anon_vma)
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
- }
}

void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
if (anon_vma) {
spin_lock(&anon_vma->lock);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
spin_unlock(&anon_vma->lock);
}
}
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
return;

spin_lock(&anon_vma->lock);
- validate_anon_vma(vma);
list_del(&vma->anon_vma_node);

/* We must garbage collect the anon_vma if it's empty */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/