TCP stalls after 128k writes

Bernd Paysan (bernd.paysan@gmx.de)
Sun, 23 Aug 1998 23:08:54 +0200 (MET DST)


Hi!

I could figure out why TCP stalls under some circumstances after writing
128k - both the read and write buffer are full, and the next packet is
discarded (see function tcp_data in linux/net/ipv4/tcp_input.c). This
requires a retransmission. It seems that sk->data_ready doesn't wake up
the reader in time.

There is no code that forces wake up a reading process if there is "too
much" data in the receive buffer (i.e. half full or whatever).

I tried this theory by lowering the switch time (from 200ms to 20ms), and
indeed the results during the critical segment of my test went up from
700k/s to 1M/s. But this is just curing the symptom. Someone should
carefully check where rescheduling and waking input tasks would be a good
idea. This synthetic benchmark shows this misbehaviour very good, since in
real world other factors often hide it (i.e. once your receiving process
is alone on the machine, there won't be an overflow).

I added some debug code to see in which order processing is done. If you
start socktest with -debug as first option, you'll see on stderr in which
order the read and write parts are scheduled, and how large the blocks
were they read. I've also written a filter script (sock.diff) to see the
abnormalities. E.g.

socktest -debug tcp 4096 512 2>&1 1>&0 | sock.diff

gives

195707: 592694-592807 wrote 3584 bytes
195456: 442079-442198 wrote 3584 bytes
195409: 361928-362049 wrote 3584 bytes
195313: 522240-522361 wrote 3584 bytes
195303: 062314-062437 wrote 3584 bytes
195108: 292273-292392 wrote 3584 bytes
195018: 212121-212242 wrote 3584 bytes
194994: 131966-132085 wrote 3584 bytes
194921: 982151-982269 wrote 3584 bytes
194899: 902007-902129 wrote 3584 bytes
192854: 752246-752381 wrote 3584 bytes
192754: 672096-672229 wrote 3584 bytes
190316: 832349-832470 wrote 3584 bytes
20287: 612926-614166 wrote 58368 bytes
20204: 852898-854159 wrote 58368 bytes
20124: 312810-314065 wrote 58368 bytes
20115: 002675-003927 wrote 58368 bytes
20112: 152491-153746 wrote 58368 bytes
20110: 462600-463851 wrote 58368 bytes
....

showing that the typical stakk is either almost 200ms or 20ms. The smaller
(20ms) pauses are interrupts from other tasks (X and co).

I added a small patch to tcp_input.c, which forces rescheduling when the
input buffer is half full (see below). It's not a perfect cure (I think
there are other sk->data_ready places which should be fixed), but it
helps. The normal data_ready path results finally in reschedule_idle(),
which forces rescheduling only for RR and FIFO scheduled tasks.

The patch does much better than the original kernel, thus 4096 times 512
bytes is somewhere between 4 and 10 MB/s (wide variations between runs
indicate that I seem to have missed something) instead of 0.7MB/s
(original kernel).

All measured on a Pentium MMX-200, most of it with 20ms typical switch
time.

Bernd Paysan
"Late answers are wrong answers!"
http://www.jwdt.com/~paysan/
-----------------------------tcp_input.c.patch--------------------------
--- linux-2.1.117/net/ipv4/tcp_input.c.orig Sun Aug 23 22:38:42 1998
+++ linux/net/ipv4/tcp_input.c Sun Aug 23 22:28:29 1998
@@ -1443,6 +1443,10 @@
if (!sk->dead) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
+ /* patch: force rescheduling if buf is half full */
+ if(atomic_read(&sk->rmem_alloc) > sk->rcvbuf/2) {
+ current->need_resched = 1;
+ }
}
return(1);
}
---------------------------------socktest.c-----------------------------
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <linux/in.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <signal.h>

#define TIME(x) ((double)((x).tv_sec)+(double)((x).tv_usec)*1e-6)

#define BIGGEST 4096

double gettime ()
{
struct timeval time1;
struct timezone zone1;

gettimeofday(&time1,&zone1);

return TIME(time1);
}

unsigned long mstime(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_usec;
}

void main(int argc, char ** argv, char ** env)
{
int pid1, pid2, sv[4];
char buf[1];
int n, m, i, j, r, udp=0;
int debug;
unsigned long otime, oldtime, dtime, rsum = 0;

otime = oldtime = mstime();

if((argc >= 2) && !strcmp(argv[1], "-debug")) {
debug = 1;
argc--;
argv++;
}

if(argc < 4) fprintf(stderr,"usage: socktest tcp/udp/unix n m\n"), exit(1);

n=atoi(argv[2]);
m=atoi(argv[3]);

if(!strcmp(argv[1],"unix")) r=socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, sv);
else if(!strcmp(argv[1],"tcp")) {
struct sockaddr_in sock1, sock2;
int dummy=16;

sv[0]=socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
sv[1]=socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

sock1.sin_family=AF_INET;
sock1.sin_port=htons(0);
sock1.sin_addr.s_addr=inet_addr("127.0.0.1");
sock2.sin_family=AF_INET;
sock2.sin_port=htons(0);
sock2.sin_addr.s_addr=inet_addr("127.0.0.1");

r=bind(sv[0], (struct sockaddr *)&sock1, 16);
r=bind(sv[1], (struct sockaddr *)&sock2, 16);
r=getsockname(sv[0], (struct sockaddr *)&sock1, &dummy);
r=getsockname(sv[1], (struct sockaddr *)&sock2, &dummy);
r=listen(sv[1],5);
r=connect(sv[0], (struct sockaddr *)&sock2, 16);
dummy = accept(sv[1], 0, NULL);
close(sv[1]);
sv[1]=dummy;
} else if(!strcmp(argv[1],"udp")) {
struct sockaddr_in sock1, sock2;
int dummy=16;

sv[0]=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
sv[1]=socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

sock1.sin_family=AF_INET;
sock1.sin_port=htons(0);
sock1.sin_addr.s_addr=inet_addr("127.0.0.1");
sock2.sin_family=AF_INET;
sock2.sin_port=htons(0);
sock2.sin_addr.s_addr=inet_addr("127.0.0.1");

r=bind(sv[0], (struct sockaddr *)&sock1, 16);
r=bind(sv[1], (struct sockaddr *)&sock2, 16);
r=getsockname(sv[0], (struct sockaddr *)&sock1, &dummy);
r=getsockname(sv[1], (struct sockaddr *)&sock2, &dummy);
/* r=connect(sv[0], (struct sockaddr *)&sock2, 16); */
r=connect(sv[1], (struct sockaddr *)&sock1, 16);
udp=1;
socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, sv+2);
} else
fprintf(stderr,"usage: socktest ip/unix n m\n"), exit(1);

if(r) fprintf(stderr,"socketpair returned %d\n", r), exit(r);

n *= m;
{
char buffer[m];
struct timeval timeout;
/* struct fd_set wset;
FD_ZERO(&wset);
FD_SET(sv[0], &wset); */
int oldi=0;

pid1 = fork();

if(pid1==0) {
oldi=-1;

for(i=0, j=0; i<n; i+=r, j++) {
if(udp && ((j>256) || (i-oldi)>BIGGEST)) {
write(sv[2], ".", 1);
oldi+=BIGGEST;
j=0;
}
r=read(sv[0], buffer, m);

if(r<0)
fprintf(stderr,"read error %d@%d\n", errno, i), exit(errno);

if(debug) {
unsigned long time = mstime();
dtime = (time - oldtime) % 1000000;
if(dtime < 500) {
rsum += r;
} else {
if(rsum)
fprintf(stderr, "%06d-%06d read %i bytes\n",
otime, oldtime, rsum);
otime = time;
rsum = r;
}
oldtime = time;
}
}
exit(0);
}

pid2 = fork();

if(pid2==0) {
char dummy[1];
struct timeval timeout;

timeout.tv_sec=0;
timeout.tv_usec=0;

select(0, NULL, NULL, NULL, &timeout);

for(i=0; i<m; i++)
buffer[i]=i;

for(i=0, j=0; i<n; i+=r, j++) {
if(udp && ((j>256) || ((i-oldi)>BIGGEST))) {
read(sv[3], dummy, 1);
oldi+=BIGGEST;
j=0;
}
r=write(sv[1], buffer, m);
if(r<0)
fprintf(stderr,"write error %d@%d\n", errno, i), exit(errno);
if(debug) {
unsigned long time = mstime();
dtime = (time - oldtime) % 1000000;
if(dtime < 500) {
rsum += r;
} else {
if(rsum)
fprintf(stderr, "%06d-%06d wrote %i bytes\n",
otime, oldtime, rsum);
otime = time;
rsum = r;
}
oldtime = time;
}
}
exit(0);
}

{
struct rusage usage1, usage2;
double start, end, usr, sys, tot;

start = gettime();

printf("%4s (%5d*%4d): ", argv[1], n/m, m);
fflush(stdout);

wait4(pid1, NULL, 0, &usage1);
wait4(pid2, NULL, 0, &usage2);

end = gettime();

usr=TIME(usage1.ru_utime)+TIME(usage2.ru_utime);
sys=TIME(usage1.ru_stime)+TIME(usage2.ru_stime);
tot=end-start;

printf("user=%.2f, system=%.2f, elapsed=%.2f, CPU=%d%%, MB/s=%.6f\n",
usr, sys, tot, (int)((usr+sys)/(tot*1e-2)), 2*n/(tot*1024*1024));
}
}
}
---------------------------sock.diff------------------------------------
#!/bin/bash

cat | sed -e "s/-/ /g" | while read a b rest
do
diff=$[(1$a-1${olda:-$a})]
if [ $diff "<" 0 ]
then
diff=$[$diff+1000000]
fi
echo $diff: $a-$b $rest
olda=$b
done | sort -r -n | grep .....: | grep -v 9[89]....:
------------------------------------------------------------------------

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.altern.org/andrebalsa/doc/lkml-faq.html