How To DTrace poll(2) System Calls

Max Bruning has been teaching Unix Internals, Device Drivers, and various other courses since 1981. This is a new weekly column to answer your SmartOS, DTrace, and MDB questions. You can submit questions to Max by email at MrBruning@joyent.com, ask on twitter at hashtag #BruningQuestions, or by attending one of his courses.
Question: How To DTrace poll(2) System Calls
I received a question from Mohamed A. Khalfella (@khalfella) about a DTrace script he wrote to trace poll(2)
system calls. Basically, his script was not working, and he wanted to know why not.
Here is his script:
syscall::pollsys:entry/execname == "xxx" && pid == 11528/{ self->pfds_user = arg0; self->fds_cnt = arg1; self->pfds= (struct pollfd*) copyin(self->pfds_user, self->fds_cnt* sizeof(structpollfds)); self->fds_print = arg1; self->in_pollsys = 1; printf("pollsys thread = %d arg1 = %d\n", tid, self->fds_cnt);}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}/* return probes */syscall::pollsys:return/self->in_pollsys/{ printf ("RETURN Pollsys thread = %d returns %d:%d\n", tid, arg0, arg1); self->pfds= (struct pollfd*) copyin(self->pfds_user, self->fds_cnt*sizeof(structpollfds)); self->fds_print = self->fds_cnt; self->in_pollsys = 0;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->fd=self->fds_cnt - self->fds_print; self->pr_fd = (struct pollfd*) &self->pfds[self->fd]; printf ("thread = %d FD[%d] fd=%d events=%x revents=%x\n", tid, self->fd, self->pfds[self->fd].fd, self->pfds[self->fd].events, self->pfds[self->fd].revents); self->fds_print--;}
So, the script is tracing a specific process (name is "xxx" and pid is 11528). The poll(2
system call calls pollsys
in the kernel. For those not familiar with poll(2)
, it is basically equivalent to select(2)
, but arguments are different. The select(3c)
call actually calls pollsys
in the kernel. An advantage to poll(2)
is that it can handle an arbitrary number of descriptors. select
is limited to FD_SETSIZE
descriptors (see /usr/include/sys/select.h
for details).
Here is some example (pseudo-) code using poll(2)
:
struct pollfd pfd[...]; /* one pollfd per file descriptor and/or socket being polled */int npfds; /* the number of pollfd entries */pfd[0].fd = open(file, mode); /* or socket(...) */pfd[0].events = POLLIN; /* see poll(2) man page, may also poll for other events */pfd[1].fd = open(file1, mode); /* again, may be a socket call instead of open */pfd[1].events = POLLIN;... /* repeat for each file/socket you are interested in */nevents = poll(pfd, npfds, timeout); /* npfds is the number of entries in pfd. See the man page for timeout. */if (nevents > 0) { for (i = 0; i < npfds; i++) { if (pfd[i].revents & POLLIN) handle input event on pfd[i].fd; ... }}
Mohamed wants to list the pollfd
structures in the array passed into the system call, and list them again when the system call returns (that way, he can see which file descriptors/sockets had events set in revents). The number of pollfd
structures passed in as an array to the system call (npfds
) may vary. There is no way to do a "for" loop in DTrace, so the script uses a predicate. It sets a counter for the number of pollfd
structures passed to poll(2)
, and has multiple clauses for entry and return that use a predicate to determine when the script is finished enumerating the array. Note that there is a separate clausefor each pollfd
structure to be printed. Since there are only 7 clauses that print the structure, this script will print up to the first seven structures passed to poll(2)
. If he expects more than seven, he would have to have the same clause that many more times. But for the app he is tracing, 7 is more than enough.
Now let's take a look at the output from the script:
pollsys thread = 1 arg1 = 3thread = 1 FD[0] fd=2 events=0 revents=1thread = 1 FD[1] fd=0 events=0 revents=0thread = 1 FD[2] fd=2 events=0 revents=1RETURN Pollsys thread = 1 returns 1:1thread = 1 FD[0] fd=10 events=0 revents=1thread = 1 FD[1] fd=0 events=0 revents=0thread = 1 FD[2] fd=10 events=0 revents=1pollsys thread = 1 arg1 = 3thread = 1 FD[0] fd=2 events=0 revents=1thread = 1 FD[1] fd=0 events=0 revents=0thread = 1 FD[2] fd=2 events=0 revents=1RETURN Pollsys thread = 1 returns 1:1thread = 1 FD[0] fd=10 events=0 revents=1thread = 1 FD[1] fd=0 events=0 revents=0thread = 1 FD[2] fd=10 events=0 revents=1
According to the above output, poll(2)
is being called with 3 pollfd
structures every time. But 2 of the file descriptors are being used twice. Also, revents is 1 in 2 of the returned structures. This implies that the return value (1:1 in the output) is not correct. When I tried the script on my machine, I also got strange output:
pollsys thread = 1 arg1 = 5thread = 1 FD[0] fd=2 events=3f10 revents=2thread = 1 FD[1] fd=1532892142 events=1 revents=0thread = 1 FD[2] fd=0 events=0 revents=0thread = 1 FD[3] fd=2 events=3f10 revents=2thread = 1 FD[4] fd=-1649627104 events=ff01 revents=ffffRETURN pid: 2548 Pollsys thread = 1 returns 0:0thread = 1 FD[0] fd=10 events=3f10 revents=2thread = 1 FD[1] fd=1532993475 events=1 revents=0thread = 1 FD[2] fd=0 events=0 revents=0thread = 1 FD[3] fd=10 events=3f10 revents=2thread = 1 FD[4] fd=11 events=3f10 revents=2
This simply looks wrong. A file descriptor of 1532892142??? And another file descriptor that is anegative number. Also, the return value from the system call is 0, but revents is set in 3 of the 5pollfd
structures.
So, what output should I expect?
I don't have the "xxx" program on my machine, so first I'll use DTrace to see what processes use poll(2)
.
# dtrace -n 'syscall::pollsys:entry{@[execname] = count();} tick-30s{exit(0);}'dtrace: description 'syscall::pollsys:entry' matched 2 probesCPU ID FUNCTION:NAME 0 75153 :tick-30s fmd 3 sendmail 6 sshd 8 mdnsd 25 bridged 30#
Let's take a look at sshd. First, we'll pick a specific sshd. I have two ssh sessions opened on the machine. Here is one of them:
# ptree $$2594 /usr/lib/ssh/sshd 5897 /usr/lib/ssh/sshd 5898 /usr/lib/ssh/sshd 5901 -bash 7666 ptree 5901#
Let's take a look at the pollfd
structure passed into the poll
system call.
# pargs 58985898: /usr/lib/ssh/sshdargv[0]: /usr/lib/ssh/sshd### mdb /usr/lib/ssh/sshd> 0t5898:A <-- attach to the process, this stops the process. The "0t" is because the numberis decimal.Loading modules: [ ld.so.1 libc.so.1 libproc.so.1 ]>> __pollsys::bp <-- set a breakpoint at the user level library code that calls poll(2)>> :c <-- continue the process
Now, in the window where pid 5898 is running bash, I'll type a carriage return, and back in the window where I'm running mdb, I see:
mdb: stop at libc_hwcap1.so.1`__pollsysmdb: target stopped at:libc_hwcap1.so.1`__pollsys: call +0x0 >> $c <-- let's get a stack tracelibc_hwcap1.so.1`__pollsys(8047380, 5, 0, 0, 4, 80b0040)libc_hwcap1.so.1`pselect+0x1bf(a, 80d5530, 80b8f50, fef79f00, 0, 0)libc_hwcap1.so.1`select+0x8e(a, 80d5530, 80b8f50, 0, 0, c7)wait_until_can_do_something+0x233(80474f0, 0, 8047518, 806fafb, 4, 9)server_loop2+0x190(80d4ba8, 80bd560, 8047548, 808506c)do_authenticated2+0xc(80d4ba8, 80d4ba8, 8047e18, 806282c, 80d4ba8, 31)do_authenticated+0x31(80d4ba8, 31)main+0x14c4(809035a, feffb0a4, 8047e48, 805fbcf, 1, 8047e54)_start+0x83(1, 8047ef4, 0, 8047f06, 8047f20, 8047f31)>
The pollfd
array is at virtual address 0x8047380
in the process. There are 5 structures in the array (the second argument to __pollsys()
). In passing, you can see that sshd
uses select(3C)
, not poll(2)
.
> 8047380,5::print -t struct pollfdstruct pollfd { int fd = 0x4 short events = 0x40 short revents = 0x80b}struct pollfd { int fd = 0x5 short events = 0x40 short revents = 0xfef7}struct pollfd { int fd = 0x7 short events = 0x40 short revents = 0x80d}struct pollfd { int fd = 0x8 short events = 0x4 short revents = 0xfee9}struct pollfd { int fd = 0x9 short events = 0x40 short revents = 0xfed9}>
Now we'll set a breakpoint at the place where __pollsys()
returns. At this point, therevents
field should be set. The return value (i.e., number of events that occurred) should be in the %eax
register. We'll check that, and look at the returned pollfd
structures. Why does the revents
field have strange values? I have a good idea why this occurs, but it is left to the reader as an exercise.
> pselect+1bf::bp <-- from the stack trace above> :c <-- continuemdb: stop at libc_hwcap1.so.1`pselect+0x1bfmdb: target stopped at:libc_hwcap1.so.1`pselect+0x1bf: movl %eax,%esi>> > 8047380,5::print -t struct pollfdstruct pollfd { int fd = 0x4 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x5 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x7 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x8 short events = 0x4 short revents = 0x4 <-- here is the one event}struct pollfd { int fd = 0x9 short events = 0x40 short revents = 0}> $q <-- and exit mdb#
So, now we know what we should expect to see from the DTrace script.
With illumos, a short DTrace script that does what is wanted is:
#!/usr/sbin/dtrace -qssyscall::pollsys:entry/pid==5898/{ printf("\npolling %d file descriptors/sockets\n", arg1); self->pfds = (struct pollfd *) copyin(arg0, sizeof(struct pollfd)*arg1); self->savedpfds = (struct pollfd *)arg0; self->pfdcnt = arg1; print(self->pfds[0]); print(self->pfds[1]); print(self->pfds[2]); print(self->pfds[3]); print(self->pfds[4]);}syscall::pollsys:return/self->savedpfds/{ printf("\npoll returning %d events\n", arg1); self->pfds = (struct pollfd *) copyin((uintptr_t)self->savedpfds, sizeof(structpollfd)*self->pfdcnt); print(self->pfds[0]); print(self->pfds[1]); print(self->pfds[2]); print(self->pfds[3]); print(self->pfds[4]);}
And the output:
# ./shortpoll.dpolling 5 file descriptors/socketsstruct pollfd { int fd = 0x4 short events = 0x40 short revents = 0x80b}struct pollfd { int fd = 0x5 short events = 0x40 short revents = 0xfef7}struct pollfd { int fd = 0x7 short events = 0x40 short revents = 0x80d}struct pollfd { int fd = 0x8 short events = 0x4 short revents = 0xfee9}struct pollfd { int fd = 0x9 short events = 0x40 short revents = 0xfed9}poll returning 1 events <-- poll returns 1struct pollfd { int fd = 0x4 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x5 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x7 short events = 0x40 short revents = 0}struct pollfd { int fd = 0x8 short events = 0x4 short revents = 0x4 <-- here is the 1 event}struct pollfd { int fd = 0x9 short events = 0x40 short revents = 0}
A problem with this script is that if there is less than 5 descriptors being polled, it prints garbage for the extra ones, and, of course, it won't print more than 5.
But Mohamed is running on Solaris 11, which does not have the print()
action I am using in the above script. Here is a script that works similarily to Mohamed's, but gives the correct data.
#!/usr/sbin/dtrace -qssyscall::pollsys:entry/arg0 && arg1/{ self->in_pollsys = 1; self->pfds_user = arg0; self->fds_cnt = (int)arg1; this->pfds= (struct pollfd*) copyin(self->pfds_user, sizeof(struct pollfd)); printf("ENTRY pid: %d Pollsys thread = %d npfds = %d\n", pid, tid, arg1); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print = (int)arg1 - 1;}syscall::pollsys:entry/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:entry/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:return/self->in_pollsys/{ printf ("RETURN pid: %d Pollsys thread = %d returns %d:%d\n", pid, tid, arg0, arg1); this->pfds= (struct pollfd*) copyin(self->pfds_user, sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print = (int)self->fds_cnt - 1; self->in_pollsys = 0;}syscall::pollsys:return/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}syscall::pollsys:return/self->fds_print/{ self->next = self->fds_cnt-self->fds_print; this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user + self->next * sizeof(struct pollfd)), sizeof(struct pollfd)); printf("%s:%s fd = %d, events = %x, revents = %x\n", probefunc, probename, this->pfds->fd, this->pfds->events, this->pfds->revents); self->fds_print--;}
Here is output:
# ./poll.dENTRY pid: 2475 Pollsys thread = 1 npfds = 3pollsys:entry fd = 7, events = 1, revents = 0pollsys:entry fd = 5, events = 1, revents = 0pollsys:entry fd = 10, events = 1, revents = 0ENTRY pid: 2548 Pollsys thread = 1 npfds = 5pollsys:entry fd = 3, events = 40, revents = 0pollsys:entry fd = 4, events = 40, revents = 0pollsys:entry fd = 5, events = 40, revents = 0pollsys:entry fd = 6, events = 40, revents = 0pollsys:entry fd = 7, events = 40, revents = 0ENTRY pid: 6218 Pollsys thread = 1 npfds = 4pollsys:entry fd = 4, events = 44, revents = 0pollsys:entry fd = 5, events = 40, revents = 807pollsys:entry fd = 7, events = 40, revents = 804pollsys:entry fd = 9, events = 40, revents = fedaRETURN pid: 6218 Pollsys thread = 1 returns 1:1pollsys:return fd = 4, events = 44, revents = 4pollsys:return fd = 5, events = 40, revents = 0pollsys:return fd = 7, events = 40, revents = 0pollsys:return fd = 9, events = 40, revents = 0...
The only difference between this script and Mohamed's script is that the copyin()
action is called for each pollfd
structure in each clause, instead of doing the copyin
at the beginning of the entry and return probes. Mohamed's script looks like it should have worked. However, copyin() data goes into scratch space which is not saved across clauses.
I'll take this time to wish everyone a happy holiday. See you next week!
Post written by Mr. Max Bruning