module example;
import annotations;
/+
At compile time, the following will be printed to STDOUT (when in debug mode only):
FIXME @ example.randomNumber (example.d:11): The RNG is broken.
TODO @ example.main (example.d:17): Properly implement main().
+/
@FixMe!( randomNumber, "The RNG is broken." )
int randomNumber()
{
    // Deliberately deterministic (the classic xkcd #221 joke) —
    // flagged above via @FixMe as a broken RNG.
    immutable roll = 4;
    return roll;
}
@ToDo!( main, "Properly implement main()." )
void main()
{
// Intentionally empty: this example exists to demonstrate the compile-time
// TODO/FIXME messages emitted by the annotations; there is no runtime logic.
}
dtrace -qn '
/*
 * Measure how random or sequential NFSv3 IO is, and bucket it by
 * client IP address. Print results every 5 seconds by default.
 * Uncomment lines with @h aggregates to get histograms which show
 * common offsets. This is a good way to know just how sequential
 * sequential IO is, and in what offsets it is happening, i.e. 4K, 64K, etc.
 */
int offset[string, string];    /* last seen byte offset, per (op, client) */
int lastdelta[string, string]; /* previous inter-IO distance, per (op, client) */
int havedata;                  /* set once there is something worth printing */
BEGIN {
printf("command,ip.addr,rand.count,seq.count\n");
map["op-read-start"] = "R";
map["op-write-start"] = "W";
}
/* First IO seen for a (op, client) pair: seed offset, nothing to compare. */
nfsv3:::op-read-start,nfsv3:::op-write-start
/!offset[map[probename], args[0]->ci_remote]/ {
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = 0;
}
nfsv3:::op-read-start,nfsv3:::op-write-start
/* Previous offset is greater than this offset */
/offset[map[probename], args[0]->ci_remote] > args[2]->offset/ {
this->dist = offset[map[probename], args[0]->ci_remote] - args[2]->offset;
/* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
/*
 * An IO counts as "sequential" when its distance from the previous IO
 * equals the previous distance; anything else counts as "random".
 */
@seq[map[probename], args[0]->ci_remote] =
sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
@rand[map[probename], args[0]->ci_remote] =
sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = this->dist;
havedata = 1;
}
nfsv3:::op-read-start,nfsv3:::op-write-start
/* Previous offset is less than this offset */
/offset[map[probename], args[0]->ci_remote] < args[2]->offset/ {
this->dist = args[2]->offset - offset[map[probename], args[0]->ci_remote];
/* @h[map[probename], args[0]->ci_remote] = quantize(this->dist); */
@seq[map[probename], args[0]->ci_remote] =
sum(this->dist == lastdelta[map[probename], args[0]->ci_remote]);
@rand[map[probename], args[0]->ci_remote] =
sum(this->dist != lastdelta[map[probename], args[0]->ci_remote]);
offset[map[probename], args[0]->ci_remote] = args[2]->offset;
lastdelta[map[probename], args[0]->ci_remote] = this->dist;
havedata = 1;
}
/* NOTE(review): an IO repeating the exact same offset matches neither
 * clause above and is therefore not counted at all — confirm intent. */
tick-5sec /havedata/ {
printa("%s,%s,%@d,%@d\n", @rand, @seq);
trunc(@rand); trunc(@seq);
/* trunc(@h); */
havedata = 0;
}'
nfsv3-top-100-eps-csv.d
dtrace -qn'
#pragma D option aggsize=16m
#pragma D option bufsize=8m
#pragma D option dynvarsize=8m
/*
 * Per-second CSV report of NFSv3 activity: client address, op type,
 * file path, IO count, average latency (us) and bytes, limited to the
 * top 100 aggregation entries per interval.
 */
string typ[string];
BEGIN
{
printf("source,cmd,path,IOs,avg.lat.us,bytes\n");
typ["op-access-done"] = "A";
typ["op-create-done"] = "C";
typ["op-getattr-done"] = "G";
typ["op-read-done"] = "R";
typ["op-write-done"] = "W";
}
/*
 * Ignoring all R/W IOs less than 1K. If doing some really tiny <1K IOs,
 * let's pretend these IOs did not happen.
 */
nfsv3:::op-read-start,
nfsv3:::op-write-start /args[2]->count > 1 << 10/ { self->bytes = args[2]->count ; }
nfsv3:::op-access-start, nfsv3:::op-create-start, nfsv3:::op-getattr-start,
nfsv3:::op-read-start, nfsv3:::op-write-start
{
self->src_addr = args[0]->ci_remote;
tm[self->src_addr, args[1]->noi_xid] = timestamp;
self->have_path = args[1]->noi_curpath == NULL ? 0 : 1;
/* We should make sure we handle empty value as well */
/* NOTE(review): when noi_curpath is NULL the != "" comparison below may
 * abort this clause with an invalid-address error; have_path is already 0
 * in that case, but confirm the error noise is acceptable. */
self->path = args[1]->noi_curpath != "" ?
args[1]->noi_curpath : "NOPATH";
}
/*
 * Continue to ignore any R/W IOs that are less than 1K.
 */
nfsv3:::op-read-done, nfsv3:::op-write-done
/self->have_path &&
tm[self->src_addr, args[1]->noi_xid] &&
self->bytes > 1 << 10
/
{
@ios[self->src_addr,
typ[probename], self->path] = count();
@lat_us[self->src_addr,
typ[probename], self->path] =
avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
@s[self->src_addr,
typ[probename], self->path] = sum(self->bytes);
++data;
}
/* Metadata ops have no byte count; they contribute sum(0) to @s. */
nfsv3:::op-access-done, nfsv3:::op-create-done, nfsv3:::op-getattr-done /self->have_path && tm[self->src_addr, args[1]->noi_xid]/
{
@ios[self->src_addr,
typ[probename], self->path] =
count();
@lat_us[self->src_addr,
typ[probename], self->path] =
avg((timestamp - tm[self->src_addr, args[1]->noi_xid]) / 1000);
@s[self->src_addr,
typ[probename], self->path] =
sum(0);
++data;
}
nfsv3:::op-access-done,
nfsv3:::op-create-done,
nfsv3:::op-getattr-done,
nfsv3:::op-read-done,
nfsv3:::op-write-done {
/* Zero out things to make sure we avoid issues with dynvar drops */
self->bytes = 0;
self->have_path = 0;
tm[self->src_addr, args[1]->noi_xid] = 0;
self->src_addr = 0;
self->path = 0;
}
tick-1sec /data/
{
/*
 * Shrink each aggregation to 100 observations first, then print those
 * 100 observations, which will not mean we are limited to 100 unique
 * connection addresses, or 100 unique paths. As it stands, aggregations
 * right now use a tuple for their key, which is a set of three values:
 * client address, command, path. This means that in theory we could have:
 * (client address) x (command) x (path) number of unique keys.
 */
trunc(@ios, 100); trunc(@lat_us, 100); trunc(@s, 100);
printa("%s,%s,%s,%@d,%@d,%@d\n", @ios, @lat_us, @s);
trunc(@ios); trunc(@lat_us); trunc(@s);
data = 0;
}'
nfsstatcsv.d
dtrace -qn '
/*
 * Per-second CSV report of NFS v3/v4 read/write activity: latency
 * min/max/avg, IOPS, throughput, IO-size buckets, write stability
 * flags, and ARC size/hit statistics.
 */
inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
inline const int NS_US = 1000;
int cnt, realtm, start, epoch;
self int nfs_r;
#pragma D option aggsize=16m
#pragma D option bufsize=50m
#pragma D option dynvarsize=50m
BEGIN {
printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
"io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
"max.io.size", "iops", "throughput.KBps",
"512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct",
"16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct",
"volatile", "fsync", "dsync", "arc.hit.ct", "arc.miss.ct",
"arc.size.av", "arc.size.min", "arc.size.max", "arc.mru.size.av",
"arc.mru.size.min", "arc.mru.size.max", "arc.mfu.size.av",
"arc.mfu.size.min", "arc.mfu.size.max", "total.ios", "avg.bytes.per.us"
);
hdr = 1;
epoch = timestamp;
start = timestamp;
/* How stable data is expected to be (NFS stable_how enumeration). */
stable_how[0] = "NOSYNC";
stable_how[1] = "DSYNC";
stable_how[2] = "FSYNC";
ts = walltimestamp;
}
tick-1sec /(timestamp - epoch) >= 0 && cnt > 0/ {
realtm = timestamp - epoch; /* Update real time passed since last sample. */
}
tick-1sec /!cnt/ {
/* Set realtime to 0 as first run, to make sure first reported line starts
 * at 0 and increments from there. This should only happen once.
 */
realtm = 0;
++cnt;
}
nfsv3:::op-read-start,
nfsv3:::op-write-start,
nfsv4:::op-read-start {
/* If this is a read, increment counter by one, else leave previous value */
self->nfs_r += probename == "op-read-start" ? 1 : 0;
tm[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
opsz[args[0]->ci_remote, args[1]->noi_xid] = args[2]->count;
flag[args[0]->ci_remote, args[1]->noi_xid] = 0;
}
/*
 * sdt:::arc-hit and sdt:::arc-miss give info on direct users of the arc
 * if arc-miss, then we do an I/O.
 * Unfortunately, we cannot directly correlate a NFS Read with event in the
 * ARC, at least not easily.
 * We are likely to miss at least some ARC hits and misses here, but this
 * should be good enough to get a sense for hits and misses in general.
 */
sdt:::arc-miss /self->nfs_r/ { @arcm["R"] = count(); }
sdt:::arc-hit /self->nfs_r/ { @arch["R"] = count(); }
/* Sample ARC sizes whenever arc_read() fires on an NFS thread. */
::arc_read:entry {
this->cmd = self->nfs_r ? "R" : "W";
@arcszmin[this->cmd] = min(`arc_stats.arcstat_size.value.i64);
@arcszmax[this->cmd] = max(`arc_stats.arcstat_size.value.i64);
@arcszav[this->cmd] = avg(`arc_stats.arcstat_size.value.i64);
@arcmruszmin[this->cmd] = min(`arc_stats.arcstat_mru_size.value.i64);
@arcmruszmax[this->cmd] = max(`arc_stats.arcstat_mru_size.value.i64);
@arcmruszav[this->cmd] = avg(`arc_stats.arcstat_mru_size.value.i64);
@arcmfuszmin[this->cmd] = min(`arc_stats.arcstat_mfu_size.value.i64);
@arcmfuszmax[this->cmd] = max(`arc_stats.arcstat_mfu_size.value.i64);
@arcmfuszav[this->cmd] = avg(`arc_stats.arcstat_mfu_size.value.i64);
}
nfsv3:::op-write-start {
flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
}
nfsv4:::op-write-start {
flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
tm[args[0]->ci_remote,args[1]->noi_xid] = timestamp;
opsz[args[0]->ci_remote,args[1]->noi_xid] = args[2]->data_len;
}
nfsv3:::op-read-done,
nfsv3:::op-write-done,
nfsv4:::op-read-done,
nfsv4:::op-write-done
/tm[args[0]->ci_remote,args[1]->noi_xid]/ {
/* Total time from start of operation to end of same operation. */
this->delta = (timestamp - tm[args[0]->ci_remote, args[1]->noi_xid]);
/*
 * Flag indicates volatility mode for this operation, i.e.
 * `Unstable`, `Data Sync`, or `File Sync`.
 */
this->flag = flag[args[0]->ci_remote,args[1]->noi_xid];
this->name = probename == "op-write-done" ? "W" : "R";
/*
 * This is a hack, it takes advantage of the fact that realtm value will
 * always increase, and never decrease. Because we do not have an aggregate
 * for storing only last value, this is the best we can do. We are
 * effectively storing latest computed value.
 */
@t[this->name] = max(realtm);
this->size = opsz[args[0]->ci_remote,args[1]->noi_xid];
this->buck = 512; /* We assume this is smallest possible size */
this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
this->buck = ( this->size > 65536 ) ? 0x20000 : this->buck ;
this->buck = ( this->size > 0x20000 ) ? 0x40000 : this->buck ;
this->buck = ( this->size > 0x40000 ) ? 0x80000 : this->buck ;
this->buck = ( this->size > 0x80000 ) ? 0x100000 : this->buck ;
this->buck = ( this->size > 0x100000 ) ? 0x200000 : this->buck ;
@lmin[this->name] = min(this->delta);
@lmax[this->name] = max(this->delta);
@a[this->name] = avg(this->delta);
@maxiosz[this->name] = max(this->size);
@i[this->name] = count();
/* Average bytes per microsecond of IO time. */
this->bpus = this->delta > 0 ? (this->size*1000 / this->delta) : this->size;
@abus[this->name] = avg(this->bpus);
/* Do not normalize (@ios) to get total I/Os per interval. */
@ios[this->name] = count();
@b[this->name] = sum(opsz[args[0]->ci_remote, args[1]->noi_xid]);
@sz512[this->name] = sum(this->buck == 512 ? 1 : 0 );
@sz1024[this->name] = sum(this->buck == 1024 ? 1 : 0 );
@sz2048[this->name] = sum(this->buck == 2048 ? 1 : 0 );
@sz4096[this->name] = sum(this->buck == 4096 ? 1 : 0 );
@sz8192[this->name] = sum(this->buck == 8192 ? 1 : 0 );
@sz16384[this->name] = sum(this->buck == 16384 ? 1 : 0 );
@sz32768[this->name] = sum(this->buck == 32768 ? 1 : 0 );
@sz65536[this->name] = sum(this->buck == 65536 ? 1 : 0 );
@sz131072[this->name] = sum(this->buck == 131072 ? 1 : 0 );
/*
 * There is an aggregate for each possible flag, and we are counting
 * number of occurrences of each, reporting each as a separate field.
 */
@nos[this->name] = sum(this->flag == 0 ? 1 : 0 );
/*
 * BUG FIX: per the stable_how map in BEGIN (1 == DSYNC, 2 == FSYNC),
 * the fsync and dsync counters previously tested the swapped values,
 * so the "fsync" CSV column actually reported DSYNC counts and vice
 * versa.
 */
@fs[this->name] = sum(this->flag == 2 ? 1 : 0 );
@ds[this->name] = sum(this->flag == 1 ? 1 : 0 );
tm[args[0]->ci_remote,args[1]->noi_xid] = 0;
opsz[args[0]->ci_remote,args[1]->noi_xid] = 0;
flag[args[0]->ci_remote,args[1]->noi_xid] = 0;
self->nfs_r -= probename == "op-read-done" ? 1 : 0;
}
tick-1sec /(timestamp - start) > 0 && cnt/ {
/*
 * Normalizing by 1024 can introduce sample error when the value being
 * divided is less than 1024. Since there is no floating point division,
 * resulting value will be 0, which if data is routinely less than 1024,
 * can create a serious discontinuity between real data and normalized
 * data.
 */
normalize(@lmin, NS_US);
normalize(@lmax, NS_US);
normalize(@a, NS_US);
normalize(@b, NORM_KB);
normalize(@t, NS_MS);
printa(
"%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
@t, @lmin, @lmax, @a, @maxiosz, @i, @b,
@sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
@sz32768, @sz65536, @sz131072, @nos, @fs, @ds,
@arch, @arcm, @arcszav, @arcszmin, @arcszmax,
@arcmruszav, @arcmruszmin, @arcmruszmax, @arcmfuszav, @arcmfuszmin, @arcmfuszmax, @ios, @abus);
/*
 * Truncate aggregations and re-set timestamp for next sampling period.
 */
trunc(@t); trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i);
trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
trunc(@sz131072); trunc(@nos); trunc(@fs); trunc(@ds);
trunc(@ios); trunc(@abus); trunc(@arch); trunc(@arcm);
trunc(@arcszav); trunc(@arcszmin); trunc(@arcszmax);
trunc(@arcmruszav); trunc(@arcmruszmin); trunc(@arcmruszmax);
trunc(@arcmfuszav); trunc(@arcmfuszmin); trunc(@arcmfuszmax);
start = timestamp;
}'
nfs-v3-one-liners.txt
// What is the recordsize of operations, both reads and writes?
// Note: op-commit-start events fall into the "R" bucket of the ternary below.
dtrace -qn 'nfsv3:::op-commit-start,nfsv3:::op-read-start,nfsv3:::op-write-start {@[probename=="op-write-start" ? "W" : "R"] = quantize(args[2]->count);}'
/+
A simple example showing usage of the
EventHandler struct.
+/
import eventhandler;
import std.stdio;
class List( T )
{
    private alias EventHandler!( T ) AddedHandler;

    private AddedHandler addedEvent; // raised after every successful Add
    private T[] storage;             // backing store for the list contents

    /// Event accessor; callers subscribe with += and unsubscribe with -=.
    public ref AddedHandler ItemAdded() @property
    {
        return this.addedEvent;
    }

    /// Returns a defensive copy so callers cannot mutate internal state.
    public T[] Items() @property
    {
        return this.storage.dup;
    }

    /// Appends `item`, then fires the ItemAdded event with it.
    public void Add( T item )
    {
        this.storage ~= item;
        this.addedEvent( item );
    }
}
/// Subscriber callback: announces the newly added list item on stdout.
void printNewItem( string item )
{
    writefln( "'%s' was added to the list.", item );
}
void main()
{
    // Subscribe before adding, so the Add below is announced.
    auto strings = new List!( string );
    strings.ItemAdded += &printNewItem;
    strings.Add( "foo" );
    // Output: 'foo' was added to the list.
}
EventHandler.d
module eventhandler;
/+
EventHandler implementation.
+/
struct EventHandler( TArgs... )
{
    private import std.algorithm;

    /// Signature for delegate subscribers (closures / member functions).
    public alias void delegate( TArgs ) DelegateType;
    /// Signature for free-function subscribers.
    public alias void function( TArgs ) FunctionType;

    private FunctionType[] functions;
    private DelegateType[] delegates;

    /// Direct assignment is forbidden; subscribers must use += / -=.
    public void opAssign( T... )( T args )
    {
        throw new Exception( "EventHandler objects may not be assigned to directly, please use += or -= instead." );
    }

    /// Subscribe a free function.
    public void opOpAssign( string op )( FunctionType fn ) if( op == "+" )
    {
        this.functions ~= fn;
    }

    /// Subscribe a delegate.
    public void opOpAssign( string op )( DelegateType dg ) if( op == "+" )
    {
        this.delegates ~= dg;
    }

    /// Unsubscribe a free function; unknown handlers are silently ignored.
    public void opOpAssign( string op )( FunctionType fn ) if( op == "-" )
    {
        if( !this.functions.canFind( fn ) )
            return;
        else
            this.functions = this.functions.remove!( x => x == fn );
    }

    /// Unsubscribe a delegate; unknown handlers are silently ignored.
    public void opOpAssign( string op )( DelegateType dg ) if( op == "-" )
    {
        if( !this.delegates.canFind( dg ) )
            return;
        else
            this.delegates = this.delegates.remove!( x => x == dg );
    }

    /// Raise the event: invoke every subscribed function and delegate
    /// with `args`, interleaving the two lists in registration order.
    public void opCall( TArgs args )
    {
        // BUG FIX: array lengths are size_t (ulong on 64-bit targets);
        // the previous `int len = max(...)` was an illegal implicit
        // narrowing conversion and failed to compile on instantiation.
        immutable len = max( this.delegates.length, this.functions.length );
        for( size_t i = 0; i < len; ++i )
        {
            if( i < this.functions.length )
                this.functions[i]( args );
            if( i < this.delegates.length )
                this.delegates[i]( args );
        }
    }
}
# Trace smb2 write events
dtrace -qn 'fbt:smbsrv:smb2_write:entry {
/* Count writes keyed by user name/uid, last SMB error status/code,
 * share name, volume, resource and target file path. */
@[stringof(args[0]->uid_user->u_name),
(uint16_t)args[0]->uid_user->u_uid,
args[0]->smb_error.status,
(uint16_t)args[0]->smb_error.errcode,
stringof(args[0]->tid_tree->t_sharename),
stringof(args[0]->tid_tree->t_volume),
stringof(args[0]->tid_tree->t_resource),
stringof(args[0]->tid_tree->t_snode->vp->v_path)
]= count(); }'
# Trace mkdir and rename operations with vnode info and name of file or dir being changed
dtrace -qn 'fbt:smbsrv:smb_vop_mkdir:entry,
fbt:smbsrv:smb_vop_rename:entry {
/* Key: function, name being created/renamed, parent vnode path.
 * Printed and reset once per second. */
@[probefunc, stringof(args[1]), stringof(args[0]->v_path)] = count(); }
tick-1sec {printa(@); trunc(@);}'
smb-write-offsets.d
// Measure Offsets of IOs issued, by file. We can observe with this snippet
// how IOs are issued, in terms of whether offsets are pretty constant or
// there is a healthy mix of different sizes. In general, differently sized
// offsets is a pretty solid indicator that IO is mostly random.
// This is an example of what we might see. In this instance all offsets are
// 1MB in size.
// /storage/p01/global/smb/00/CentOS-6.8-x86_64-LiveDVD.iso
// value ------------- Distribution ------------- count
// 524288 | 0
// 1048576 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 391
// 2097152 |
dtrace -qn '
/* Per-thread last-seen write offset, keyed by target smb node. */
self offset_t prev[smb_node_t *];
/* First write seen (or a write after the reset below): seed the offset. */
::smb_fsop_write:entry /!self->prev[args[2]]/ {
self->prev[args[2]] = args[3]->_uio_offset._f;
}
::smb_fsop_write:entry
/self->prev[args[2]] != args[3]->_uio_offset._f/ {
this->o = args[3]->_uio_offset._f;
this->diff = this->o > self->prev[args[2]] ?
this->o - self->prev[args[2]] :
self->prev[args[2]] - this->o;
@[stringof(args[2]->vp->v_path)] = quantize(this->diff);
/* NOTE(review): resetting to 0 makes the next write re-seed via the
 * first clause instead of comparing against this offset, so only every
 * other IO yields a delta. If consecutive deltas are intended, this
 * should probably be `self->prev[args[2]] = this->o;` - confirm. */
self->prev[args[2]] = 0;
}'
smb-vop_lookup-access.d
dtrace -qn '
/*
 * Count smb_vop_lookup() calls by path, caller uid/gid and return code,
 * dumping the table when the script exits.
 * See /usr/src/uts/common/sys/errno.h for all error codes.
 */
::smb_vop_lookup:entry /args[0]->v_path != NULL/ {
/* BUG FIX: previous stringof expression applied stringof to the whole
 * vnode struct instead of its v_path member. */
self->path = stringof(args[0]->v_path);
self->user = args[8]->cr_uid;
/* BUG FIX: the gid column previously copied cr_uid. */
self->group = args[8]->cr_gid;
}
::smb_vop_lookup:return /self->path != NULL/ {
this->err = args[1]; /* non-zero here implies a problem */
@[self->path, self->user, self->group, this->err] = count();
/* Clear thread-locals to avoid dynamic-variable drops. */
self->path = 0;
self->user = 0;
self->group = 0;
}
END {
printa("Path: %s uid: %d gid: %d retcode: %d Count: %@d\n", @);
}'
smb-oplock-acquire-time.d
dtrace -qn '
/*
 * Measure time spent acquiring SMB oplocks, per file name, reporting
 * total and average nanoseconds in CSV form every 5 seconds.
 */
BEGIN {
ts = walltimestamp;
/* print header for CSV */
printf("timestamp,filename,avg.duration.ns,total.duration.ns\n");
}
::smb_oplock_acquire:entry {
/*
 * BUG FIX: clause-local (this->) variables do not survive from the
 * entry probe to the return probe; thread-local (self->) variables
 * must be used to carry state across the call.
 */
self->start = timestamp;
self->fname = stringof(args[2]->f_node->od_name);
}
::smb_oplock_acquire:return /self->start/ {
@[ts, self->fname] = sum(timestamp - self->start);
/* average amount of time per op-lock acquisition */
@av_dur[ts, self->fname] = avg(timestamp - self->start);
/* Release thread-locals to avoid dynamic-variable drops. */
self->start = 0;
self->fname = 0;
}
tick-5sec {
printa("%Y,%s,%@d,%@d\n", @, @av_dur);
trunc(@); trunc(@av_dur);
ts = walltimestamp;
}'
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
This script should give us a quantized distribution of
time delta from io:::start to io:::done, i.e. amount of
time it takes to complete IO operations, separated by
reads and writes.
*/
BEGIN
{
start = timestamp;
}
/* Record start time of each IO, keyed by device and logical block. */
io:::start
{
ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}
io:::done
/ts[args[0]->b_edev, args[0]->b_lblkno]/
{
/* Completion latency in microseconds for this IO. */
this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
"read " : "write ";
@q[this->name] = quantize(this->delta);
@a[this->name] = avg(this->delta);
@v[this->name] = stddev(this->delta);
@i[this->name] = count();
@b[this->name] = sum(args[0]->b_bcount);
/* Clear the key so the dynamic variable can be reclaimed. */
ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}
END
{
printa(@q);
/* Convert totals to per-second rates over the whole run.
 * NOTE(review): if the script runs for under one second the divisor
 * is 0 and normalize() will fail - confirm minimum expected runtime. */
normalize(@i, (timestamp - start) / 1000000000);
normalize(@b, (timestamp - start) / 1000000000 * 1024);
printf("%-30s %11s %11s %11s %11s\n", "", "avg latency", "stddev",
"iops", "throughput");
printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
}
txg-syncing-size.d
// On a healthy system we should see these events roughly every 5 seconds.
// Give name of pool as single argument to this snippet.
# dtrace -qn '
txg-syncing
{
this->dp = (dsl_pool_t *)arg0;
}
/* Only report the pool whose name matches $$1 (the script argument);
 * this-> persists across clauses within a single probe firing. */
txg-syncing
/this->dp->dp_spa->spa_name == $$1/
{
printf("%Y %4dMB of %4dMB used\n", walltimestamp, this->dp->dp_dirty_total >> 20,
`zfs_dirty_data_max >> 20);
}
'
// A more generic version where all pools are given to us as the syncs happen.
# dtrace -qn '
txg-syncing
{
this->dp = (dsl_pool_t *)arg0;
/* Copy the spa struct by value so spa_name is readable below. */
this->spa = *((dsl_pool_t *)arg0)->dp_spa;
}
txg-syncing
/this->spa.spa_name != NULL/
{
printf("%Y %-16s %4dMB of %4dMB used\n",
walltimestamp,
this->spa.spa_name,
this->dp->dp_dirty_total >> 20,
`zfs_dirty_data_max >> 20);
}
'