DTrace snippets for measuring filesystem and fsflush activity

fsflush-latency-one-liners.d
# dtrace -qn '::fsflush_do_pages:entry {self->in = timestamp} ::fsflush_do_pages:return {@delta = quantize((timestamp - self->in) / 1000);}'
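
# A companion one-liner, a minimal sketch reusing only the same fsflush_do_pages kernel function, counts how often the page-flushing pass runs per 10-second window:
# dtrace -qn '::fsflush_do_pages:entry { @runs = count(); } tick-10sec { printa("fsflush_do_pages runs per 10s: %@d\n", @runs); trunc(@runs); }'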

DTrace scripts for interacting with filesystem events

fsiocsv.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
  Copyright 2009-2015 RackTop Systems Inc. and/or its affiliates.
  http://www.racktopsystems.com

  The methods and techniques utilized herein are considered TRADE SECRETS
  and/or CONFIDENTIAL unless otherwise noted. REPRODUCTION or DISTRIBUTION
  is FORBIDDEN, in whole and/or in part, except by express written permission
  of RackTop Systems.

  @@DESCRIPTION@@  Generate detailed CSV output of all reads and writes.
  @@NAME@@         fsiocsv.d
  @@STABILITY@@    unstable
  @@VERSION@@      1.0.1

  This script produces a CSV-like output for all reads and writes
  occurring on the system, without limiting scope to any particular path,
  filesystem, etc.
  ---
  Output looks something like this:
  2015 Jun 28 07:30:23,/storage/p01/test/01/benchmark.1.1,W,425984
  2015 Jun 28 07:30:18,/storage/p01/test/01/benchmark.0.1,W,9043968
  2015 Jun 28 07:30:18,/storage/p01/test/01/benchmark.2.3,W,9404416
  ---
  Redirect the output of this script to a log file, and send stderr to
  /dev/null to avoid annoying periodic null-pointer errors, which the script
  occasionally fails to handle, likely due to timing issues.
  Something like this should do: /path/to/fsiocsv.d >/path/to/log 2>/dev/null

  We cannot guarantee that all file IO is captured by this script, but we
  are confident that the number missed is not significant enough to consider
  this script invalid.
 */

BEGIN {
    /* Extend comment to next line to disable header. */
    printf("timestamp,path,op,source,bytes\n");
}
tick-10sec {
    ts = walltimestamp;
}
fsinfo::fop_read:read /ts/ {
    fpathR = args[0]->fi_pathname;
}

::fop_read:entry /args[0]->v_path != NULL &&
                  stringof(args[0]->v_path) == fpathR/
{
    this->src = execname == "" ? "nocmd" : execname;
    @[ts, fpathR, "R", this->src] = sum(args[1]->uio_resid);
    fpathR = 0;
}

fsinfo::fop_write:write /ts/ {
    fpathW = args[0]->fi_pathname;
}

::fop_write:entry /args[0]->v_path != NULL &&
                   stringof(args[0]->v_path) == fpathW /
{
    this->src = execname == "" ? "nocmd" : execname;
    @[ts, fpathW, "W", this->src] = sum(args[1]->uio_resid);
    fpathW = 0;

}
tick-10sec {
    printa("%Y,%s,%s,%s,%@d\n", @);
    trunc(@);
}
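
// A simpler alternative sketch that avoids the path matching between the fsinfo
// and fbt probes above, at the cost of relying on the assumption that arg1 on
// the fsinfo read/write probes carries the byte count of the operation:
dtrace -qn '
  fsinfo:::read, fsinfo:::write
  {
    /* key by path, R/W, and requesting command; sum the (assumed) byte count */
    @bytes[args[0]->fi_pathname, probename == "read" ? "R" : "W",
           execname == "" ? "nocmd" : execname] = sum(arg1);
  }
  tick-10sec
  {
    printa("%s,%s,%s,%@d\n", @bytes);
    trunc(@bytes);
  }'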

init.d script for SNMPD

snmpd-init.d
#!/bin/sh
#
# snmpd	This shell script takes care of starting and stopping
#	the net-snmp SNMP daemon
#
# chkconfig: - 26 74
# description: snmpd is net-snmp SNMP daemon.

# Source function library.
. /etc/rc.d/init.d/functions

# Source networking configuration.
. /etc/sysconfig/network

# Check that networking is up.
[ "${NETWORKING}" = "no" ] && exit 0

RETVAL=0
name="snmpd"
prog="/usr/local/sbin/snmpd"

[ -x $prog -a -f /usr/local/share/snmp/snmpd.conf ] || exit 0

start() {
        # Start daemons.
        echo -n $"Starting $name: "
        daemon $prog
	RETVAL=$?
        echo
        [ $RETVAL -eq 0 ] && touch /var/lock/subsys/$name
	return $RETVAL
}

stop() {
        # Stop daemons.
        echo -n $"Shutting down $name: "
	killproc $prog
	RETVAL=$?
        echo
        [ $RETVAL -eq 0 ] && rm -f /var/lock/subsys/$name
	return $RETVAL
}

# See how we were called.
case "$1" in
  start)
	start
        ;;
  stop)
	stop
        ;;
  status)
	status $name
	RETVAL=$?
	;;
  restart|reload)
	stop
	start
	RETVAL=$?
	;;
  condrestart)
	if [ -f /var/lock/subsys/$name ]; then
	    stop
	    start
	    RETVAL=$?
	fi
	;;
  *)
        echo $"Usage: $0 {start|stop|restart|condrestart|status}"
        exit 1
esac

exit $RETVAL

A powerful-looking setter function

gistfile1.d
void set(T, Args...)(ref T hoge, Args args)
{
    template GetFieldIndex(T, size_t idx, Args...)
    {
        alias Field = typeof(T.init.tupleof);

        // Decide which member of the struct each argument gets assigned to
        template Skip(size_t i, size_t j)
        if(i < Field.length && j <= idx)
        {
            static if(!is(Args[idx] : Field[i])){
                alias Skip = Skip!(i+1, j);             // cannot be assigned here, so skip this field
            }else static if(idx == j){
                enum size_t index = i;                  // matched, so assign to this field
            }else static if(is(Args[j] : Field[i])){
                alias Skip = Skip!(i+1, j+1);           // type matches, but it is not this argument's turn yet, so skip
            }else
                alias Skip = Skip!(i, j+1);             // type did not match, so skip
        }

        alias GetFieldIndex = Skip!(0, 0).index;
    }


    foreach(i, e; args){
        hoge.tupleof[GetFieldIndex!(T, i, Args)] = e;
    }
}

/**
Try it out
*/
struct S { int a; string b; }
struct T { string a; int[] b; string c; }

import std.stdio;

void main()
{
    S s;
    s.set(1, "b"); writeln(s);
    s.set("b", 1); writeln(s);

    T t;
    t.set([1], "a", "c"); writeln(t);
    t.set("a", [1], "c"); writeln(t);
}

DTrace snippets for observing mptsas

mptsas-target-inflight-ncmds.d
dtrace -qn '
    ::mptsas_start_cmd:entry {
        this->ncmds = args[1]->cmd_tgt_addr->m_t_ncmds;
        this->t_wwn = args[1]->cmd_tgt_addr->m_addr.mta_wwn;
        @max_cmds[this->t_wwn] = max(this->ncmds);
        @plot[this->t_wwn] = quantize(this->ncmds);
    }
    END {
        printa("           Target: %-16X\tMax.in-flight.Cmds: %@d %@d\n", 
                @max_cmds, @plot);
    }'
mptsas-ctrlr-inflight-ncmds.d
dtrace -qn '
    ::mptsas_handle_scsi_io_success:entry {
        self->x = args[0];
    } 
    ::mptsas_handle_scsi_io_success:return {
        @ = quantize(self->x->m_ncmds);
    }'
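
// A related sketch, reusing only the mptsas_start_cmd argument and the
// m_addr.mta_wwn field already shown above, counts commands issued per target
// in 10-second windows:
dtrace -qn '
    ::mptsas_start_cmd:entry {
        @cmds[args[1]->cmd_tgt_addr->m_addr.mta_wwn] = count();
    }
    tick-10sec {
        printa("           Target: %-16X\tCmds.last.10s: %@d\n", @cmds);
        trunc(@cmds);
    }'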

[DTrace IO snippets] Various pieces dealing with low-level SCSI information #tags: io provider, io, dtrace, device alignment, queue depth, latency, throughput, bandwidth, IO size, seek size, seek, biowait, biodone

biowait-latency.d
dtrace -qn '
  ::biowait:entry
  / execname != "sched" && execname != "fsflush" / {
    /* thread-local (self->) so the values survive from entry to return */
    self->start = timestamp ;
    self->buf = args[0] ;
  }
  ::biowait:return
  / self->start && execname != "sched" && execname != "fsflush" / {
    this->delta = timestamp - self->start ;

    /* This semaphore count may be interesting to track also */
    this->sema = (sema_impl_t *)&self->buf->b_sem ;

    /* Categorize results by whether or not there was an EIO */
    this->err = args[1] > 0 ? "failed (EIO)" : "completed" ;
    @q[execname, this->err] = quantize(this->delta) ;
    self->start = 0 ;
    self->buf = 0 ;
  }
  END {
      printa("\t   [%s] | IO => %s%@d\n", @q);
  }'
seeksize.d
#!/usr/sbin/dtrace -s
/*
 * seeksize.d - analyse disk head seek distance by process.
 *              Written using DTrace (Solaris 10 3/05).
 *
 * Disk I/O events caused by processes will in turn cause the disk heads
 * to seek. This program analyses those seeks, so that we can determine
 * if processes are causing the disks to seek in a "random" or "sequential"
 * manner.
 *
 * 15-Jun-2005, ver 1.00
 *
 * USAGE:	seeksize.d		# wait several seconds, then hit Ctrl-C
 *
 * FIELDS:
 *		PID	process ID
 *		CMD	command and argument list
 *		value	distance in disk blocks (sectors)
 *		count	number of I/O operations
 *
 * SEE ALSO: bitesize.d, iosnoop
 *
 * Standard Disclaimer: This is freeware, use at your own risk.
 *
 * 11-Sep-2004	Brendan Gregg	Created this.
 * 10-Oct-2004	   "      "	Rewrote to use the io provider.
 */

#pragma D option quiet

/*
 * Print header
 */
dtrace:::BEGIN
{
	printf("Sampling... Hit Ctrl-C to end.\n");
}

self int last[dev_t];

/*
 * Process io start
 */
io:::start
{
	/* fetch details */
	this->dev = args[0]->b_edev;
	this->blk = args[0]->b_blkno;
	this->size = args[0]->b_bcount;
	cmd = (string)curpsinfo->pr_psargs;
}
io:::start
/self->last[this->dev] != 0/
{
	/* calculate seek distance */
	this->dist = this->blk - self->last[this->dev] > 0 ?
	    this->blk - self->last[this->dev] :
	    self->last[this->dev] - this->blk;

	/* store details */
	@Size[pid,cmd] = quantize(this->dist);
}
io:::start
{
	/* save last position of disk head */
	self->last[this->dev] = this->blk + this->size / 512;
}

/*
 * Print final report
 */
dtrace:::END 
{
	printf("\n%8s  %s\n","PID","CMD");
	printa("%8d  %s\n%@d\n",@Size);
}
seek-distance.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * This script provides useful information about the distances we have to
 * seek, as well as direction and sequential vs. random IO observations.
 * We account for which disk the seeking pertains to, so the distances
 * measured are relevant to each device, but numbers are reported
 * cumulatively for all devices. It may also be useful to report an average
 * seek distance for all IOs observed in a sample period (a sketch of this
 * follows the script).
 */

self long last[dev_t]; 

io:::start
/self->last[args[0]->b_edev] != 0/
{
    this->iotype = args[0]->b_flags & B_WRITE ? "WRITE" :
                    args[0]->b_flags & B_READ ? "READ"  : "NOT R/W";

    this->last = self->last[args[0]->b_edev];
    this->dist = (int)(args[0]->b_blkno - this->last) > 0 ?
        args[0]->b_blkno - this->last : this->last - args[0]->b_blkno;
      
    this->fwd = args[0]->b_blkno + 
                (args[0]->b_bcount / 512) > this->last ? 1 : 0;

    this->seq = this->dist == 0 ? 0 : 1;
    this->direction = args[0]->b_blkno - this->last > 0 ? "RAND FW =>" : 
                      this->last - args[0]->b_blkno > 0 ? "<= RAND REV" :
                      this->last == args[0]->b_blkno && this->fwd ? 
                      "SEQ FW =>" : "<= SEQ REV";

    @gap[this->iotype, this->direction] = quantize(this->dist);
}

io:::start
{
  self->last[args[0]->b_edev] = args[0]->b_blkno +
      args[0]->b_bcount / 512;
}

dtrace:::END
{
  printa("        IO Type: %-8s Direction: %-12s\n%@d\n", @gap);
}
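
// As the header comment suggests, a minimal sketch that reports an average
// seek distance per sampling period, using the same io provider fields and
// the same self->last bookkeeping as the script above:
dtrace -qn '
  self long last[dev_t];
  io:::start /self->last[args[0]->b_edev] != 0/ {
    this->last = self->last[args[0]->b_edev];
    this->dist = (int)(args[0]->b_blkno - this->last) > 0 ?
        args[0]->b_blkno - this->last : this->last - args[0]->b_blkno;
    @avdist["avg.seek.dist(blocks)"] = avg(this->dist);
  }
  io:::start {
    self->last[args[0]->b_edev] = args[0]->b_blkno + args[0]->b_bcount / 512;
  }
  tick-10sec {
    printa("%s: %@d\n", @avdist);
    trunc(@avdist);
  }'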
sd-power-change-state.d
# Check for entry into power management functions in sd driver.
dtrace -qn '
  tick-1sec {
    ts = walltimestamp;
  }
  :::pm_lower_power:entry,
  :::sd_pm*:entry,
  :::sd_setup_pm:entry,
  :::sdpower:entry {@[ts, probefunc] = count();
  }
  tick-10sec {
    printa("%Y,%s,%@d\n", @);
    trunc(@);
  }'
sd-pm-devinfo.d
dtrace -qn '
    tick-1sec {
    ts = walltimestamp;
    x = 1;
    }
    :::sd_pm_entry:entry /x/ {
    this->inq = ((struct sd_lun *)args[0])->un_sd->sd_inq;
    this->p = stringof(args[0]->un_sd->sd_inq->inq_pid);
    printf("%Y,%s,%s\n", ts, stringof(this->inq->inq_vid), stringof(this->inq->inq_pid));
    }'
sd-mode-page-changes.d
dtrace -qn '::sd_enable_descr_sense:entry {print(*(struct sd_lun *)args[0]->ssc_un);}'

// How long does it take for the system to deal with a mode page change?
// In some cases we notice this can be rather slow, i.e. seconds.
dtrace -qn '
    ::sd_enable_descr_sense:entry { self->in=timestamp; }
    ::sd_enable_descr_sense:return {
        printf("%d\n", (timestamp - self->in) / 1000);
    }'
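
// For a distribution rather than one line per call, the same probe pair can
// feed a quantize() aggregation; a minimal sketch:
dtrace -qn '
    ::sd_enable_descr_sense:entry { self->in = timestamp; }
    ::sd_enable_descr_sense:return /self->in/ {
        /* latency in microseconds */
        @lat = quantize((timestamp - self->in) / 1000);
        self->in = 0;
    }'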
sd-longest-io.d
dtrace -qn '
  inline int threshold = 500000;
  inline int nano_to_micro = 1000;
  BEGIN {
    start = timestamp;
  }
  ::sd_start_cmds:entry {
    self->a = args[0]; 
    self->s = timestamp;
    self->devi = (struct dev_info *)args[0]->un_sd->sd_dev;
    @ioc = count();
  }
  ::sd_start_cmds:return /timestamp - self->s > threshold/ {
    this->devname = (struct dev_info *)self->devi->devi_devid_str;
    @q[stringof(this->devname)] =  quantize(timestamp - self->s);
    @iolc[stringof(this->devname)] = count();
    @longest[stringof(this->devname)] = max(timestamp - self->s);
  }
  ::sd_start_cmds:return {
    @avlat = avg(timestamp - self->s);
    self->a = 0; self->s = 0; self->devi = 0;
  }
  END {
    this->runt = (timestamp - start) / 1000000000;
    normalize(@avlat, nano_to_micro);
    normalize(@longest, nano_to_micro);
    normalize(@ioc, this->runt);
    printa("Device: %s\n# Long IOs: %@d Longest IO: %@d(us)\n%@d\n",
            @iolc, @longest, @q);
    printf("Runtime: %d(sec)\n", this->runt);
    printa("Average IO/sec: %@d Average Latency: %@d(us)\n", @ioc, @avlat);
  }'
sd-failed-command-info.d
// This should give us a bit of information about a failed command returned
// to the caller. In some cases we may get a path, and in most cases we should
// have the exec'ing command.
dtrace -qn '
  int x;
  ::sd_return_failed_command:entry {
    ++x;
    this->path = args[1]->b_file != NULL ?
    stringof(args[1]->b_file->v_path) : "nopath";
    this->proc = args[1]->b_proc != NULL ?
    stringof(args[1]->b_proc->p_exec->v_path) : "noproc";
    @fail[args[0], this->path, this->proc] = count();
  }
  tick-10sec /x/ {
    printa("ptr: %p path: %s cmd: %s = %@d\n", @fail);
    trunc(@fail); x = 0;
  }'
sd-av-io-rw-counts-size.d
dtrace -qn '
  /* 
   * All Reads and Writes go through this function, so it is a convenient
   * place to take counts.
   */
  BEGIN { start = timestamp ; }
  ::bioerror:entry /args[0]->b_flags&B_READ/ {
    @r = sum(args[0]->b_flags&B_READ ? 1 : 0);
    @rsz = sum(args[0]->b_bcount >> 10);
  }
  ::bioerror:entry /args[0]->b_flags&B_WRITE/ {
    @w  = sum(args[0]->b_flags&B_WRITE ? 1 : 0);
    @wsz = sum(args[0]->b_bcount >> 10);
  }
  END {
    normalize(@r, (timestamp - start) / 1000000000);
    normalize(@w, (timestamp - start) / 1000000000);
    normalize(@rsz, (timestamp - start) / 1000000000);
    normalize(@wsz, (timestamp - start) / 1000000000);
    printa("Av_R/s: %@d Av_KBr/s: %@d Av_W/s: %@d Av_KBw/s: %@d\n",
            @r, @rsz, @w, @wsz);
  }'
scsilatbycmdcsv.d
dtrace -qn '
/* This value may require adjustment, depending upon the amount of IO a system does. */
#pragma D option dynvarsize=8m

string scsi_cmd[uchar_t];

dtrace:::BEGIN
{
	/* See /usr/include/sys/scsi/generic/commands.h for the full list. */
	scsi_cmd[0x00] = "test_unit_ready";
	scsi_cmd[0x08] = "read";
	scsi_cmd[0x0a] = "write";
	scsi_cmd[0x12] = "inquiry";
    scsi_cmd[0x15] = "mode_select";
	scsi_cmd[0x17] = "release";
	scsi_cmd[0x1a] = "mode_sense";
	scsi_cmd[0x1b] = "load/start/stop";
	scsi_cmd[0x1c] = "get_diagnostic_results";
	scsi_cmd[0x1d] = "send_diagnostic_command";
	scsi_cmd[0x25] = "read_capacity";
	scsi_cmd[0x28] = "read10";
	scsi_cmd[0x2a] = "write10";
	scsi_cmd[0x35] = "synchronize_cache";
	scsi_cmd[0x4d] = "log_sense";
	scsi_cmd[0x5e] = "persistent_reserve_in";
	scsi_cmd[0xa0] = "report_luns";
}

BEGIN {
    ts = walltimestamp;
    printf("timestamp,cmd,reason,avg.lat,stddev.lat,min.lat,max.lat,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios\n");
}

fbt::scsi_transport:entry
{
	start[arg0] = timestamp;
}

fbt::scsi_destroy_pkt:entry
/start[arg0]/
{
	this->delta = (timestamp - start[arg0]) / 1000;
    start[arg0] = 0;
	
    this->code = *args[0]->pkt_cdbp;
	this->cmd = scsi_cmd[this->code] != NULL ?
	    scsi_cmd[this->code] : lltostr(this->code);
	this->reason = args[0]->pkt_reason == 0 ? "Success" :
	    strjoin("Fail:", lltostr(args[0]->pkt_reason));

    /* There may be a small error in terms of the count measured in @cnt when
     * compared against summing all latency buckets. This should be on the
     * order of 1% or less; we expect that in most cases the two numbers
     * will be identical.
     */
    @cnt[ts, this->cmd, this->reason]      = count();
    @latavg[ts, this->cmd, this->reason]   = avg(this->delta);
	@latsd[ts, this->cmd, this->reason]    = stddev(this->delta);
	@latmin[ts, this->cmd, this->reason]   = min(this->delta);
    @latmax[ts, this->cmd, this->reason]   = max(this->delta);
    /* 
     * Group latencies into buckets from 1000 to above 64K microseconds,
     * starting small and then increasing rapidly, meaning error in reporting
     * will not scale linearly.
     */
    @lat1000[ts, this->cmd, this->reason]  = sum(this->delta > 0 && 
                                                this->delta <= 1000 ? 1 : 0);
    @lat2000[ts, this->cmd, this->reason]  = sum(this->delta > 1000 &&      
                                                this->delta <= 2000 ? 1 : 0);
    @lat4000[ts, this->cmd, this->reason]  = sum(this->delta > 2000 && 
                                                this->delta <= 4000 ? 1 : 0);
    @lat8000[ts, this->cmd, this->reason]  = sum(this->delta > 4000 && 
                                                this->delta <= 8000 ? 1 : 0);
    @lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000 && 
                                                this->delta <= 16000 ? 1 : 0);
    @lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000 && 
                                                this->delta <= 32000 ? 1 : 0);
    @lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000 && 
                                                this->delta <= 64000 ? 1 : 0);
    @latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000 ? 1 : 0);
}

tick-1sec
{
	printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @latavg, @latsd, @latmin, @latmax, @lat1000, @lat2000, @lat4000, 
            @lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt);
    trunc(@lat1000); trunc(@lat2000); trunc(@lat4000); 
    trunc(@lat8000); trunc(@lat16000); trunc(@lat32000); 
    trunc(@lat64000); trunc(@latgt64k);
    trunc(@cnt); trunc(@latavg); trunc(@latsd); trunc(@latmin); trunc(@latmax);
    ts = walltimestamp;
}'
scsilatbycmd.csv-latest.d
dtrace -qn '
/* This value may require adjustment, depending upon the amount of IO a system does. */
#pragma D option dynvarsize=8m

/* This version will not work on versions of BrickStorOS (bsros) earlier than
 * BrickStorOS bsr-fdb711e2 15.07 15.0724.001D i86pc i386 i86pc.
 */

string scsi_cmd[uchar_t];

dtrace:::BEGIN
{
    /* See /usr/include/sys/scsi/generic/commands.h for the full list. */
    scsi_cmd[0x00] = "test_unit_ready";
    scsi_cmd[0x03] = "request_sense";
    scsi_cmd[0x08] = "read";
    scsi_cmd[0x0a] = "write";
    scsi_cmd[0x12] = "inquiry";
    scsi_cmd[0x15] = "mode_select";
    scsi_cmd[0x17] = "release";
    scsi_cmd[0x1a] = "mode_sense";
    scsi_cmd[0x1b] = "load/start/stop";
    scsi_cmd[0x1c] = "get_diagnostic_results";
    scsi_cmd[0x1d] = "send_diagnostic_command";
    scsi_cmd[0x25] = "read_capacity";
    scsi_cmd[0x28] = "read10";
    scsi_cmd[0x2a] = "write10";
    scsi_cmd[0x35] = "synchronize_cache";
    scsi_cmd[0x37] = "read_defect_data10";
    scsi_cmd[0x4d] = "log_sense";
    scsi_cmd[0x5e] = "persistent_reserve_in";
    scsi_cmd[0x88] = "read16";
    scsi_cmd[0x8a] = "write16";
    scsi_cmd[0xa0] = "report_luns";
    scsi_cmd[0xb7] = "read_defect_data12";
}

/* This is a fairly wide table, and some columns can benefit from a bit of
 * explanation. This is a description of all columns in order, all latency
 * bucket columns are lat.XXXXus, where XXXX is a unit of time.
 *
 * timestamp - time event logged
 * cmd - scsi command, one of scsi_cmd[] above
 * reason - usually indicates success or failure of command
 * avg.lat - how long this group of commands takes to complete, on average
 * combined.lat - sum of latencies for all instances of this type of command
 * min.lat - smallest recorded latency value
 * max.lat - largest recorded latency value
 * lat.lt.100us - anything under 100 microseconds
 * lat.XXXXus - histogram of latency, each bucket is 2x previous bucket
 * lat.gt.64000us - anything above 64K microseconds (64 milliseconds)
 * count.ios - number of IOs of given command
 * avg.qd.drv - average queue size in the drive
 * min.qd.drv - smallest recorded queue size in the drive
 * max.qd.drv - largest recorded queue size in the drive
 * avg.qd.xport - average number of IOs in transport
 * min.qd.xport - smallest recorded number of IOs in transport
 * max.qd.xport - largest recorded number of IOs in transport
 */

BEGIN {
    ts = walltimestamp;
    printf("timestamp,cmd,reason,avg.lat,combined.lat,min.lat,max.lat,lat.lt.100us,lat.1000us,lat.2000us,lat.4000us,lat.8000us,lat.16000us,lat.32000us,lat.64000us,lat.gt.64000us,count.ios,avg.qd.drv,min.qd.drv,max.qd.drv,avg.qd.xport,min.qd.xport,max.qd.xport\n");
}

::sd_start_cmds:entry 
{
    x[args[0]->un_sd] = args[0];
}

fbt::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] != NULL/
{
    start[arg0] = timestamp;
    in_xport[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport;
    in_drv[arg0] = x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver;
    /* x[args[0]->pkt_address.a.a_sd] = 0; clear contents of array */
}

/* 
 * Depending on the OS version, this may need to be fbt::scsi_destroy_pkt:entry
 * or fbt::vhci_scsi_destroy_pkt:entry. In the first case pkt_cdbp comes from
 * args[0], and from args[1] otherwise (a minimal sketch of the
 * scsi_destroy_pkt variant follows this script).
 */
fbt::vhci_scsi_destroy_pkt:entry
/start[arg1]/
{
    this->in_xport = in_xport[arg1];
    this->in_drv = in_drv[arg1];
    this->delta = timestamp - start[arg1];
    in_xport[arg1] = 0;
    in_drv[arg1] = 0;
    start[arg1] = 0;
    this->code = *args[1]->pkt_cdbp;
    this->cmd = scsi_cmd[this->code] != NULL ?
        scsi_cmd[this->code] : lltostr(this->code);
    this->reason = args[1]->pkt_reason == 0 ? "Success" :
        strjoin("Fail:", lltostr(args[1]->pkt_reason));

    /* There may be a small error in terms of the count measured in @cnt when
     * compared against summing all latency buckets. This should be on the
     * order of 1% or less; we expect that in most cases the two numbers
     * will be identical.
     *
     * Do not add standard deviation calculation to this tool. Due to a bug
     * in printing aggregations, standard deviation may result in a
     * divisor != 0 assert being triggered.
     */
    @cnt[ts, this->cmd, this->reason]      = count();
    @latavg[ts, this->cmd, this->reason]   = avg(this->delta);
    @latsum[ts, this->cmd, this->reason]   = sum(this->delta);
    @latmin[ts, this->cmd, this->reason]   = min(this->delta);
    @latmax[ts, this->cmd, this->reason]   = max(this->delta);
    
    /* Track queue depth in the drive for each sd_lun and in transport. */
    @qdxpavg[ts, this->cmd, this->reason]  = avg(this->in_xport);
    @qddrvavg[ts, this->cmd, this->reason] = avg(this->in_drv);
    @qdxpmin[ts, this->cmd, this->reason]  = min(this->in_xport);
    @qddrvmin[ts, this->cmd, this->reason] = min(this->in_drv);
    @qdxpmax[ts, this->cmd, this->reason]  = max(this->in_xport);
    @qddrvmax[ts, this->cmd, this->reason] = max(this->in_drv);

    /* 
     * Group latencies into buckets from 1000 to above 64K microseconds,
     * starting small and then increasing rapidly, meaning error in reporting
     * will not scale linearly.
     */
    @latlt100[ts, this->cmd, this->reason]  = sum(this->delta > 0 && 
                                this->delta <= 100000 ? 1 : 0);
    @lat1000[ts, this->cmd, this->reason]  = sum(this->delta > 100000 && 
                                this->delta <= 1000000 ? 1 : 0);
    @lat2000[ts, this->cmd, this->reason]  = sum(this->delta > 1000000 &&      
                                this->delta <= 2000000 ? 1 : 0);
    @lat4000[ts, this->cmd, this->reason]  = sum(this->delta > 2000000 && 
                                this->delta <= 4000000 ? 1 : 0);
    @lat8000[ts, this->cmd, this->reason]  = sum(this->delta > 4000000 && 
                                this->delta <= 8000000 ? 1 : 0);
    @lat16000[ts, this->cmd, this->reason] = sum(this->delta > 8000000 && 
                                this->delta <= 16000000 ? 1 : 0);
    @lat32000[ts, this->cmd, this->reason] = sum(this->delta > 16000000 && 
                                this->delta <= 32000000 ? 1 : 0);
    @lat64000[ts, this->cmd, this->reason] = sum(this->delta > 32000000 && 
                                this->delta <= 64000000 ? 1 : 0);
    @latgt64k[ts, this->cmd, this->reason] = sum(this->delta > 64000000 ? 1 : 0);
}

tick-1sec
{
    /* Instead of dividing each delta by 1000 to convert to microseconds, we
     * normalize the aggregations by 1000, i.e. convert nanoseconds to
     * microseconds.
     */
    normalize(@latavg, 1000);
    normalize(@latsum, 1000); 
    normalize(@latmin, 1000);
    normalize(@latmax, 1000);
    printa("%Y,%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @latavg, @latsum, @latmin, @latmax, @latlt100, @lat1000, @lat2000, @lat4000, 
            @lat8000, @lat16000, @lat32000, @lat64000, @latgt64k, @cnt,
            @qddrvavg, @qddrvmin, @qddrvmax, @qdxpavg, @qdxpmin, @qdxpmax
            );
    trunc(@qdxpavg); trunc(@qdxpmin); trunc(@qdxpmax);
    trunc(@qddrvavg); trunc(@qddrvmin); trunc(@qddrvmax);
    trunc(@latlt100); trunc(@lat1000); trunc(@lat2000); trunc(@lat4000); 
    trunc(@lat8000); trunc(@lat16000); trunc(@lat32000); 
    trunc(@lat64000); trunc(@latgt64k); trunc(@cnt); trunc(@latavg); 
    trunc(@latsum); trunc(@latmin); trunc(@latmax);
    ts = walltimestamp;
}'
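
// As noted in the header comment above, on builds where the non-vhci teardown
// path fires, the consuming probe and its argument change; a minimal sketch of
// that variant (latency distribution keyed by CDB opcode and pkt_reason,
// mirroring scsilatbycmdcsv.d above):
dtrace -qn '
  fbt::scsi_transport:entry { start[arg0] = timestamp; }
  fbt::scsi_destroy_pkt:entry /start[arg0]/ {
    /* here pkt_cdbp and pkt_reason come from args[0], keyed by arg0 */
    @lat[*args[0]->pkt_cdbp, args[0]->pkt_reason] =
        quantize((timestamp - start[arg0]) / 1000);
    start[arg0] = 0;
  }'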
notes.txt
# Correlating sd_start_cmds and scsi_transport

dtrace -qn '::sd_start_cmds:entry {x[args[0]->un_sd] = 1} ::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] > 0/ {trace("X"); x[args[0]->pkt_address.a.a_sd] = 0;}'

dtrace -qn '
  ::sd_start_cmds:entry 
  {
    x[args[0]->un_sd] = args[0];
  }
  ::scsi_transport:entry /x[args[0]->pkt_address.a.a_sd] != NULL/
  {
    @qavx["in-xport"] = 
        avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qavsd["in-sd"] = 
        avg(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
    @qminx["in-xport"] = 
        min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qminsd["in-sd"] = 
        min(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
    @qmaxx["in-xport"] = 
        max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_transport);
    @qmaxsd["in-sd"] = 
        max(x[args[0]->pkt_address.a.a_sd]->un_ncmds_in_driver);
  }
  tick-30sec {
    printa("xport avgq: %@d sd avgq: %@d xport minq: %@d sd minq: %@d xport maxq: %@d sd maxq: %@d\n", 
    @qavx, @qavsd, @qminx, @qminsd, @qmaxx, @qmaxsd);
    trunc(@qavx); trunc(@qavsd); 
    trunc(@qminx); trunc(@qminsd); 
    trunc(@qmaxx); trunc(@qmaxsd);
    }'
    
***

lat-tp-by-pow2-iosize.d
dtrace -qn '
  io:::start {
    start[args[0]->b_edev, args[0]->b_blkno] = timestamp;
  }
  io:::done
  /start[args[0]->b_edev, args[0]->b_blkno]/ {
    this->op = args[0]->b_flags & B_READ ? "Read" : "Write";
    this->devp = strtok(args[1]->dev_pathname, ":");
    this->elapsed = timestamp - 
            start[args[0]->b_edev, args[0]->b_blkno];

    this->b = args[0]->b_bcount >= 1 << 20 ? 1 << 20 :
            args[0]->b_bcount >= 1 << 19 ? 1 << 19 :
            args[0]->b_bcount >= 1 << 18 ? 1 << 18 :
            args[0]->b_bcount >= 1 << 17 ? 1 << 17 :
            args[0]->b_bcount >= 1 << 16 ? 1 << 16 :
            args[0]->b_bcount >= 1 << 15 ? 1 << 15 :
            args[0]->b_bcount >= 1 << 14 ? 1 << 14 :
            args[0]->b_bcount >= 1 << 13 ? 1 << 13 :
            args[0]->b_bcount >= 1 << 12 ? 1 << 12 :
            args[0]->b_bcount >= 1 << 11 ? 1 << 11 :
            args[0]->b_bcount >= 1 << 10 ? 1 << 10 :
            args[0]->b_bcount >= 1 << 9 ? 1 << 9 :
            args[0]->b_bcount >= 1 << 8 ? 1 << 8 :
            args[0]->b_bcount >= 1 << 7 ? 1 << 7 :
            args[0]->b_bcount >= 1 << 6 ? 1 << 6 :
            args[0]->b_bcount >= 1 << 5 ? 1 << 5 :
            args[0]->b_bcount >= 1 << 4 ? 1 << 4 :
            args[0]->b_bcount >= 1 << 3 ? 1 << 3 :
            args[0]->b_bcount >= 1 << 2 ? 1 << 2 :
            args[0]->b_bcount >= 1 << 1 ? 2 :
            args[0]->b_bcount >= 1 << 0 ? 1 : 0;

    /*
     * To get much more verbose output, which breaks this further down
     * by individual device, use this commented aggregate instead of
     * one we are currently using.
     *
     * @[this->op, this->b, args[1]->dev_statname, this->devp] =
     *     quantize(this->elapsed / 1000);
     * And, use this printa instead of one used now.
     * printa("\t   %s(b)=%d <%s> %s%@d\n", @);
     */
    @h[this->op, this->b]   = quantize(this->elapsed / 1000); /* latency in uS */
    @av[this->op, this->b]  = avg(this->elapsed / 1000);
    @tot[this->op, this->b] = sum(this->elapsed / 1000);
    @tp[this->op, this->b]  = avg((args[0]->b_bcount * 976562) / this->elapsed);
    start[args[0]->b_edev, args[0]->b_blkno] = 0;
  }
  END {
    normalize(@tot, 1000);
    printa("\t   %s = %d(b) | AvLat = %@d(uS) | Active = %@d(mS) | Tput = %@d(KB/s)%@d\n",
    @av, @tot, @tp, @h);
  }'
device-queue-depth-2.d
#!/usr/sbin/dtrace -qCs
/* 
 * Script produces histograms to give a sense of how much IO is queued
 * in the driver and in transport. Histogram seems to be a better means
 * of examining this information as opposed to averages, mins and maxes.
 */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

/*
 This script should be run in the following manner, which gives us the ability
 to control how long the script will run before it returns.
 # dtrace -qCs  ./x -n 'tick-5sec { exit(0) }'
 
 The result will look something like this. The serial number is shown if known
 (and not an empty string), and the instance corresponds to what you get
 with `iostat -e`.
 
           [In Driver]>     ZA204NDT            (sd2)
           value  ------------- Distribution ------------- count
               1 |                                         0
               2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 24
               3 |                                         0
*/

::sd_start_cmds:entry {
    this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
    this->devname = DEV_NAME(args[0]);
    this->devinst = DEV_INST(args[0]);

    @htrans[this->serial, this->devname, this->devinst] =
        lquantize(args[0]->un_ncmds_in_transport, 0, 32, 1);
    @hdriv[this->serial, this->devname, this->devinst] =
        lquantize(args[0]->un_ncmds_in_driver, 0, 32, 1);
}

END {
    printa("\t   [In Driver]>     %-16s\t(%s%d) %@d\n", @hdriv);
    printa("\t   [In Transport]>  %-16s\t(%s%d) %@d\n", @htrans);
}
device-queue-depth-1.d
#!/usr/sbin/dtrace -qCs
/* Script tracks max and average number of commands queued.  */
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name)
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

/*
 This script should be run in the following manner, which gives us the ability
 to control how long the script will run before it returns.
 # dtrace -qCs  ./x -n 'tick-5sec { exit(0) }'
 
 The result will look something like this. The serial number is shown if known
 (and not an empty string), and the instance corresponds to what you get
 with `iostat -e`.
 
 
 Device Serial    Instance Max.Queued Avg.Queued Max.Transp Avg.Transp
 -------------    -------- ---------- ---------- ---------- ----------
 Z3G011B2         sd12     17         6          16         5
 Z3G011G8         sd10     19         6          18         5
...trimmed...
*/

::sd_start_cmds:entry {
    this->serial = stringof(args[0]->un_sd->sd_inq->inq_serial) != "" ? stringof(args[0]->un_sd->sd_inq->inq_serial) : "UNKNOWN";
    this->devname = DEV_NAME(args[0]);
    this->devinst = DEV_INST(args[0]);
    @md[this->serial, this->devname, this->devinst] =
        max(args[0]->un_ncmds_in_driver);
    @mt[this->serial, this->devname, this->devinst] =
        max(args[0]->un_ncmds_in_transport);
    @ad[this->serial, this->devname, this->devinst] =
        avg(args[0]->un_ncmds_in_driver);
    @at[this->serial, this->devname, this->devinst] =
        avg(args[0]->un_ncmds_in_transport);
}

END {
        printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "Device Serial", "Instance", 
                "Max.Queued", "Avg.Queued", "Max.Transp", "Avg.Transp");

        printf("%-16s %-8s %-10s %-10s %-10s %-10s\n", "-------------", "--------", 
                "----------", "----------", "----------", "----------");
        printa("%-16s %s%-6d %-10@d %-10@d %-10@d %-10@d\n", @md, @ad, @mt, @at);

}
bdev-seeksize.d
#!/usr/sbin/dtrace -Cs
#pragma D option quiet

#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
/* Not using these for now, but they could offer more information
 * about devices (a short sketch using them follows this script).
 */
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

self long long last_byte_offset[struct sd_lun *];

::sd_core_iostart:entry {
  this->phys_blksz = args[1]->un_phy_blocksize;
  this->blk = args[2]->_b_blkno._p._l;
  this->size = args[2]->b_bcount;
  this->dev = args[1];
}
::sd_core_iostart:entry /self->last_byte_offset[this->dev] != 0/ {
  /* 
   * Create buckets to group distances by IO size. The intent is to measure
   * whether certain patterns are evident, like all small IOs seem to cluster
   * in a particular area of a disk or are they evenly spread out, etc.
   */
  this->bucket = 
    this->size <= 4096 ? "<= 4K" :
    this->size > 4096 && this->size <= 0x8000 ?
        "> 4K to 32K" :
    this->size > 0x8000 && this->size <= 0x10000 ?
        "> 32K to 64K" : "> 64K" ;

  this->dist = this->blk - self->last_byte_offset[this->dev] > 0 ?
        this->blk - self->last_byte_offset[this->dev] :
        self->last_byte_offset[this->dev] - this->blk;
  @seeks[stringof(SD_TO_DEVINFO(this->dev)->devi_devid_str), 
        this->bucket] = quantize(this->dist);
}
::sd_core_iostart:entry {
/* Record offset on device accounting for size of last IO done. */
self->last_byte_offset[args[1]] = 
    this->blk + (this->size / this->phys_blksz);
}
END {
  printa("\t   %s | IO Size: [ %s ] %@d\n", @seeks);
}
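
// A short sketch of how the unused DEV_NAME/DEV_INST macros above could be put
// to work, counting IOs per driver name and instance; this assumes the same
// sd_core_iostart entry point, where args[1] is the sd_lun pointer:
#!/usr/sbin/dtrace -qCs
#define SD_TO_DEVINFO(un) ((struct dev_info *)((un)->un_sd->sd_dev))
#define DEV_NAME(un) \
      stringof(`devnamesp[SD_TO_DEVINFO(un)->devi_major].dn_name) /* ` */
#define DEV_INST(un) (SD_TO_DEVINFO(un)->devi_instance)

::sd_core_iostart:entry
{
  /* e.g. "sd" and 2 for the device iostat reports as sd2 */
  @io[DEV_NAME(args[1]), DEV_INST(args[1])] = count();
}
END
{
  printa("\t   %s%d  IOs: %@d\n", @io);
}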
bdev-lat+iops+thrp.d
dtrace -qn '
/* 
  Script measures and reports, every 10 seconds, read and write latency,
  IOPS, and throughput. This is a useful script to watch while tuning the
  ZFS write throttle mechanism. The goal is to have acceptable latency
  while also maximizing possible throughput to the disk drives.
*/
BEGIN
{
    start = timestamp;
    line = "------------------------------------------------------------------------";
}

io:::start {
    ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
}

io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
    this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
    this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ?
        "Read" : "Write";

    this->size = args[0]->b_bcount;
    this->buck = 512; /* We assume this is smallest possible size */
    this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
    this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
    this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
    this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
    this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
    this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
    this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
    this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
    this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
    this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
    this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;
    
    @iosz[this->name] = quantize(this->buck);
    @q[this->name] = quantize(this->delta);
    @a[this->name] = avg(this->delta);
    @v[this->name] = stddev(this->delta);
    @i[this->name] = count();
    @b[this->name] = sum(args[0]->b_bcount);

    ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}

tick-10sec {
    printf("\n%80s\n", line);
    printa("%14s I/Os by size: %@d\n", @iosz);
    printa("%14s latency(uS): %@d\n", @q);
    /*
     * Adjust for the runtime of each sample, i.e. calculate total seconds
     * since last timestamp, and normalize the data, to get true per second 
     * values.
     */
    normalize(@i, (timestamp - start) / 1000000000);
    normalize(@b, (timestamp - start) / 1000000000 * 1024);
    printf("%-30s %11s %11s %11s %11s\n", "", "avg.lat(uS)", "stddev(uS)",
        "iops", "throughput");
    printa("%-30s %@9uus %@9uus %@9u/s %@8uk/s\n", @a, @v, @i, @b);
    trunc(@iosz); trunc(@q); trunc(@a); trunc(@v); trunc(@i); trunc(@b);
    start = timestamp;
}'
bdev-lat+iops+thrp-csv.d
dtrace -qn '
int realtm, start;
/* int done[string]; */
inline const int NORM_KB = 1024;
inline const int NS_S = 1000000000;
inline const int NS_MS = 1000000;
/* 
 * Script measures and reports, every 5 seconds (see tick-5sec below), read
 * and write latency, IOPS, and throughput. This is a useful script to watch
 * while tuning the ZFS write throttle mechanism. The goal is to have
 * acceptable latency while also maximizing possible throughput to the disks.
 *
 * This script is also ideal for pairing with Load Dynamix standardized 
 * tests that we run, as a means of correlating what happens at the disk
 * level, vs. what happens at the application layer. We want to measure IOs
 * and latency, which gives us a sense of service time at the application
 * layer and see how that translates to IOs and latency at the disk layer.
 */
BEGIN
{
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
        "io.type", "runtime.ms", "min.lat", "max.lat", "avg.lat",
        "max.io.size", "iops", "throughput.KBps",
        "512b.ct", "1024b.ct", "2048b.ct", "4096b.ct", "8192b.ct",
        "16384b.ct", "32768b.ct", "65536b.ct", "131072b.ct",
        "total.ios", "active.ios", "arc.ios", "arc.io.bytes", "avg.bytes.per.us"
  );
  hdr = 1;
  start = timestamp;
}

io:::start {
  this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
  ts[args[0]->b_edev, args[0]->b_lblkno] = timestamp;
  @queued[this->name] = sum(1); /* increment count of active IOs */
}

/* 
 * There is no very direct correlation between IOs measured here
 * and IOs resulting in physical bytes being read from the disks.
 * It does help though to have some context about IOs that do not actually
 * result in physical IO.
 */
::arc_read_done:entry {
  @asz["R"]           = sum((uint64_t)args[0]->io_size) ;
  @aios["R"]          = count() ;
}

io:::done /ts[args[0]->b_edev, args[0]->b_lblkno]/ {
  this->delta = (timestamp - ts[args[0]->b_edev, args[0]->b_lblkno]) / 1000;
  this->name = (args[0]->b_flags & (B_READ | B_WRITE)) == B_READ ? "R" : "W";
  this->size = args[0]->b_bcount; /* How many bytes in this IO? */
  this->buck = 512; /* We assume this is smallest possible IO size. */
  this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
  this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
  this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
  this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
  this->buck = ( this->size > 8192 ) ? 16384 : this->buck ;
  this->buck = ( this->size > 16384 ) ? 32768 : this->buck ;
  this->buck = ( this->size > 32768 ) ? 65536 : this->buck ;
  this->buck = ( this->size > 65536 ) ? 131072 : this->buck ;
  this->buck = ( this->size > 131072 ) ? 262144 : this->buck ;
  this->buck = ( this->size > 262144 ) ? 524288 : this->buck ;
  this->buck = ( this->size > 524288 ) ? 1048576 : this->buck ;

  /*
   * The size buckets are currently limited to 128K, from 512b to 128K,
   * in base2 steps, i.e. 512, 1024, etc.
   * We may want to add larger IO sizes, if we find that they happen.
   */
  @lmin[this->name]   = min(this->delta);
  @lmax[this->name]   = max(this->delta);
  @a[this->name]      = avg(this->delta);
  /* Normalize to 1-second intervals */
  @i[this->name]      = count();
  /* Average bytes per microsecond of IO time. */
  this->bpus = this->delta > 0 ? (this->size / this->delta) : this->size;
  @abus[this->name]   = avg(this->bpus);
  /* Do not normalize (@ios) to get total I/Os per interval. */
  @ios[this->name]    = count();
  @b[this->name]      = sum(args[0]->b_bcount);
  @queued[this->name] = sum(-1); /* decrement count of active IOs */
  @maxiosz[this->name]     = max(this->size);
  @sz512[this->name]  = sum(this->buck == 512 ? 1 : 0 );
  @sz1024[this->name]  = sum(this->buck == 1024 ? 1 : 0 );
  @sz2048[this->name]  = sum(this->buck == 2048 ? 1 : 0 );
  @sz4096[this->name]  = sum(this->buck == 4096 ? 1 : 0 );
  @sz8192[this->name]  = sum(this->buck == 8192 ? 1 : 0 );
  @sz16384[this->name]  = sum(this->buck == 16384 ? 1 : 0 );
  @sz32768[this->name]  = sum(this->buck == 32768 ? 1 : 0 );
  @sz65536[this->name]  = sum(this->buck == 65536 ? 1 : 0 );
  @sz131072[this->name]  = sum(this->buck == 131072 ? 1 : 0 );
  ts[args[0]->b_edev, args[0]->b_lblkno] = 0;
}

tick-5sec {
  this->delta = (timestamp - start) / NS_S;
  /*
   * This is a hack: it takes advantage of the fact that the realtm value
   * always increases and never decreases. Because there is no aggregating
   * function that stores only the last value, this is the best we can do;
   * we are effectively storing the most recently computed value
   * (a minimal illustration of this idiom follows this script).
   */
  @t["R"]      = max(realtm);
  @t["W"]      = max(realtm);

  /* 
   * Normalizing by 1024 can introduce sampling error when the value being
   * divided is less than 1024. Since there is no floating-point division,
   * the resulting value will be 0; if the data is routinely less than 1024,
   * this can create a serious discontinuity between the real data and the
   * normalized data.
   */
  normalize(@b, NORM_KB);
  normalize(@t, NS_MS);
  normalize(@i, 5); /* adjust based on tick-$value */

  printa(
    "%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
                      @t, @lmin, @lmax, @a, @maxiosz, @i, @b, 
                      @sz512, @sz1024, @sz2048, @sz4096, @sz8192, @sz16384,
                      @sz32768, @sz65536, @sz131072, @ios, @queued, 
                      @aios, @asz, @abus);
  /*
   * Truncate aggregations and re-set timestamp for next sampling period.
   */
  trunc(@lmin); trunc(@lmax); trunc(@a); trunc(@maxiosz); trunc(@i); 
  trunc(@b); trunc(@sz512); trunc(@sz1024); trunc(@sz2048); trunc(@sz4096);
  trunc(@sz8192); trunc(@sz16384); trunc(@sz32768); trunc(@sz65536);
  trunc(@sz131072); trunc(@ios); trunc(@aios); trunc(@asz); trunc(@abus);
  realtm = realtm + (timestamp - start);
  start = timestamp;
}'
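
// A minimal, standalone illustration of the max()-as-latest-value idiom noted
// in the comment above: the tracked value only ever grows, so max() always
// reports the most recently computed value.
dtrace -qn '
  BEGIN { start = timestamp; }
  tick-1sec {
    /* elapsed seconds only ever grow, so max() holds the latest value */
    @latest["runtime.sec"] = max((timestamp - start) / 1000000000);
  }
  tick-10sec { printa("%s: %@d\n", @latest); }'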
aligned-io-16k-devices.d
// Measure how much IO is aligned vs. not aligned to a 16K
// physical device block size.
dtrace -qn '
    long total_bytes, issued, notmult;
    long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;

    BEGIN {
        issued = 0;
        misaligned = 0;
        notmult = 0;
        ltsize = 0;
        ltsizeR = 0;
        ltsizeW = 0;
        total_bytes = 0;
        OFFSET = 32;
        SIZE=16384; /* 16K blocksize */
    }

    fbt:sd:sd_core_iostart:entry {
        issued++;
        bp = (struct buf *) arg2;
        xp = (struct sd_xbuf *)((bp)->b_private);
        misalignedR += bp->b_flags & B_READ
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
        misalignedW += bp->b_flags & B_WRITE
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;    
        misaligned  += (xp->xb_blkno % OFFSET) ? 1 : 0;
        @zeroalign[(bp->b_bcount % SIZE) ? 
                    "NOT MULTIPLE" : "MULTIPLE"] = 
                    quantize(xp->xb_blkno % OFFSET);
        ltsize      += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under 16K */
        ltsizeR     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_READ ? 1 : 0) : 0;
        ltsizeW     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
        notmult     += (bp->b_bcount % SIZE) ? 1 : 0;
        total_bytes += bp->b_bcount;
        @align[(xp->xb_blkno % OFFSET) 
                ? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
                quantize(bp->b_bcount);
    }
    END {
        pct_ltsizeR     = issued > 0 ? 100 * ltsizeR / issued : 0;
        pct_ltsizeW     = issued > 0 ? 100 * ltsizeW / issued : 0;
        pct_ltsize      = issued > 0 ? 100 * ltsize / issued : 0;
        pct_misalign    = issued > 0 ? 100 * misaligned / issued : 0;
        pct_notmult     = issued > 0 ? 100 * notmult / issued : 0;
        printf("=========================================\n");
        printf("================ Summary ================\n");
        printf("=========================================\n");
        printf("Short Reads: %u%% Short Writes: %u%%\n", 
                pct_ltsizeR, pct_ltsizeW);
        printf("Unaligned Reads: %u Unaligned Writes: %u\n",
                misalignedR, misalignedW);
        printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n", 
        total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
        printf("=========================================\n");
        printa(@align);
        printa(@zeroalign);
    }'
aligned-XX-blocksize.d
dtrace -qn '
    /* The first argument ($1) should be a block size, like 8192 for 8K
     * (see the invocation note after this snippet). */
    long total_bytes, issued, notmult;
    long misaligned, misalignedR, misalignedW, ltsize, ltsizeR, ltsizeW;

    BEGIN {
        issued = 0;
        misaligned = 0;
        notmult = 0;
        ltsize = 0;
        ltsizeR = 0;
        ltsizeW = 0;
        total_bytes = 0;
        OFFSET = 32;
        SIZE=$1; /* XX blocksize */
    }

    fbt:sd:sd_core_iostart:entry {
        issued++;
        bp = (struct buf *) arg2;
        xp = (struct sd_xbuf *)((bp)->b_private);
        misalignedR += bp->b_flags & B_READ
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;
        misalignedW += bp->b_flags & B_WRITE
            ? (xp->xb_blkno % OFFSET ? 1 : 0) : 0;    
        misaligned  += (xp->xb_blkno % OFFSET) ? 1 : 0;
        @zeroalign[(bp->b_bcount % SIZE) ? 
                    "NOT MULTIPLE" : "MULTIPLE"] = 
                    quantize(xp->xb_blkno % OFFSET);
        ltsize      += bp->b_bcount < SIZE ? 1 : 0; /* # of IOs under SIZE */
        ltsizeR     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_READ ? 1 : 0) : 0;
        ltsizeW     += (bp->b_bcount < SIZE) 
            ? (bp->b_flags & B_WRITE ? 1 : 0) : 0;
        notmult     += (bp->b_bcount % SIZE) ? 1 : 0;
        total_bytes += bp->b_bcount;
        @align[(xp->xb_blkno % OFFSET) 
                ? "UNALIGNED(bufsize)" : "ALIGNED(bufsize)"] =
                quantize(bp->b_bcount);
    }
    END {
        pct_ltsizeR     = issued > 0 ? 100 * ltsizeR / issued : 0;
        pct_ltsizeW     = issued > 0 ? 100 * ltsizeW / issued : 0;
        pct_ltsize      = issued > 0 ? 100 * ltsize / issued : 0;
        pct_misalign    = issued > 0 ? 100 * misaligned / issued : 0;
        pct_notmult     = issued > 0 ? 100 * notmult / issued : 0;
        printf("=========================================\n");
        printf("================ Summary ================\n");
        printf("=========================================\n");
        printf("Short Reads: %u%% Short Writes: %u%%\n", 
                pct_ltsizeR, pct_ltsizeW);
        printf("Unaligned Reads: %u Unaligned Writes: %u\n",
                misalignedR, misalignedW);
        printf("Bytes Processed: %u\nTotal IOs Issued: %u\nTotal IOs Smaller than %d: %u%%\nNot multiple of %d: %d%%\nUnaligned: %d%%\n", 
        total_bytes, issued, SIZE, pct_ltsize, SIZE, pct_notmult, pct_misalign);
        printf("=========================================\n");
        printa(@align);
        printa(@zeroalign);
    }'
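
// The $1 above is a DTrace macro argument, supplied as a trailing operand
// after the quoted program. The mechanics are shown by this minimal,
// hypothetical example, here checking against an 8K block size:
dtrace -qn 'BEGIN { printf("checking alignment against %d-byte blocks\n", $1); exit(0); }' 8192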

[NFS observation dtrace] Various performance and metric measurements, e.g. IO size, counts, IOPS, and checks of NFSv3/4 #tags: nfs, nfsv3, nfsv4, offset, file offset, random, sequential, dtrace, illumos, iosize, nfs IOPS
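
// The scripts below all use the nfsv3 provider. Assuming the nfsv4 provider
// exposes the same conninfo_t as args[0] on its op-*-done probes, a minimal
// per-client NFSv4 operation counter might look like:
dtrace -qn '
  nfsv4:::op-*-done {
    @ops[args[0]->ci_remote, probename] = count();
  }
  tick-10sec {
    printa("%s,%s,%@d\n", @ops);
    trunc(@ops);
  }'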

nfsv3rwbyfilecsv.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2015 RackTop Systems.
 *
 * nfsv3rwbyfilecsv.d
 * This script traces NFSv3 Read and Write IO, reporting each file touched by
 * name, if known, and kilobytes/sec of IO on that file, averaged over an
 * `interval`-second sample, along with the count of read and write operations
 * done. We cannot tell from this the origin of the request, but we can tell
 * what was accessed and how much work was done.
 *
 * Results should look similar to the following:
 * [timestamp],[command],[path],[rkb/s],[reads],[wkb/s],[writes]
 *
 * 2015 Oct 13 09:49:24,WRITE,/storage/p01/x/.iormstats.sf,0,0,0,0
 * 2015 Oct 13 09:49:24,WRITE,/storage/p01/x/.iormstats.sf,0,0,0,0
 * 2015 Oct 13 09:49:24,WRITE,/storage/p01/x/.iormstats.sf,0,0,0,0
 * 2015 Oct 13 09:49:24,WRITE,/storage/p01/x/.iormstats.sf,0,0,0,0
 */
BEGIN {
  printf("%s,%s,%s,%s,%s,%s,%s\n", "timestamp", "cmd", "pathname",
  "read.kb/sec","read.count", "write.kb/sec", "write.count");
  interval = 5;
  command["op-read-done"] = "READ";
  command["op-write-done"] = "WRITE";
  ts = walltimestamp;
  tick = interval;
}

nfsv3:::op-read-done /ts/ {
  this->bytes = args[2]->res_u.ok.data.data_len;
  @readCnt[ts, command[probename], 
      args[1]->noi_curpath] = count();
  @reads[ts, command[probename], 
      args[1]->noi_curpath] = sum(this->bytes);
}

nfsv3:::op-write-done /ts/ {
  this->bytes = args[2]->res_u.ok.count;
  @writeCnt[ts, command[probename], 
      args[1]->noi_curpath] = count();
  @writes[ts, command[probename], 
      args[1]->noi_curpath] = sum(this->bytes);
}

profile:::tick-1sec, END
/tick-- == 0/ {
  /* Normalize data to report kilobytes/sec for each file. */
  normalize(@reads, 1024 * interval);
  normalize(@writes, 1024 * interval);  
  normalize(@readCnt, interval);
  normalize(@writeCnt, interval);
  printa("%Y,%s,%s,%@d,%@d,%@d,%@d\n", @reads, @readCnt, @writes, @writeCnt);
  trunc(@reads);
  trunc(@readCnt);
  trunc(@writes);
  trunc(@writeCnt);
  tick = interval;
  ts = walltimestamp;
}
nfsv3iobyconncsv.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2015 RackTop Systems.
 *
 * nfsv3iobyconncsv.d
 * This script displays the amount of NFSv3 Read and Write IO, the number of
 * operations performed (including reads, writes, and any metadata
 * operations), the number of reads and writes explicitly, and sizes in KB,
 * by client connection, i.e. by unique IP address.
 */

dtrace:::BEGIN
{
    ts = walltimestamp;
    interval = 5;
    /* printf("Tracing... Interval %d secs.\n", interval); */
    printf("%s,%s,%s,%s,%s,%s\n", "client.ip.addr", "reads/sec", 
    "writes/s", "kbr/s", "kbw/s", "metadata.ops/s");
    tick = interval;
}

/* Metadata Operations not counted towards R/W IOs */
nfsv3:::op-commit-done,
nfsv3:::op-pathconf-done,
nfsv3:::op-fsinfo-done,
nfsv3:::op-fsstat-done,
nfsv3:::op-readdirplus-done,
nfsv3:::op-readdir-done,
nfsv3:::op-link-done,
nfsv3:::op-rename-done,
nfsv3:::op-rmdir-done,
nfsv3:::op-remove-done,
nfsv3:::op-mknod-done,
nfsv3:::op-symlink-done,
nfsv3:::op-mkdir-done,
nfsv3:::op-create-done,
nfsv3:::op-write-done,
nfsv3:::op-read-done,
nfsv3:::op-readlink-done,
nfsv3:::op-access-done,
nfsv3:::op-lookup-done,
nfsv3:::op-setattr-done,
nfsv3:::op-getattr-done,
nfsv3:::op-null-done
/ts/ {
    @ops[ts, args[0]->ci_remote] = count();
    @ops[ts, "ALL_CLIENTS"] = count();
}

nfsv3:::op-read-done
/ts/ {
    @reads[ts, args[0]->ci_remote] = count();
    @readbytes[ts, args[0]->ci_remote] = sum(args[2]->res_u.ok.data.data_len);
    @reads[ts, "ALL_CLIENTS"] = count();
    @readbytes[ts, "ALL_CLIENTS"] = sum(args[2]->res_u.ok.data.data_len);
}

nfsv3:::op-write-done
/ts/ {
    @writes[ts, args[0]->ci_remote] = count();
    @writebytes[ts, args[0]->ci_remote] = sum(args[2]->res_u.ok.count);
    @writes[ts, "ALL_CLIENTS"] = count();
    @writebytes[ts, "ALL_CLIENTS"] = sum(args[2]->res_u.ok.count);
}

profile:::tick-1sec,END
/tick-- == 0/
{
    normalize(@ops, interval);
    normalize(@reads, interval);
    normalize(@writes, interval);
    normalize(@writebytes, 1024 * interval);
    normalize(@readbytes, 1024 * interval);

    printa("%Y,%s,%@d,%@d,%@d,%@d,%@d\n", @reads, @writes,
        @readbytes, @writebytes, @ops);
    trunc(@ops);
    trunc(@reads);
    trunc(@writes);
    trunc(@readbytes);
    trunc(@writebytes);
    tick = interval;
    ts = walltimestamp;
}
nfsv3iobyconn.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2016 RackTop Systems.
 *
 * nfsv3iobyconn.d
 * This script displays the amount of NFSv3 Read and Write IO, the number of
 * operations performed (including reads, writes, and any metadata
 * operations), the number of reads and writes explicitly, and sizes in KB,
 * by client connection, i.e. by unique IP address and for ALL_CLIENTS.
 */
 inline int INTERVAL = 5;

dtrace:::BEGIN
{
    printf("%s,%s,%s,%s,%s,%s,%s\n", "client.ip.addr", "timedelta", 
            "reads/sec", "writes/s", "kbr/s", "kbw/s", "metadata.ops/s");
    tick = INTERVAL;
    start_time = timestamp;
    ts = 0;
}

/* Metadata Operations not counted towards R/W IOs */
nfsv3:::op-commit-done,
nfsv3:::op-pathconf-done,
nfsv3:::op-fsinfo-done,
nfsv3:::op-fsstat-done,
nfsv3:::op-readdirplus-done,
nfsv3:::op-readdir-done,
nfsv3:::op-link-done,
nfsv3:::op-rename-done,
nfsv3:::op-rmdir-done,
nfsv3:::op-remove-done,
nfsv3:::op-mknod-done,
nfsv3:::op-symlink-done,
nfsv3:::op-mkdir-done,
nfsv3:::op-create-done,
nfsv3:::op-write-done,
nfsv3:::op-read-done,
nfsv3:::op-readlink-done,
nfsv3:::op-access-done,
nfsv3:::op-lookup-done,
nfsv3:::op-setattr-done,
nfsv3:::op-getattr-done,
nfsv3:::op-null-done
{
    @ops[args[0]->ci_remote] = count();
    @ops["ALL_CLIENTS"] = count();
    @t[args[0]->ci_remote] = max(ts);
    @t["ALL_CLIENTS"] = max(ts);
}

nfsv3:::op-read-done
{
    @reads[args[0]->ci_remote] = count();
    @readbytes[args[0]->ci_remote] = sum(args[2]->res_u.ok.data.data_len);
    @reads["ALL_CLIENTS"] = count();
    @readbytes["ALL_CLIENTS"] = sum(args[2]->res_u.ok.data.data_len);
    @t[args[0]->ci_remote] = max(ts);
    @t["ALL_CLIENTS"] = max(ts);
}

nfsv3:::op-write-done
{
    @writes[args[0]->ci_remote] = count();
    @writebytes[args[0]->ci_remote] = sum(args[2]->res_u.ok.count);
    @writes["ALL_CLIENTS"] = count();
    @writebytes["ALL_CLIENTS"] = sum(args[2]->res_u.ok.count);
    @t[args[0]->ci_remote] = max(ts);
    @t["ALL_CLIENTS"] = max(ts);
}

profile:::tick-1sec,END
/tick-- == 0/
{
    normalize(@ops, INTERVAL);
    normalize(@reads, INTERVAL);
    normalize(@writes, INTERVAL);
    normalize(@writebytes, 1024 * INTERVAL);
    normalize(@readbytes, 1024 * INTERVAL);

    printa("%s,%@d,%@d,%@d,%@d,%@d,%@d\n", @t, @reads, @writes,
        @readbytes, @writebytes, @ops);
    tick = INTERVAL;
    ts = (timestamp - start_time) / 1000000000;
    trunc(@ops);
    trunc(@reads);
    trunc(@writes);
    trunc(@readbytes);
    trunc(@writebytes);
    trunc(@t);
}
nfsv3cmds.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2015 RackTop Systems.
 * 
 * nfsv3cmds.d
 * This script displays the amount of NFSv3 I/O and the count of every
 * NFSv3 operation, by client connection, i.e. by unique IP address.
 */

dtrace:::BEGIN
{
    interval = 10;
    tick = interval;
    /* All known NFSv3 commands */
    commands["rfs3_commit"] = "COMMIT";
    commands["rfs3_pathconf"] = "PATHCONF";
    commands["rfs3_fsinfo"] = "FSINFO";
    commands["rfs3_fsstat"] = "FSSTAT";
    commands["rfs3_readdirplus"] = "READ_DIR+";
    commands["rfs3_readdir"] = "READ_DIR";
    commands["rfs3_link"] = "LINK";
    commands["rfs3_rename"] = "RENAME";
    commands["rfs3_rmdir"] = "RMDIR";
    commands["rfs3_remove"] = "REMOVE";
    commands["rfs3_mknod"] = "MKNODE";
    commands["rfs3_symlink"] = "SYMLINK";
    commands["rfs3_mkdir"] = "MKDIR";
    commands["rfs3_create"] = "CREATE";
    commands["rfs3_write"] = "WRITE";
    commands["rfs3_read"] = "READ";
    commands["rfs3_readlink"] = "READ_LINK";
    commands["rfs3_access"] = "ACCESS";
    commands["rfs3_lookup"] = "LOOKUP";
    commands["rfs3_setattr"] = "SET_ATTR";
    commands["rfs3_getattr"] = "GET_ATTR";
    commands["rpc_null_v3"] = "NULL";
}

nfsv3:::op-commit-done,
nfsv3:::op-pathconf-done,
nfsv3:::op-fsinfo-done,
nfsv3:::op-fsstat-done,
nfsv3:::op-readdirplus-done,
nfsv3:::op-readdir-done,
nfsv3:::op-link-done,
nfsv3:::op-rename-done,
nfsv3:::op-rmdir-done,
nfsv3:::op-remove-done,
nfsv3:::op-mknod-done,
nfsv3:::op-symlink-done,
nfsv3:::op-mkdir-done,
nfsv3:::op-create-done,
nfsv3:::op-write-done,
nfsv3:::op-read-done,
nfsv3:::op-readlink-done,
nfsv3:::op-access-done,
nfsv3:::op-lookup-done,
nfsv3:::op-setattr-done,
nfsv3:::op-getattr-done,
nfsv3:::op-null-done
{
    @ops[args[0]->ci_remote, commands[probefunc]] = count();
}

nfsv3:::op-read-done
{
    @readbytes[args[0]->ci_remote, 
        args[2]->res_u.ok.count] = sum(args[2]->res_u.ok.data.data_len);
}


nfsv3:::op-write-done
{
    @writebytes[args[0]->ci_remote, 
        args[2]->res_u.ok.count] = sum(args[2]->res_u.ok.count);
}

profile:::tick-1sec
/tick-- == 0/
{
    this->ts = walltimestamp;
    printf("---- Begin IO Byte Counts [%Y] ----\n", this->ts);
    printa("Client[R]: %s IO Size: %d Total bytes: %@d\n", @readbytes);
    printa("Client[W]: %s IO Size: %d Total bytes: %@d\n", @writebytes);
    printf("---- END Byte Counts ----\n\n");
    printf("\n---- Begin Operations [%Y] ----\n", this->ts);
    printa("Client: %s CMD: %s OP.Count: %@d\n", @ops);
    printf("---- END Operations ----\n\n");
    trunc(@readbytes);
    trunc(@writebytes);
    trunc(@ops);
    tick = interval;
}
nfsv3-rw-comp-latency.d
dtrace -qn '
  /*
   * Register when a request is received from client 
   */ 
  nfsv3:::op-read-start,nfsv3:::op-write-start {
    self->x[args[1]->noi_xid] = timestamp;
    self->caddr[args[1]->noi_xid] = args[0]->ci_remote;
  }
  /*
   * Register when client acknowledges completion of operation
   */
  nfsv3:::op-read-done,nfsv3:::op-write-done /self->x[args[1]->noi_xid]/ {
    this->delta = timestamp - self->x[args[1]->noi_xid];
    @[self->caddr[args[1]->noi_xid]] = quantize(this->delta / 1000);
    self->x[args[1]->noi_xid] = 0;
    self->caddr[args[1]->noi_xid] = 0;
  }'

// Same as above, but splits commands into two histograms, one for reads and one for writes.
dtrace -qn '
string op[string];
BEGIN {
  op["op-read-done"] = "R";
  op["op-write-done"] = "W";
}
  /*
   * Register when a request is received from client 
   */ 
  nfsv3:::op-read-start,nfsv3:::op-write-start {
    self->x[args[1]->noi_xid] = timestamp;
    self->caddr[args[1]->noi_xid] = args[0]->ci_remote;
  }
  /*
   * Register when client acknowledges completion of operation
   */
  nfsv3:::op-read-done,nfsv3:::op-write-done /self->x[args[1]->noi_xid]/ {
    this->delta = timestamp - self->x[args[1]->noi_xid];
    @[op[probename], self->caddr[args[1]->noi_xid]] = quantize(this->delta / 1000);
    self->x[args[1]->noi_xid] = 0;
    self->caddr[args[1]->noi_xid] = 0;
  }'
nfsv3-rand-seq-by-iosize.d
dtrace -qCs <( print -r '
/*
 * Script is meant to produce buckets of IO sizes, grouped by whether
 * the IO was Random or Sequential. Four histograms are generated:
 * two for reads and two for writes, one covering sequential IOs and
 * one covering non-sequential IOs.
 */
#include <nfs/nfs.h>
  offset3 last_off[string];
  count3  last_count[string];
  int ok_r, ok_w;
  
  ::rfs3_read:entry /last_count["read"] == 0/ {
    last_off["read"] = args[0]->offset ;
    last_count["read"] = args[0]->count ;
    ok_r = 1;
  }
  ::rfs3_read:entry /ok_r/ {
    this->offset = args[0]->offset ;
    @rd[ last_off["read"] + last_count["read"] == this->offset ?
      "SEQ" : "RAND"] = quantize(args[0]->count);
    /* Update last offset and count with latest value. */
    last_off["read"] = this->offset ;
    last_count["read"] = args[0]->count ;
  }
  ::rfs3_write:entry /last_count["write"] == 0/ {
    last_off["write"] = args[0]->offset ;
    last_count["write"] = args[0]->count ;
    ok_w = 1;
  }
  ::rfs3_write:entry /ok_w/ {
    this->offset = args[0]->offset ;
    @wr[ last_off["write"] + last_count["write"] == this->offset ?
      "SEQ" : "RAND"] = quantize(args[0]->count);
    /* Update last offset and count with latest value. */
    last_off["write"] = this->offset ;
    last_count["write"] = args[0]->count ;
  }
  END {
    printa("\t[%s] Read(bytes): %@d\n", @rd);
    printa("\t[%s] Write(bytes): %@d\n", @wr);
  }'
)
nfsv3-ops-by-cmd.d
/usr/sbin/dtrace -qn'
  #pragma D option aggsize=16m
  #pragma D option bufsize=8m
  #pragma D option dynvarsize=8m

  string typ[string];
  
  BEGIN 
  {
    printf("source,cmd,path,IOs,avg.lat.ns,bytes\n");
    typ["op-access-done"]      = "access";
    typ["op-commit-done"]      = "commit";
    typ["op-create-done"]      = "create";
    typ["op-fsinfo-done"]      = "fsinfo";
    typ["op-fsstat-done"]      = "fsstat";
    typ["op-getattr-done"]     = "getattr";
    typ["op-link-done"]        = "link";
    typ["op-lookup-done"]      = "lookup";
    typ["op-mkdir-done"]       = "mkdir";
    typ["op-mknod-done"]       = "mknod";
    typ["op-null-done"]        = "null";
    typ["op-pathconf-done"]    = "pathconf";
    typ["op-read-done"]        = "read";
    typ["op-readdir-done"]     = "readdir";
    typ["op-readdirplus-done"] = "readdirplus";
    typ["op-readlink-done"]    = "readlink";
    typ["op-remove-done"]      = "remove";
    typ["op-rename-done"]      = "rename";
    typ["op-rmdir-done"]       = "rmdir";
    typ["op-setattr-done"]     = "setattr";
    typ["op-symlink-done"]     = "symlink";
    typ["op-write-done"]       = "write";
  }
   
  nfsv3:::op-access-start,
  nfsv3:::op-commit-start,
  nfsv3:::op-create-start,
  nfsv3:::op-fsinfo-start,
  nfsv3:::op-fsstat-start,
  nfsv3:::op-getattr-start,
  nfsv3:::op-link-start,
  nfsv3:::op-lookup-start,
  nfsv3:::op-mkdir-start,
  nfsv3:::op-mknod-start,
  nfsv3:::op-null-start,
  nfsv3:::op-pathconf-start,
  nfsv3:::op-read-start,
  nfsv3:::op-readdir-start,
  nfsv3:::op-readdirplus-start,
  nfsv3:::op-readlink-start,
  nfsv3:::op-remove-start,
  nfsv3:::op-rename-start,
  nfsv3:::op-rmdir-start,
  nfsv3:::op-setattr-start,
  nfsv3:::op-symlink-start,
  nfsv3:::op-write-start
  {
    self->src_addr = args[0]->ci_remote;
    tm[self->src_addr, args[1]->noi_xid] = timestamp;

    self->have_path = args[1]->noi_curpath == NULL ? 0 : 1;
    
    /* We should make sure we handle empty value as well */
    self->path = args[1]->noi_curpath != "" ? 
              args[1]->noi_curpath  : "NOPATH";
  }

  nfsv3:::op-create-start,
  nfsv3:::op-mkdir-start,
  nfsv3:::op-mknod-start
  / self->have_path && self->path != "NOPATH" && self->path != "<unknown>" /
  {
    self->path = strjoin(strjoin(self->path, "/"), args[2]->where.name)
  }
  
  nfsv3:::op-lookup-start
  / self->have_path && self->path != "NOPATH" && self->path != "<unknown>" /
  {
    self->path = strjoin(strjoin(self->path, "/"), args[2]->what.name)
  }
  
  nfsv3:::op-remove-start,
  nfsv3:::op-rmdir-start
  / self->have_path && self->path != "NOPATH" && self->path != "<unknown>" /
  {
    self->path = strjoin(strjoin(self->path, "/"), args[2]->object.name)
  }
  
  nfsv3:::op-rename-start
  / self->have_path && self->path != "NOPATH" && self->path != "<unknown>" /
  {
    self->path = strjoin(strjoin(self->path, "/"), args[2]->from.name)
  }
  
  nfsv3:::op-symlink-start
  / self->have_path && self->path != "NOPATH" && self->path != "<unknown>" /
  {
    self->path = strjoin(strjoin(self->path, "/"), args[2]->symlink.symlink_data)
  }
  
  nfsv3:::op-read-done,
  nfsv3:::op-write-done
  / self->have_path && tm[self->src_addr, args[1]->noi_xid] /
  {
    self->bytes = args[2]->res_u.ok.count ;

    @ios[self->src_addr, typ[probename], self->path]    = count();
    @lat_us[self->src_addr, typ[probename], self->path] = 
          avg(timestamp - tm[self->src_addr, args[1]->noi_xid]);
    @s[self->src_addr, typ[probename], self->path]      = sum(self->bytes);
    ++data;
  }

  nfsv3:::op-access-done,
  nfsv3:::op-commit-done,
  nfsv3:::op-create-done,
  nfsv3:::op-fsinfo-done,
  nfsv3:::op-fsstat-done,
  nfsv3:::op-getattr-done,
  nfsv3:::op-link-done,
  nfsv3:::op-lookup-done,
  nfsv3:::op-mkdir-done,
  nfsv3:::op-mknod-done,
  nfsv3:::op-null-done,
  nfsv3:::op-pathconf-done,
  nfsv3:::op-readdir-done,
  nfsv3:::op-readdirplus-done,
  nfsv3:::op-readlink-done,
  nfsv3:::op-remove-done,
  nfsv3:::op-rename-done,
  nfsv3:::op-rmdir-done,
  nfsv3:::op-setattr-done,
  nfsv3:::op-symlink-done
  / self->have_path && tm[self->src_addr, args[1]->noi_xid] /
  {
    @ios[self->src_addr, typ[probename], self->path]    = count();
    @lat_us[self->src_addr, typ[probename], self->path] =
      avg(timestamp - tm[self->src_addr, args[1]->noi_xid]);
    @s[self->src_addr, typ[probename], self->path]      = sum(0);
    ++data;
  }
  
  nfsv3:::op-access-done,
  nfsv3:::op-commit-done,
  nfsv3:::op-create-done,
  nfsv3:::op-fsinfo-done,
  nfsv3:::op-fsstat-done,
  nfsv3:::op-getattr-done,
  nfsv3:::op-link-done,
  nfsv3:::op-lookup-done,
  nfsv3:::op-mkdir-done,
  nfsv3:::op-mknod-done,
  nfsv3:::op-null-done,
  nfsv3:::op-pathconf-done,
  nfsv3:::op-read-done,
  nfsv3:::op-readdir-done,
  nfsv3:::op-readdirplus-done,
  nfsv3:::op-readlink-done,
  nfsv3:::op-remove-done,
  nfsv3:::op-rename-done,
  nfsv3:::op-rmdir-done,
  nfsv3:::op-setattr-done,
  nfsv3:::op-symlink-done,
  nfsv3:::op-write-done
  {
    /* Zero out things to make sure we avoid issues with dynvar drops */
    self->bytes = 0;
    self->have_path = 0;
    tm[self->src_addr, args[1]->noi_xid] = 0;
    self->src_addr = 0;
    self->path = 0;
  }

  tick-1sec /data/
  {
    /*
     * Shrink each aggregation to 100 observations first, then print those
     * 100 observations, which will not mean we are limited to 100 unique
     * connection addresses, or 100 unique paths. As it stands, aggregations
     * right now use a tuple for their key, which is a set of three values:
     * client address, command, path. This means that in theory we could have:
     * (client address) x (command) x (path) number of unique keys.
     */
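    /*
     * As a rough worked example: 20 clients x 22 NFSv3 commands x 50 active
     * paths already yields 22,000 possible key tuples, hence the truncation
     * just below.
     */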
    trunc(@ios, 100); trunc(@lat_us, 100); trunc(@s, 100);
    /* normalize(@lat_us, data); */
    
    printa("%s,%s,%s,%@d,%@d,%@d\n", @ios, @lat_us, @s);
    trunc(@ios); trunc(@lat_us); trunc(@s);
    data = 0;
  }'
nfsv3-offsets-by-iosize.d
dtrace -qn '
    BEGIN {
        x["op-read-start"] = "Read" ;
        x["op-write-start"] = "Write" ;
    }
    nfsv3:::op-read-start { self->offs = args[2]->offset ; self->sz = args[2]->count ; }
    nfsv3:::op-write-start { self->offs = args[2]->offset ; self->sz = args[2]->data.data_len ; }

    nfsv3:::op-read-start,nfsv3:::op-write-start
    /self->sz <= 0x1000 && self->offs/ {
        @[x[probename], "4K"] = quantize(self->offs);
        self->offs = 0 ; self->sz = 0 ;
    }
    nfsv3:::op-read-start,nfsv3:::op-write-start 
    /self->sz > 0x1000 && self->sz <= 0x2000 && self->offs/ {
        @[x[probename], "8K"] = quantize(self->offs);
        self->offs = 0 ; self->sz = 0 ;
    }
    nfsv3:::op-read-start,nfsv3:::op-write-start
    /self->sz > 0x2000 && self->sz <= 0x4000 && self->offs/ {
        @[x[probename], "16K"] = quantize(self->offs);
        self->offs = 0 ; self->sz = 0 ;
    }
    nfsv3:::op-read-start,nfsv3:::op-write-start
    /self->sz > 0x4000 && self->sz <= 0x8000 && self->offs/ {
        @[x[probename], "32K"] = quantize(self->offs);
        self->offs = 0 ; self->sz = 0 ;
    }
    nfsv3:::op-read-start,nfsv3:::op-write-start
    /self->sz > 0x8000 && self->offs/ {
        @[x[probename], ">32K"] = quantize(self->offs);
        self->offs = 0 ; self->sz = 0 ;
    }
    END {
    printa("[%s] IOsize(%s): %@d\n", @);
    }'
nfsv3-io-summary-by-ip.d
dtrace -qn '
  long ios_per_sec[string];
  long prev_timestamp[string];
  inline const int NSEC_PER_SEC = 1000000000;
  BEGIN {
      start = timestamp ;
      prev_timestamp["op-read-start"] = start;
      prev_timestamp["op-write-start"] = start;
  }
  nfsv3:::op-read-start {
    self->sz = args[2]->count;
    self->ipaddr = inet_ntoa(&((struct sockaddr_in *)((struct svc_req *)arg0)->rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_addr.S_un.S_addr);
    this->port = ((struct sockaddr_in *)((struct svc_req *)arg0)->rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_port;
  }
  nfsv3:::op-write-start {
    self->sz = args[2]->data.data_len;
    self->ipaddr = inet_ntoa(&((struct sockaddr_in *)((struct svc_req *)arg0)->rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_addr.S_un.S_addr);
    this->port = ((struct sockaddr_in *)((struct svc_req *)arg0)->rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_port;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /self->sz && self->ipaddr != NULL/ {
    this->dir = probename == "op-read-start" ? "READ" : "WRITE" ;
    @bukit[self->ipaddr, this->dir] = quantize(self->sz);
    @tot[self->ipaddr, this->dir]   = sum(self->sz);
    @avsz[self->ipaddr, this->dir]  = avg(self->sz);
    @minsz[self->ipaddr, this->dir] = min(self->sz);
    @maxsz[self->ipaddr, this->dir] = max(self->sz);
    @ct[self->ipaddr, this->dir]    = count();
    self->sz = 0;
    self->ipaddr = 0;
    ioct[this->dir]++ ;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /timestamp >= (prev_timestamp[probename] + 1000000000) / {
    this->now = timestamp ;
    this->delta = this->now - prev_timestamp[probename];
    this->dir = probename == "op-read-start" ? "READ" : "WRITE" ;
    this->perSec = (ios_per_sec[this->dir] * NSEC_PER_SEC) / this->delta ;
    prev_timestamp[probename] = this->now ; /* update timestamp for next run */
    @ios_dist[this->dir, "Second"] = quantize(ioct[this->dir]);
    ioct[this->dir] = 0; /* reset counter for next second */
  }
  END {
    this->stop = timestamp;
    normalize(@tot, 1024*1024);
    normalize(@maxsz, 1024);
    normalize(@avsz, 1024);
    printa("\t   IOsize(b)\t\tClient=%s Op=%s%@d\n", @bukit);
    printf("\n----------------------------------------\n\n");
    printa("           %-5s IOs per %s%@d\n", @ios_dist);
    printf("----------------------------------------\n\n");
    printa("%-15s %s total(MB)=%@d min(b)=%@d max(KB)=%@d avg(KB)=%@d issued=%@d \n", 
           @tot, @minsz, @maxsz, @avsz, @ct);
    printf("--------------------\nTotal Runtime: %d seconds\n",
            (this->stop - start)/ NSEC_PER_SEC);
  }'
nfsv3-file-offsets.d
// NFSv3 Read/Write offsets by type of IO (R/W), quantized.
dtrace -qn '
  BEGIN {
    op["op-read-start"] = "READ Offset(KB)"; 
    op["op-write-start"] = "WRITE Offset(KB)"; 
    printf("Tracing file offsets... Ctrl-C to End\n");
  }
  nfsv3:::op-read-start,
  nfsv3:::op-write-start {
    @[op[probename]] = quantize(args[2]->offset >> 10)
  }
  END {
  printa(@);
  }'
nfsalignment.sh
#!/usr/bin/ksh
#
# show effect of alignment
#
### default variables
opt_blocksize=4096  # blocksize for alignment measurements

### process options
while getopts b:h name
do
    case $name in
        b)  opt_blocksize=$OPTARG ;;
	h)  cat <<END >&2
USAGE: nfsalignment [-b blocksize]
    -b blocksize # alignment blocksize (default=${opt_blocksize})
END
        exit 1
    esac
done

dtrace -qn '
inline int OPT_blocksize    = '$opt_blocksize';
inline int READ_ALIGNED     = 0;
inline int WRITE_ALIGNED    = 1;
inline int READ_UNALIGNED   = 2;
inline int WRITE_UNALIGNED  = 3;

int self align_cnt[int];

dtrace:::BEGIN
{
    printf("tracing... press [ctrl]+C when done\n");
    rw["op-read-start"]     = "R";
    rw["op-write-start"]    = "W"; 
}
nfsv3:::op-read-start,
nfsv3:::op-write-start,
nfsv4:::op-read-start,
nfsv4:::op-write-start
{
    /* Use self-> (thread-local) so these survive until the matching -done probe. */
    self->ts=timestamp;
    self->align=args[2]->offset % OPT_blocksize;
    /* todo: option for offset normalization */
    /* self->align=self->align > 2048 ? 4096 - self->align
        : self->align;
    */
}
nfsv3:::op-read-start,
nfsv4:::op-read-start
{
    this->aligned = args[2]->offset % OPT_blocksize == 0 ? 1 : 0;
    this->unaligned = args[2]->offset % OPT_blocksize != 0 ? 1 : 0;
    self->align_cnt[READ_ALIGNED]     += this->aligned;
    self->align_cnt[READ_UNALIGNED]   += this->unaligned;
}
nfsv3:::op-write-start,
nfsv4:::op-write-start
{
    this->aligned = args[2]->offset % OPT_blocksize == 0 ? 1 : 0;
    this->unaligned = args[2]->offset % OPT_blocksize != 0 ? 1 : 0;
    self->align_cnt[WRITE_ALIGNED]     += this->aligned;
    self->align_cnt[WRITE_UNALIGNED]   += this->unaligned;
}
/* direction of I/O */
nfsv3:::op-read-start,
nfsv4:::op-read-start,
nfsv3:::op-write-start,
nfsv4:::op-write-start
{
    self->rw = rw[probename];
    self->fh = args[1]->noi_curpath;
}
/* version of I/O */
nfsv3:::op-read-start,
nfsv3:::op-write-start
{
    this->vers="3";
}
nfsv4:::op-read-start,
nfsv4:::op-write-start
{
    this->vers="4";
}
nfsv3:::op-read-done,
nfsv3:::op-write-done,
nfsv4:::op-read-done,
nfsv4:::op-write-done
/self->ts && self->rw != NULL/
{
    @t[self->rw, self->vers, self->align, self->fh] =
        avg((timestamp - self->ts)/1000);
    @c[self->rw, self->vers, self->align, self->fh] = count();
    self->ts=0;

    /* Aligned IOs Count */
    @a[self->rw, self->vers] = sum(self->rw == "R" ?
        self->align_cnt[READ_ALIGNED] :
            self->align_cnt[WRITE_ALIGNED]);

    /* Unaligned IOs Count */
    @u[self->rw, self->vers] = sum(self->rw == "R" ?
        self->align_cnt[READ_UNALIGNED] :
            self->align_cnt[WRITE_UNALIGNED]);

    self->align_cnt[READ_ALIGNED]       = 0;
    self->align_cnt[READ_UNALIGNED]     = 0;
    self->align_cnt[WRITE_ALIGNED]      = 0;
    self->align_cnt[WRITE_UNALIGNED]    = 0;
}
dtrace:::END
{
    printf("NFS alignment analysis for blocksize=%d\n", OPT_blocksize);
    printf("Sample end time %Y\n", walltimestamp);
    /* Report on total aligned and unaligned I/Os */
    printa("(%s) Count I/Os Aligned   NFSv%s => %@d\n", @a);
    printa("(%s) Count I/Os Unaligned NFSv%s => %@d\n", @u);
    printf("%8s %8s %20s %10s %20s %20s\n",
        "r/w", "version", "misalignment (bytes)","Count", "Avg_t (us)", "Path");
    printa("%8s %8s %20d %10@d %20@d                 %20s\n", @c, @t);
    
}'
nfs-v3-rw-size-hist.d
dtrace -qn '
  BEGIN {
    x["op-read-start"] = "Read" ;
    x["op-write-start"] = "Write" ;
  }
  nfsv3:::op-read-start { self->sz = args[2]->count }
  nfsv3:::op-write-start { self->sz = args[2]->data.data_len }
  nfsv3:::op-read-start,nfsv3:::op-write-start /self->sz/ {
    @[x[probename]] = quantize(self->sz);
    self->sz = 0;
  }
  END {
    printa("[%s] IOsize(b): %@d\n", @);
  }'
nfs-offset-one-liners.d
// Observe, by file name, the distribution of offsets being seeked within a file.
// This is useful for observing VMs. Offsets are shown in KB from the start of the file.
dtrace -qn 'nfsv3:::op-read-start,nfsv3:::op-write-start {
	@[args[1]->noi_curpath] = quantize(args[2]->offset >> 10)
	}'

// Observe offsets within a file being seeked, broken out by IO size.
// This is useful when a load generation tool is creating a known iosize.
dtrace -qn '
	BEGIN {
	    m["op-read-start"] = "R" ;
	    m["op-write-start"] = "W" ;
	}
    nfsv3:::op-read-start {self->sz = args[2]->count}
    nfsv3:::op-write-start {self->sz = args[2]->data.data_len}
    nfsv3:::op-read-start,nfsv3:::op-write-start /self->sz/ {
        /* @[args[1]->noi_curpath] = quantize(args[2]->offset >> 10); */
        @[m[probename], self->sz] = quantize(args[2]->offset >> 10);
        self->sz = 0;
    }
    END {
    	trunc(@, 5);
    	printa("[%s] IOsize: %d bytes\nOffset Distance(KB):%@d\n", @);
    }'
nfs-jump-dist-1.d
dtrace -qn '
    #pragma D option aggsize=16m
    #pragma D option dynvarsize=38m

    int self lastR[string];
    int self lastW[string];
    
    BEGIN {
            cmd["op-read-start"] = "READ";
            cmd["op-write-start"] = "WRITE";
            cmd["op-read-done"] = "READ";
            cmd["op-write-done"] = "WRITE";
            reads = 0;
            writes = 0;
            printf("timestamp,cmd,path,iosize,avg.offset.kb,min.offset.kb,max.offset.kb,elapsed.us\n");
    }

    tick-1sec {
        ts = walltimestamp;
    }
    /* 
     * The value stored in this->distance is the number of bytes skipped
     * from the end of the previous IO, i.e. the previous offset plus the
     * number of bytes read or written. If the last offset was, say, 2048
     * with a 1KB IO size, the jump distance is measured from (2048 + 1024)
     * to the new offset. Jump distance is therefore not strictly the
     * distance between two consecutive offsets, but the distance from the
     * last actual cursor position left by the previous read or write.
     */
    nfsv3:::op-read-start /ts/ {
        this->len = args[2]->count; /* how much to read */
        this->distance = self->lastR[args[1]->noi_curpath] == NULL ? args[2]->offset : 
            self->lastR[args[1]->noi_curpath] > args[2]->offset ?
            self->lastR[args[1]->noi_curpath] - args[2]->offset :
                -self->lastR[args[1]->noi_curpath] + args[2]->offset;

        /* Preserve current offset, after this->len bytes read */
        self->lastR[args[1]->noi_curpath] = args[2]->offset + this->len;
        
        @avgjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  =     
            avg(this->distance);
        @minjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  =     
            min(this->distance);
        @maxjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  = 
            max(this->distance);
        /* @sdjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  = stddev(this->distance); */
        /* this->len = 0; */
        this->distance = 0;
        /* Preserve the values needed by the -done probe in thread-locals. */
        self->len = this->len;
        self->relt = timestamp;
    }

    nfsv3:::op-write-start /ts/ {
        this->len = args[2]->data.data_len; /* how much to write */
        this->distance = self->lastW[args[1]->noi_curpath] == NULL ? args[2]->offset : 
            self->lastW[args[1]->noi_curpath] > args[2]->offset ?
            self->lastW[args[1]->noi_curpath] - args[2]->offset :
                -self->lastW[args[1]->noi_curpath] + args[2]->offset;

        /* Preserve current offset, after this->len bytes written */
        self->lastW[args[1]->noi_curpath] = args[2]->offset + this->len;
        
        @avgjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  = 
            avg(this->distance);
        @minjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  = 
            min(this->distance);
        @maxjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]  = 
            max(this->distance);
        /* @sdjmp[ts, cmd[probename], args[1]->noi_curpath, this->len]   = stddev(this->distance); */
        /* this->len = 0; */
        this->distance = 0;
        /* Preserve the values needed by the -done probe in thread-locals. */
        self->len = this->len;
        self->relt = timestamp;
    }
    
    nfsv3:::op-read-done,nfsv3:::op-write-done /self->relt/ {
        /* Measure average amount of time from start of request to finish. */
        @avglat[ts, cmd[probename], args[1]->noi_curpath, self->len]  = 
            avg((timestamp - self->relt) / 1000);

        self->relt = 0;
        self->len = 0;
    }

    tick-1sec {
        normalize(@avgjmp, 1024);
        normalize(@minjmp, 1024);
        normalize(@maxjmp, 1024);
        printa("%Y,%s,%s,%d,%@d,%@d,%@d,%@d\n", 
                @avgjmp, @minjmp, @maxjmp, @avglat); 
        trunc(@minjmp); trunc(@maxjmp); 
        trunc(@avgjmp); trunc(@avglat);
        /* trunc(@sdjmp); */
    }'
nfs-how-random.d
dtrace -qn '
  /*
   * Measure how random or sequential NFSv3 IO is, and bucket it by
   * client IP address. It is useful to know which sizes are dominant
   * when IOs have consistent offset distance from previous offset,
   * i.e. io is sequential, as opposed to what sizes have constantly
   * varied IO distances.
   */
  long offset[string, string, string];

  BEGIN {
    map["op-read-start"] = "Read";
    map["op-write-start"] = "Write";
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /!offset[map[probename], args[0]->ci_remote, args[1]->noi_curpath]/ {
    offset[map[probename], args[0]->ci_remote, args[1]->noi_curpath] = args[2]->offset;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /offset[map[probename], args[0]->ci_remote,
  args[1]->noi_curpath] != args[2]->offset/ {
    this->dist = args[2]->offset > offset[map[probename],
    args[0]->ci_remote, args[1]->noi_curpath] ?
    args[2]->offset - offset[map[probename],
    args[0]->ci_remote, args[1]->noi_curpath] :
    offset[map[probename], args[0]->ci_remote,
    args[1]->noi_curpath] - args[2]->offset;
    @h[map[probename], args[0]->ci_remote] = quantize(this->dist);
    offset[map[probename], args[0]->ci_remote,
    args[1]->noi_curpath] =
    args[2]->offset;
  }
  END {printa("%s\nIP: %s%@d\n", @h);}'
nfs-file-seeking-patterns.d
dtrace -qn '
/* This script is useful for observing any system which uses NFS, though it
 * covers NFSv3 only at the moment. The script prints histograms of important
 * metrics, such as seek distances, and whether those seeks happen at fixed
 * offsets, which would normally suggest that the IO is not random in nature.
 * If the distances are consistently small, it is worth checking whether
 * larger record sizes on the datasets can improve performance, in particular
 * if reads are often concentrated in narrow bands within files. The tool is
 * per-file, since comparing offsets between different files would produce
 * meaningless results.
 *
 * We use long for f[fileid3] because the array stores offsets, which are
 * of type offset3, an alias for uint64.
 */
long f[fileid3]; int lastdist[fileid3];

/* Store file ID and use it to key off of. */
:nfssrv:rfs3_read:entry {
  this->fid = (*args[1]).res_u.ok.file_attributes.attr.fileid ; 
}
:nfssrv:rfs3_read:entry / f[this->fid] == NULL / {
  f[this->fid] = args[0]->offset ;
  lastdist[this->fid] = 0 ;
}
:nfssrv:rfs3_read:entry
/ f[this->fid] != NULL && args[0]->offset != f[this->fid] / {
  this->prev_off  = f[this->fid] ;
  this->curr_off  = args[0]->offset ;
  this->gt        = this->curr_off > this->prev_off ;
  this->dist      = this->gt ? (this->curr_off - this->prev_off) :
                    -(this->curr_off - this->prev_off) ;
  /*
   * Uncomment below to troubleshoot, if offsets appear to be wrong.  
   * printf("0x%x 0x%x dist: 0x%x\n", 
   * this->prev_off, this->curr_off, this->dist);
   */
  @seq["READ"]    = quantize(this->dist == lastdist[this->fid]) ;
  @d["READ"]      = quantize(this->dist >> 10) ;
  @gt32["READ"]   = quantize(this->dist > 0x8000) ;
  @gt64["READ"]   = quantize(this->dist > 0x10000) ;
  @gt128["READ"]  = quantize(this->dist > 0x20000) ;
  @gt1m["READ"]   = quantize(this->dist > 0x100000) ;
  /* Update offset in the map and move on */
  f[this->fid]    = args[0]->offset ;
  lastdist[this->fid] = this->dist ;
}

:nfssrv:rfs3_write:entry {
  this->fid = (*args[1]).res_u.ok.file_wcc.after.attr.fileid ; 
}
:nfssrv:rfs3_write:entry / f[this->fid] == NULL / {
  f[this->fid] = args[0]->offset ;
  lastdist[this->fid] = 0 ;
}
:nfssrv:rfs3_write:entry
/ f[this->fid] != NULL && args[0]->offset != f[this->fid] / {
  this->prev_off  = f[this->fid] ;
  this->curr_off  = args[0]->offset ;
  this->gt        = this->curr_off > this->prev_off ;
  this->dist      = this->gt ? (this->curr_off - this->prev_off) :
                    -(this->curr_off - this->prev_off) ;
  /*
   * Uncomment below to troubleshoot, if offsets appear to be wrong.  
   * printf("0x%x 0x%x dist: 0x%x\n", 
   * this->prev_off, this->curr_off, this->dist);
   */
  @seq["WRITE"]   = quantize(this->dist == lastdist[this->fid]) ;
  @d["WRITE"]     = quantize(this->dist >> 10) ;
  @gt32["WRITE"]  = quantize(this->dist > 0x8000) ;
  @gt64["WRITE"]  = quantize(this->dist > 0x10000) ;
  @gt128["WRITE"] = quantize(this->dist > 0x20000) ;
  @gt1m["WRITE"]  = quantize(this->dist > 0x100000) ;
  /* Update offset in the map and move on */
  f[this->fid]    = args[0]->offset ;
  lastdist[this->fid] = this->dist ;
}
END {
  printa("[%s] Direction:\nSeq == 1 | Random == 0\n%@d\n", @seq) ;
  printa("[%s] Seek Dist (KB): %@d\n", @d) ;
  printa("[%s] Seek >  32K:\nYes == 1 | No == 0\n%@d\n", @gt32) ;
  printa("[%s] Seek >  64K:\nYes == 1 | No == 0\n%@d\n", @gt64) ;
  printa("[%s] Seek > 128K:\nYes == 1 | No == 0\n%@d\n", @gt128) ;
  printa("[%s] Seek >   1M:\nYes == 1 | No == 0\n%@d\n", @gt1m) ;
}'
nfs-details-wip.d
dtrace -qn '
BEGIN {
    /* Results are printed every 15 seconds; see the tick-15sec clause below. */
    /* How stable data is expected to be. */
    stable_how[0] = "NOSYNC";
    stable_how[1] = "DSYNC";
    stable_how[2] = "FSYNC";
    ts = walltimestamp;
}
nfsv3:::op-read-start, 
nfsv3:::op-write-start,
nfsv4:::op-read-start {
    tm[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
    opsz[args[0]->ci_remote, args[1]->noi_xid] = args[2]->count;
    flag[args[0]->ci_remote, args[1]->noi_xid] = 0;
    self->nfs=1;
}
/* sdt:::arc-miss gives info on direct users of the arc 
 *  if arc-miss, then we do an I/O */

sdt:::arc-miss /self->nfs/ { self->io = 1; self->nfs = 0 ;}

nfsv3:::op-write-start {
    flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
}
nfsv4:::op-write-start {
    flag[args[0]->ci_remote,args[1]->noi_xid] = args[2]->stable;
    tm[args[0]->ci_remote,args[1]->noi_xid] = timestamp;
    opsz[args[0]->ci_remote,args[1]->noi_xid] = args[2]->data_len;
}
nfsv3:::op-read-done, 
nfsv3:::op-write-done, 
nfsv4:::op-read-done, 
nfsv4:::op-write-done
/tm[args[0]->ci_remote,args[1]->noi_xid]/
{
        /* Total time from start of operation to end of same operation. */
        this->delta = (timestamp - tm[args[0]->ci_remote, 
                        args[1]->noi_xid]) / 1000;
        /* Flag indicates volatility mode for this operation, i.e.
         * `Unstable`, `Data Sync`, or `File Sync`.
         */
        this->flag =  flag[args[0]->ci_remote,args[1]->noi_xid];
        this->file =  args[1]->noi_curpath;

        this->type =  probename == "op-write-done" ? "W" : "R";
        /*   self->io ? "uncached" : "cached"  */
        this->type_cache = (probename == "op-write-done") ? 
                            "W": self->io ? "R" : "CR" ;

    /* ipaddr=ip[args[1]->noi_xid]; */
        this->ipaddr = inet_ntoa(&((struct sockaddr_in *)((struct svc_req *)arg0)->
            rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_addr.S_un.S_addr);
        this->port = ((struct sockaddr_in *)((struct svc_req *)arg0)->
            rq_xprt->xp_xpc.xpc_rtaddr.buf)->sin_port;

        
        @nfs_fio[ts, this->type, this->file, this->ipaddr, this->port] = 
            sum(opsz[args[0]->ci_remote, args[1]->noi_xid]);
        
        /* 
        @nfs_fir["R", this->file, this->ipaddr, this->port] = 
                sum((this->type == "R" ? 
                    opsz[args[0]->ci_remote, args[1]->noi_xid] : 0));
        
        @nfs_fiw["W", this->file, this->ipaddr, this->port] = 
                sum((this->type == "W" ? 
                    opsz[args[0]->ci_remote,args[1]->noi_xid] : 0));
        */
        
        /* store size along with max time so they can be correlated */
        /* this->overload = ( (this->delta) * (1000*1000*1000) +
                            opsz[args[0]->ci_remote,args[1]->noi_xid]);
        @nfs_mx[this->flag,"R"]=max( (this->type == "R" ? this->overload : 0));
        @nfs_mx[this->flag,"W"]=max( (this->type == "W" ? this->overload : 0));
        */
        /* Total Completion latency by type of IO and stability flag. */
        @nfs_qtclat[stable_how[this->flag], 
                this->type] = quantize(this->delta);

        this->size = opsz[args[0]->ci_remote,args[1]->noi_xid];
        this->buck = 512; /* We assume this is smallest possible size */
        this->buck = ( this->size > 512 ) ? 1024 : this->buck ;
        this->buck = ( this->size > 1024 ) ? 2048 : this->buck ;
        this->buck = ( this->size > 2048 ) ? 4096 : this->buck ;
        this->buck = ( this->size > 4096 ) ? 8192 : this->buck ;
        this->buck = ( this->size > 8192 ) ?  16384 : this->buck ;
        this->buck = ( this->size > 16384 ) ?  32768 : this->buck ;
        this->buck = ( this->size > 32768 ) ?  65536 : this->buck ;
        this->buck = ( this->size > 65536 ) ?  131072 : this->buck ;
        this->buck = ( this->size > 131072 ) ?  262144 : this->buck ;
        this->buck = ( this->size > 262144 ) ?  524288 : this->buck ;
        this->buck = ( this->size > 524288 ) ?  1048576 : this->buck ;
        /* this->type is a string, representing a Read, Write or Cache */
        @nfs_qtmsz[this->buck, this->type] = quantize(this->delta);
        @nfs_avtmsz[this->buck, this->type] = avg(this->delta);
        @nfs_cttmsz[this->buck, this->type] = count();

        @nfs_ctm[stable_how[this->flag], this->type_cache] = sum(this->delta); 
        @nfs_ct[stable_how[this->flag], this->type_cache] = count();
        @nfs_sz[stable_how[this->flag], this->type_cache] = sum(this->size);

        tm[args[0]->ci_remote,args[1]->noi_xid] = 0;
        opsz[args[0]->ci_remote,args[1]->noi_xid] = 0;
        flag[args[0]->ci_remote,args[1]->noi_xid]=0;
        self->io = 0;
}
tick-15sec {
    normalize(@nfs_sz, 1024);
    printf("\t---- NFS Total Size of Data by IO Type ----\n");
    printf("\tPossible Types: Cached Read=>CR|Read=>R|Write=>W\n");
    printa("\tStability: %-10s IO Type: %-2s KBytes: %@d\n", @nfs_sz);
    printf("\t---- End NFS Total Size of Data by IO Type ----\n\n");

    printf("\t---- NFS Total Time by IO Type ----\n");
    printf("\tPossible Types: Cached Read=>CR|Read=>R|Write=>W\n");
    printa("\tStability: %-10s IO Type: %-2s Time(uS): %@d\n", @nfs_ctm);
    printf("\t---- End NFS Total Time by IO Type ----\n\n");

    printf("\t---- NFS Data IOs Total by IO Type ----\n");
    printf("\tPossible Types: Cached Read=>CR|Read=>R|Write=>W\n");
    printa("\tStability: %-10s IO Type: %-2s Count: %@d\n", @nfs_ct);
    printf("\t---- End NFS Data IOs Time by IO Type ----\n\n");
    
    printf("\t---- NFS Normalized Average Time ----\n");
    printa("\tBucket(bytes): %-7d IO Type: %-2s Latency(uS): %@d\n",
    @nfs_avtmsz);
    printf("\t---- End NFS Normalized Average Time ----\n\n");

    printf("\t---- Completion Latency by Size and IO Type ----\n");
    printa("\t[%d] bytes\n\tIO Type: %-2s\n\tLatency(uS): %@d\n", 
          @nfs_qtmsz);
    printf("\t---- End Completion Latency by Size and IO Type ----\n\n");

    printf("\t---- Completion Latency Overall ----\n");
    printa("\tStability: %-10s\n\tIO Type: %-2s\n\tLatency(uS): %@d\n", 
          @nfs_qtclat);
    printf("\t---- End Latency Overall ----\n\n");

    printf("\t---- Count of IOs by Size and IO Type ----\n");
    printa("\tBucket(bytes): %-10d IO Type: %-2s %-8@d\n", @nfs_cttmsz);
    printf("\t---- End Count of IOs by Size and IO Type ----\n\n");

    printf("Timestamp, IO Type, File Path, Client IP Address, Port, Size(bytes)\n");
    printa("%-20Y %-2s %s %s %d %@d\n", @nfs_fio);

    trunc(@nfs_sz);
    trunc(@nfs_avtmsz);
    trunc(@nfs_qtmsz);
    trunc(@nfs_ctm);
    trunc(@nfs_ct);
    trunc(@nfs_cttmsz);
    trunc(@nfs_qtclat);
    trunc(@nfs_fio);
    ts = walltimestamp;
}'
nfs-count-seq-random.d
dtrace -qn '
  /*
   * Measure how random or sequential NFSv3 IO is, and bucket it by
   * client IP address. It is useful to know which sizes are dominant
   * when IOs have consistent offset distance from previous offset,
   * i.e. io is sequential, as opposed to what sizes have constantly
   * varied IO distances.
   * Results should look something like this, where a count of both
   * is given for duration of this sample period.
   * CMD: Read IP: 172.16.1.34 RAND: 256 SEQ: 228
   * CMD: Write IP: 172.16.1.2 RAND: 1379 SEQ: 0
   * CMD: Write IP: 172.16.1.34 RAND: 1683 SEQ: 3
   */
  int lastdist[string, string, string];
  int lastpos[string, string, string];

  BEGIN {
    map["op-read-start"] = "Read";
    map["op-write-start"] = "Write";
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /!lastdist[map[probename], args[0]->ci_remote, args[1]->noi_curpath]/ {
    lastdist[map[probename], args[0]->ci_remote, args[1]->noi_curpath] = 0;
    lastpos[map[probename], args[0]->ci_remote, args[1]->noi_curpath] = 0;
  }
  nfsv3:::op-read-start,nfsv3:::op-write-start
  /lastpos[map[probename], args[0]->ci_remote, 
  args[1]->noi_curpath] != args[2]->offset/ {
    this->dist = args[2]->offset > lastpos[map[probename],
    args[0]->ci_remote, args[1]->noi_curpath] ?
    args[2]->offset - lastpos[map[probename],
    args[0]->ci_remote, args[1]->noi_curpath] :
    lastpos[map[probename], args[0]->ci_remote,
    args[1]->noi_curpath] - args[2]->offset;
    /* Determine whether this jump distance matches the previous one, i.e. the IO is sequential. */
    this->seq = this->dist == 
    lastdist[map[probename], args[0]->ci_remote, args[1]->noi_curpath] ? 1 : 0;
    @rand[map[probename], args[0]->ci_remote] =
    sum(this->seq == 0 ? 1 : 0);
    @seq[map[probename], args[0]->ci_remote] =
    sum(this->seq == 1 ? 1 : 0);
    lastpos[map[probename], args[0]->ci_remote, args[1]->noi_curpath] =
    args[2]->offset;
    lastdist[map[probename], args[0]->ci_remote, args[1]->noi_curpath] = 
    this->dist;
  }
  END {printa("CMD: %s IP: %s RAND: %@d SEQ: %@d\n", @rand, @seq);}'
nfs-basic-sanity-check.d
  dtrace -qn '
  int self ts[string, int];

    nfsv3:::op-read-start, nfsv3:::op-write-start, nfsv3:::op-commit-start {
      self->ts[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
    }
    nfsv3:::op-read-done, nfsv3:::op-write-done, nfsv3:::op-commit-done
    /self->ts[args[0]->ci_remote, args[1]->noi_xid] / {
    @t[probefunc] = quantize((timestamp - 
            self->ts[args[0]->ci_remote, args[1]->noi_xid]) / 1000);
    self->ts[args[0]->ci_remote, args[1]->noi_xid] = 0;
    }
    tick-10sec {
      printa("%s\n%@d\n", @t);
      trunc(@t);
    }'
exhausted-threads.d
dtrace -qn '
    BEGIN {
        printf("timestamp,max.pend.reqs,max.actv.threads,percent.pool.free\n");
    }
    ::svc_xprt_qput:entry {
        enq = 1;
        this->p_reqs = args[0]->p_reqs; 
        /* ++this->p_reqs; This may be bumped on entry 
                        into routine, but after the entry probe is triggered. 
                        Further validation of this is necessary. 
        */
        this->pct_used = (args[0]->p_threads - args[0]->p_asleep) > 0 
        ? 100 * (args[0]->p_threads - args[0]->p_asleep) / args[0]->p_maxthreads
        : 1;
        this->pct_free = this->pct_used < 100 ? 100 - this->pct_used : 0;
        @p_reqs     = max(this->p_reqs);
        @actv_thr   = max(args[0]->p_threads - args[0]->p_asleep);
        @pct_free   = min(this->pct_free);
    }

    tick-5sec /enq/ {
        printf("%Y,", walltimestamp);
        printa("%@d,%@d,%@d\n", @p_reqs, @actv_thr, @pct_free);
        trunc(@p_reqs); 
        trunc(@actv_thr); 
        trunc(@pct_free);
        enq = 0;
    }'
dtrace-nfs-variable-sanity-check.d
dtrace -qn '
  self int x[int]; 
  int cnt1; 
  int cnt2;

  BEGIN {
    /* Should not be needed, but just in case, extra-careful. */
    cnt1 = 0;
    cnt2 = 0;
  }
  
  nfsv3:::op-read-start {
    self->x[tid] = 1;
    ts[args[0]->ci_remote, args[1]->noi_xid] = timestamp;
  }
  nfsv3:::op-read-done /self->x[tid] > 0/ {
    ++cnt1; 
    self->x[tid] = 0;
  }
  nfsv3:::op-read-done /ts[args[0]->ci_remote, args[1]->noi_xid] > 0/ {
    ++cnt2;
    ts[args[0]->ci_remote, args[1]->noi_xid] = 0;
  }
  tick-10sec {printf("%d ?= %d\n", cnt1, cnt2); exit(0);}
 '
nfsv3-rw-lat-iosize.d
dtrace -qn '
  char m[string];
  BEGIN {
    m["rfs3_read"]  = 0x52;
    m["rfs3_write"] = 0x57;
  }
  ::rfs3_read:op-read-start {
    self->t = timestamp; self->arg2r = args[2];  self->a = args[0]->ci_remote;
    @qiosz[m[probefunc], args[0]->ci_remote] = quantize(args[2]->count);
  }
  ::rfs3_write:op-write-start {
    self->t = timestamp; self->arg2w = args[2]; self->a = args[0]->ci_remote;
    @qiosz[m[probefunc], args[0]->ci_remote] = quantize(args[2]->count);
  }
  ::rfs3_read:op-read-done /self->t/ {
    this->delta = (timestamp - self->t) / 1000;
    this->b = self->arg2r->count;
    @q[m[probefunc], self->a] = quantize(this->delta);
    @av[m[probefunc], self->a] = avg(this->delta);
    @avb[m[probefunc], self->a] = avg(this->b);
    self->a = 0; self->t = 0; self->arg2r = 0;
  }
  ::rfs3_write:op-write-done /self->t/ {
    this->delta = (timestamp - self->t) / 1000;
    this->b = self->arg2w->data.data_len;
    @q[m[probefunc], self->a] = quantize(this->delta);
    @av[m[probefunc], self->a]  = avg(this->delta);
    @avb[m[probefunc], self->a] = avg(this->b);
    self->a = 0; self->t = 0; self->arg2w = 0;
  }
  END {
    printa("\t   %c - IO size(b)  client: %s avg: %@d(b)%@d\n", @av, @qiosz);
    printa("\t   %c - latency(μS) client: %s avg: %@d(μS)%@d\n", @av, @q);
  }'

d Dtrace snippets for probing the ZFS ZIO pipeline

Write-throttle and other observability snippets built around the OpenZFS ZIO pipeline. Adam explains the new OpenZFS write throttle in detail in this post: http://blog.delphix.com/ahl/2014/tuning-openzfs-write-throttle/. These snippets borrow from that work.
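
Before running the snippets below, it may be worth confirming that the fbt/sdt probes they rely on actually exist on your kernel build, since these kernel function names are not a stable interface. A minimal listing check (a sketch, assuming an illumos-derived kernel) looks like this:

dtrace -ln 'fbt::zio_execute:entry' -n 'fbt::vdev_queue_io_add:entry' -n 'sdt:::txg-syncing'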

active-zio-ios-by-pool-q.d
#!/usr/sbin/dtrace -qCs
#define UPPER_THRESHOLD 100

/* See zio.h line 147 or thereabout */
enum zio_priority {
    ZIO_PRIORITY_SYNC_READ,
    ZIO_PRIORITY_SYNC_WRITE,   /* ZIL */
    ZIO_PRIORITY_ASYNC_READ,   /* prefetch */
    ZIO_PRIORITY_ASYNC_WRITE,  /* spa_sync() */
    ZIO_PRIORITY_SCRUB,        /* asynchronous scrub/resilver reads */
    ZIO_PRIORITY_NUM_QUEUEABLE,
    ZIO_PRIORITY_NOW           /* non-queued i/os (e.g. free) */
};

BEGIN {
    qnames[ZIO_PRIORITY_SYNC_READ]      = "Sync Read";
    qnames[ZIO_PRIORITY_SYNC_WRITE]     = "Sync Write";
    qnames[ZIO_PRIORITY_ASYNC_READ]     = "Async Read";
    qnames[ZIO_PRIORITY_ASYNC_WRITE]    = "Async Write";
    qnames[ZIO_PRIORITY_SCRUB]          = "Scrub";
    qnames[ZIO_PRIORITY_NUM_QUEUEABLE]  = "Not Queueable";
    qnames[ZIO_PRIORITY_NOW]            = "Not Yet Queued";
}

::vdev_queue_io_add:entry {
    this->pri = args[1]->io_priority; /* one of zio_priority above */
    this->spa = args[1]->io_spa;
    /*
     * For queue to which this IO is being queued,
     * how many IOs are already active, grouped-by pool?
     */
    @lq[stringof(this->spa->spa_name), qnames[this->pri]] =
        lquantize(args[0]->vq_class[this->pri].vqc_active, 0, UPPER_THRESHOLD);
}

END {
    printa("\t   %s | [%s]%@d\n", @lq);
}
active-zio-ios-q.d
dtrace -qn '
    /* See zio.h line 147 or thereabout */
    enum zio_priority {
    	ZIO_PRIORITY_SYNC_READ,
    	ZIO_PRIORITY_SYNC_WRITE,   /* ZIL */
    	ZIO_PRIORITY_ASYNC_READ,   /* prefetch */
    	ZIO_PRIORITY_ASYNC_WRITE,  /* spa_sync() */
    	ZIO_PRIORITY_SCRUB,        /* asynchronous scrub/resilver reads */
    	ZIO_PRIORITY_NUM_QUEUEABLE,

    	ZIO_PRIORITY_NOW       /* non-queued i/os (e.g. free) */
};

BEGIN {
        qnames[ZIO_PRIORITY_SYNC_READ]      = "Sync Read";
        qnames[ZIO_PRIORITY_SYNC_WRITE]     = "Sync Write";
        qnames[ZIO_PRIORITY_ASYNC_READ]     = "Async Read";
        qnames[ZIO_PRIORITY_ASYNC_WRITE]    = "Async Write";
        qnames[ZIO_PRIORITY_SCRUB]          = "Scrub";
        qnames[ZIO_PRIORITY_NUM_QUEUEABLE]  = "Not Queueable";
        qnames[ZIO_PRIORITY_NOW]            = "Not Yet Queued";
}

::vdev_queue_io_add:entry {
    this->pri = args[1]->io_priority; /* one of zio_priority above */
    /* 
     * For queue to which this IO is being queued, 
     * how many IOs are already active? 
     */
    @lq[qnames[this->pri]] = lquantize(args[0]->vq_class[this->pri].vqc_active, 0, 100);
}'
delay-dirty-data-by-pool.d
dtrace -qn '
/* Measure amount of dirty data and whether or not IO is going to be
 * delayed by some fraction of time.
 * Adjusting amount of allowed dirty data before throttle kicks in
 * will likely increase latency, and would result in pushing disks harder
 * which potentially is going to increase throughput.
 * To adjust, tune zfs_dirty_data_max, which is the upper bound for how
 * much dirty data is permitted.
 * # echo zfs_dirty_data_max/Z0x400000000|mdb -kw
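 * To check the current value before changing it (a sketch):
 * # echo zfs_dirty_data_max/E|mdb -k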
 */
  ::dsl_pool_need_dirty_delay:entry {
    self->spa   = args[0]->dp_spa ;
    self->dirty = args[0]->dp_dirty_total >> 20 ; /* In Megabytes */
  }
  ::dsl_pool_need_dirty_delay:return /self->spa/ {
    this->res = args[1] > 0 ? "Yes" : "No" ;
    @[stringof(self->spa->spa_name), this->res] = 
      quantize(self->dirty) ;
      self->spa = 0 ;
      self->dirty = 0;
  }
  END { printa("           [%s] | Throttle [%s] %@d\n", @) ; }
  '
dirty-data-oneliners.d
// Print name of pool and amount of dirty data for that pool.
dtrace -qn '::dsl_pool_need_dirty_delay:entry {@[args[0]->dp_spa->spa_name] = quantize(args[0]->dp_dirty_total >> 20);}'

dtrace -qn '
  ::dsl_pool_need_dirty_delay:entry /args[0]->dp_spa->spa_name != "bp"/ {
    @av[args[0]->dp_spa->spa_name] = avg(args[0]->dp_dirty_total >> 20);
    @m[args[0]->dp_spa->spa_name] = max(args[0]->dp_dirty_total >> 20);
  }
  tick-5sec {
    printa("%s,%@dMB,%@dMB\n", @av,@m); trunc(@av); trunc(@m);
  } 
  END {
    printa("%s,%@dMB,%@dMB\n", @av,@m);
  }'
dirty-delay-detail.d
dtrace -qn '
  int limit, ct, slowct, xslowct ;
  inline int LIMIT = 1000000 ; /* limit in nanoseconds */

  ::dmu_tx_wait:entry {
  	self->spa       = args[0]->tx_pool->dp_spa ;
    self->txbirth   = args[0]->tx_start ;
  }
  ::dmu_tx_wait:return /self->txbirth/ {
    
    this->delta     = timestamp - self->txbirth ;
    this->slow      = this->delta >= LIMIT ;
    this->xslow     = this->delta >= LIMIT*10 ;
  	this->sn        = (char *)self->spa->spa_name ;
    @hist[stringof(this->sn)]       = quantize(this->delta) ;
    @avdelay[stringof(this->sn)]    = avg(this->delta) ;
    @maxdelay[stringof(this->sn)]   = max(this->delta) ;
    @sddelay[stringof(this->sn)]    = stddev(this->delta) ;

    ++ct ;
    slowct  += this->slow ? 1 : 0 ;
    xslowct += this->xslow ? 1 : 0 ;
    self->txbirth   = 0 ; self->spa = 0 ;
  }
  END /ct > 0/ {
    printf("           %% TX(s) delayed by <= %d(us): %d\n", 
            LIMIT/1000, 100 - (100*slowct)/ct) ;
    printf("           %% TX(s) delayed by >= %d(ms): %d\n", 
            LIMIT/100000, (100*xslowct)/ct) ;
    printa("           [%s] Delay [Mean|Max|SD](ns): %@d|%@d|%@d %@d",
            @avdelay, @maxdelay, @sddelay, @hist) ;
  }'
dirty-delay-throttle-hist.d
dtrace -qn '
  ::dmu_tx_wait:entry {
  	self->spa = args[0]->tx_pool->dp_spa ;
    self->s = timestamp ;
    self->wasdelayed = args[0]->tx_waited ;
  }
  ::dmu_tx_wait:return /self->s/ {
  	this->unlucky = self->wasdelayed > 0 ? "Yes" : "No" ;
  	this->sn = (char *)self->spa->spa_name ;
    @[stringof(this->sn), this->unlucky] = quantize(timestamp - self->s) ;
    self->s = 0 ; self->spa = 0 ; self->wasdelayed = 0 ;
  }
  END { printa("           [%s] | TX with >1 Delay [%s] %@d\n", @) ; }
  '
observe-zio-pipeline.d
#!/usr/sbin/dtrace -s -C
#pragma D option quiet
/*
 * See /usr/src/uts/common/fs/zfs/sys/zio_impl.h
 * for more details about this enum.
 */
enum zio_stage {
	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W--- */
	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F-- */
	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* RWF-- */
	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W--- */
	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 6,	/* -W--- */
	ZIO_STAGE_NOP_WRITE		= 1 << 7,	/* -W--- */
	ZIO_STAGE_DDT_READ_START	= 1 << 8,	/* R---- */
	ZIO_STAGE_DDT_READ_DONE		= 1 << 9,	/* R---- */
	ZIO_STAGE_DDT_WRITE		= 1 << 10,	/* -W--- */
	ZIO_STAGE_DDT_FREE		= 1 << 11,	/* --F-- */
	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 12,	/* RWFC- */
	ZIO_STAGE_GANG_ISSUE		= 1 << 13,	/* RWFC- */
	ZIO_STAGE_DVA_THROTTLE		= 1 << 14,	/* -W--- */
	ZIO_STAGE_DVA_ALLOCATE		= 1 << 15,	/* -W--- */
	ZIO_STAGE_DVA_FREE		= 1 << 16,	/* --F-- */
	ZIO_STAGE_DVA_CLAIM		= 1 << 17,	/* ---C- */
	ZIO_STAGE_READY			= 1 << 18,	/* RWFCI */
	ZIO_STAGE_VDEV_IO_START		= 1 << 19,	/* RW--I */
	ZIO_STAGE_VDEV_IO_DONE		= 1 << 20,	/* RW--I */
	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 21,	/* RW--I */
	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 22,	/* R---- */
	ZIO_STAGE_DONE			= 1 << 23	/* RWFCI */
};
  BEGIN {
    s[NULL] = "null";
    s[ZIO_STAGE_OPEN] = "open";
    s[ZIO_STAGE_READ_BP_INIT] = "read bp init";
    s[ZIO_STAGE_WRITE_BP_INIT] = "write bp init";
    s[ZIO_STAGE_FREE_BP_INIT] = "free bp init";
    s[ZIO_STAGE_ISSUE_ASYNC] = "issue async";
    s[ZIO_STAGE_WRITE_COMPRESS] = "write compress";
    s[ZIO_STAGE_CHECKSUM_GENERATE] = "checksum generate";
    s[ZIO_STAGE_NOP_WRITE] = "nop write";
    s[ZIO_STAGE_DDT_READ_START] = "ddt read start";
    s[ZIO_STAGE_DDT_READ_DONE] = "ddt read done";
    s[ZIO_STAGE_DDT_WRITE] = "ddt write";
    s[ZIO_STAGE_DDT_FREE] = "ddt free";
    s[ZIO_STAGE_GANG_ASSEMBLE] = "gang assemble";
    s[ZIO_STAGE_GANG_ISSUE] = "gang issue";
    s[ZIO_STAGE_DVA_THROTTLE] = "dva throttle";
    s[ZIO_STAGE_DVA_ALLOCATE] = "dva alloc";
    s[ZIO_STAGE_DVA_FREE] = "dva free";
    s[ZIO_STAGE_DVA_CLAIM] = "dva claim";
    s[ZIO_STAGE_READY] = "ready";
    s[ZIO_STAGE_VDEV_IO_START] = "vdev io start";
    s[ZIO_STAGE_VDEV_IO_DONE] = "vdev io done";
    s[ZIO_STAGE_VDEV_IO_ASSESS] = "vdev io assess";
    s[ZIO_STAGE_CHECKSUM_VERIFY] = "chksum verify";
    s[ZIO_STAGE_DONE] = "done";
  }
  ::zio_execute:entry {
    /* Save the zio in a thread-local so it is visible in the return probe. */
    self->zio = args[0];
  }
  ::zio_execute:return /self->zio/ {
    this->w = self->zio->io_waiter != NULL ? 1 : 0;
    this->e = self->zio->io_executor != NULL ? 1 : 0;
    this->st = self->zio->io_stage;
    this->vd = self->zio->io_vd != NULL ? 1 : 0;
    @[this->w, this->e, this->vd, s[this->st]] = count();
    self->zio = 0;
  }
zfsdirtyusedcsv.d
dtrace -qn '
    BEGIN {
        /* 
        This is the limit for dirty data before throttle is engaged.
        Dirty bytes above this limit will cause throttle mechanism to 
        start slowly and gradually increase as the amount of dirty data
        increases. 
        */
        dirty_no_throttle_limit = 
            (`zfs_delay_min_dirty_percent * `zfs_dirty_data_max) / 100;
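        /*
         * Worked example (an assumption, using zfs_delay_min_dirty_percent=60
         * and zfs_dirty_data_max=4 GiB): the throttle column below becomes 1
         * once dp_dirty_total exceeds (60 * 4 GiB) / 100, i.e. roughly 2.4 GiB
         * of dirty data.
         */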
        printf("timestamp,pool,mb.dirty,max.mb.dirty,pct.dirty.used,throttle\n");
}
    ::txg_sync_thread:txg-syncing {
            this->dp = (dsl_pool_t *)arg0;
            this->spa = (char *)this->dp->dp_spa->spa_name;
    }

    ::txg_sync_thread:txg-syncing 
    /this->dp->dp_spa->spa_name != NULL && 
    stringof(this->dp->dp_spa->spa_name) != "bp"/ {
            this->thr = 
                (this->dp->dp_dirty_total >= dirty_no_throttle_limit) ? 1 : 0;
            this->dd_actual_mb = this->dp->dp_dirty_total >> 20;
            this->dd_limit_mb = `zfs_dirty_data_max >> 20;
            this->dd_pct = (100 * this->dd_actual_mb) / this->dd_limit_mb;
            
            printf("%Y,%s,%d,%d,%d,%d\n", 
                                        walltimestamp,
                                        stringof(this->spa),
                                        this->dd_actual_mb,
                                        this->dd_limit_mb,
                                        this->dd_pct,
                                        this->thr);
    }'
zio-execute-latency.d
dtrace -qn '
  #pragma D option dynvarsize=8M
  ::zio_execute:entry {
    self->ziop = args[0];
    self->t = timestamp;
  }
  ::zio_execute:return /self->t/ {
    @ = quantize(timestamp - self->t);
    self->t = 0;
    self->ziop = 0;
  }'

d [ARC tuning observations] Dtrace snippets and scripts for observing ARC behavior. #tags:arc,zfs-arc,arc-latency,arc-eviction,latency

[ARC tuning observations] Dtrace snippets and scripts for observing ARC behavior. #tags:arc,zfs-arc,arc-latency,arc-eviction,latency,changes,mutexes,shrinking,evictions,arc-throughput,cache,zfs-cache
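
The arcstat values these scripts read via backtick kernel globals are also exported through the zfs:0:arcstats kstat, which is handy for cross-checking what a script reports. A minimal sketch, assuming the illumos kstat(1M) utility:

kstat -p zfs:0:arcstats:size zfs:0:arcstats:c zfs:0:arcstats:p zfs:0:arcstats:c_min zfs:0:arcstats:c_max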

arcevictcsv.d
dtrace -qn '
  /*
   * Measure the amount of time it takes to adjust the size of the ARC
   * by a given number of bytes. Output is at 1-second granularity, with
   * sizes in KBytes and durations in microseconds (us).
   */
  BEGIN {
    printf("interval,ev.KBps,avg.lat,longest,tottime\n");
    start_time = timestamp;
    interval = 0; data = 0;
  }
  ::arc_adjust:entry {
    self->ts = timestamp;
  }
  ::arc_adjust:return / self->ts > 0/ {
    this->delta = (timestamp-self->ts) / 1000;
    this->kb    = args[1] / 1024;
    self->ts    = 0;
    data        = 1;
    @avlat[interval]      = avg(this->delta);
    @tottime[interval]    = sum(this->delta);
    @longest[interval]    = max(this->delta);
    @sz[interval]         = sum(this->kb);
  }
  /*
   * We want to make sure to print NAs instead of 0's or nothing
   * in order to maintain regular time-series, more or less.
   */
  tick-1sec /!data/ {
    printf("%d,NA,NA,NA,NA\n", interval) ;
    interval = (timestamp - start_time) / 1000000000 ;
  }
  tick-1sec /data/ {
    data = 0 ; 
    interval = (timestamp - start_time) / 1000000000 ;
    printa("%d,%@d,%@d,%@d,%@d\n", @sz, @avlat, @longest, @tottime);
    trunc(@tottime); trunc(@sz); 
    trunc(@avlat); trunc(@longest);
  }'
arcadjustfrom.d
#!/usr/sbin/dtrace -Cs
/* Dtrace script checking whether arc_adjust was called from arc_shrink
 * or from arc_reclaim_thread.
 */
#pragma D option quiet
#define AR "arc_reclaim_thread"
#define AS "arc_shrink" 

  BEGIN {
    x["arc_reclaim_thread"] = 0;
    x["arc_shrink"] = 0;
    in_shrink = 0;
    in_reclaim = 0;
    new_data = 0;
  }

  ::arc_reclaim_thread:entry {
    ++in_reclaim;
  }

  ::arc_shrink:entry {
    ++in_shrink;
  }

  ::arc_adjust:return /in_reclaim/ {
    x[AR] = args[1];
  }

  ::arc_adjust:return /in_shrink/ {
    x[AS] = args[1];
  }

  ::arc_adjust:return /x[AR] && x[AS]/ {
    @[AR] = sum(x[AR]);
    @[AS] = sum(x[AS]);
    x[AR] = 0;
    x[AS] = 0;
    in_reclaim = in_reclaim > 0 ? in_reclaim - 1 : 0;
    in_shrink = in_shrink > 0 ? in_shrink - 1 : 0;
    ++new_data;
  }

  tick-1sec /new_data/ {
    printa(@);
    trunc(@);
    new_data = 0;
  }
arcadjtimecsv.d
dtrace -qn '
    BEGIN { 
        printf("timestamp,shrink,duration\n");
        in_shrink = 0;
    }
    ::arc_adjust:entry,::arc_kmem_reap_now:entry {
        self->x[curpsinfo->pr_psargs] = timestamp;
    }
    /* If the ARC is being resized we need to know about this. */
    ::arc_shrink:entry
    {
        in_shrink = 1;
    }

::arc_kmem_reap_now:return,
::arc_adjust:return
/self->x[curpsinfo->pr_psargs] && ((timestamp - self->x[curpsinfo->pr_psargs]) / 1000000 > 0)/
{
    printf("%Y,%d,%d\n", walltimestamp, in_shrink,
        (timestamp - self->x[curpsinfo->pr_psargs]) / 1000000);
    self->x[curpsinfo->pr_psargs] = 0;
    in_shrink = 0;
}'
arc-shrink-detail-csv.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2015, RackTop Systems.
 *
 * Description:
 * This script is meant to obtain information about ARC internal state, from
 * values that are not necessarily exposed via kstat, and these values are
 * potentially modified inside of arc_shrink. As such, we are trying to observe
 * values at the beginning and at the end of the function call.
 */
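/*
 * Interpretation note (illustrative): c.adjust below is c.top - c.bottom,
 * so a positive value means arc_shrink lowered the ARC target size (arc_c)
 * during the call; p.adjust and size.adjust read the same way.
 */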

BEGIN {
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
    "timestamp", "duration", "avail.rmem", "free.mem", "lotsfree.mem",
    "c.adjust", "p.adjust", "size.adjust",
    "bytes.to.free", "arc_c_min", "arc_c_max",
    "c.top", "p.top", "c.size.top",
    "c.bottom", "p.bottom", "c.size.bottom",
    "c.gt.arc_c_min.top", "c.gt.arc_p.top",
    "c.gt.arc_c_min.bottom", "c.gt.arc_p.bottom"
);
}
::arc_shrink:entry {
  self->availrmem = (pgcnt_t)`availrmem << 12;
  self->freemem   = (pgcnt_t)`freemem << 12;
  self->lotsfree  = (pgcnt_t)`lotsfree << 12;
  self->start     = timestamp;
  self->tofree    = args[0];
  self->c_min     = `arc_stats.arcstat_c_min.value.ull;
  self->c_max     = `arc_stats.arcstat_c_max.value.ull;
  self->c_top     = `arc_stats.arcstat_c.value.ull;
  self->p_top     = `arc_stats.arcstat_p.value.ull;
  self->size_top  = `arc_stats.arcstat_size.value.ull;
  self->c_gt_arc_c_min_top = self->c_top > self->c_min ? 1 : 0;
  self->c_gt_arc_p_top = self->c_top > self->p_top ? 1 : 0;
}

::arc_shrink:return /self->start/ {
  this->delta     = (timestamp - self->start) / 1000; /* Duration of call */
  self->size_btm  = `arc_stats.arcstat_size.value.ull;
  self->c_btm     = `arc_stats.arcstat_c.value.ull;
  self->p_btm     = `arc_stats.arcstat_p.value.ull;
  self->c_gt_arc_c_min_btm = self->c_btm > self->c_min ? 1 : 0;
  self->c_gt_arc_p_btm = self->c_btm > self->p_btm ? 1 : 0;
  this->c_adjust = self->c_top - self->c_btm;
  this->p_adjust = self->p_top - self->p_btm;
  this->size_adjust = self->size_top - self->size_btm;

  printf("%Y,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
  walltimestamp, this->delta, self->availrmem, self->freemem, self->lotsfree,
  this->c_adjust, this->p_adjust, this->size_adjust,
  self->tofree, self->c_min, self->c_max,
  self->c_top, self->p_top, self->size_top,
  self->c_btm, self->p_btm, self->size_btm,
  self->c_gt_arc_c_min_top, self->c_gt_arc_p_top,
  self->c_gt_arc_c_min_btm, self->c_gt_arc_p_btm
  );
}
arc-shrink-bytes-to-free.d
dtrace -qn '
  BEGIN {
    ts = walltimestamp;
    printf("timestamp,arc_c_min,arc_c_max,bytes.to.free\n");
  }
  ::arc_shrink:entry /ts/ {
    this->tofree = args[0];
    this->min = `arc_stats.arcstat_c_min.value.ull;
    this->max = `arc_stats.arcstat_c_max.value.ull;
    @[ts, this->min, this->max] = sum(this->tofree);
  }
  tick-1sec {
    printa("%Y,%d,%d,%@d\n", @);
    trunc(@);
    ts = walltimestamp;
  }'
  
// More detailed version of the above snippet.
dtrace -qn '
  BEGIN {
    ts = walltimestamp;
    printf("timestamp,bytes.to.free,arc_c_min,arc_c_max,c,p,c.size\n");
  }
  ::arc_shrink:entry /ts/ {
    self->tofree = args[0];
    self->min  = `arc_stats.arcstat_c_min.value.ull;
    self->max  = `arc_stats.arcstat_c_max.value.ull;
    self->c    = `arc_stats.arcstat_c.value.ull;
    self->p    = `arc_stats.arcstat_p.value.ull;
    self->size = `arc_stats.arcstat_size.value.ull;
  }
  ::arc_shrink:return /self->min/ {
    printf("%Y,%d,%d,%d,%d,%d,%d\n", walltimestamp,
    self->tofree, self->min, self->max, self->c, self->p, self->size);
    self->tofree = 0; self->min = 0; self->max = 0;
    self->c = 0; self->p = 0; self->size = 0;
  }'
arc-release-bufsz-detail-csv.d
dtrace -qn '
  #pragma D option dynvarsize=1m
  BEGIN {
    printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n", 
          "timestamp", "call.count", "b512.count", "b1024.count", 
          "b2048.count", "b4096.count", "b16384.count", "b32768.count", 
          "b65536.count", "b131072.count", "call.dur.us");
    have_data = 0;
    ts = walltimestamp;
    }
  ::arc_release:entry /ts/ {
    this->bhdr = args[0]->b_hdr;
    self->sz = this->bhdr->b_size;
    self->start = timestamp;
  }
  ::arc_release:return /self->start/ {
    this->delta = timestamp - self->start;
    @cnt[ts]    = count();
    @dur[ts]    = sum(this->delta/1000);
    /* If size matches bucket, add 1 to sum for that bucket, build buckets
     * for common buffer sizes, such as 512, 4K, 16K, 32K, etc.
     */
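    /* Example (illustrative): a 4096-byte buffer adds 1 to @cnt4096[ts]
     * and 0 to every other size bucket for that interval. */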
    @cnt512[ts]    = sum(self->sz == 512 ? 1 :0);
    @cnt1024[ts]   = sum(self->sz == 1024 ? 1 :0);
    @cnt2048[ts]   = sum(self->sz == 2048 ? 1 :0);
    @cnt4096[ts]   = sum(self->sz == 4096 ? 1 :0);
    @cnt16384[ts]  = sum(self->sz == 16384 ? 1 :0);
    @cnt32768[ts]  = sum(self->sz == 32768 ? 1 :0);
    @cnt65536[ts]  = sum(self->sz == 65536 ? 1 :0);
    @cnt131072[ts] = sum(self->sz == 131072 ? 1 :0);
    ++have_data;
    self->sz = 0;
    self->start = 0;
  }
  /* 
   * Zero values to avoid gaps in time-series, so as to make it easier 
   * to plot this information.
   */
  tick-1sec /!have_data/ {
    ts = walltimestamp;
    printf("%Y,0,0,0,0,0,0,0,0,0,0\n", ts);
  }
  tick-1sec /have_data/ {
    printa("%Y,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
          @cnt, @cnt512, @cnt1024, @cnt2048, 
          @cnt4096, @cnt16384, @cnt32768, 
          @cnt65536, @cnt131072, @dur); 
    trunc(@cnt512); trunc(@cnt1024); 
    trunc(@cnt2048); trunc(@cnt4096);
    trunc(@cnt16384); trunc(@cnt32768); 
    trunc(@cnt65536); trunc(@cnt131072);
    trunc(@cnt); trunc(@dur);
    
    have_data = 0;
    ts = walltimestamp;
  }
  END {
    exit(0);
  }'
arc-rel-mutex-time-csv.d
dtrace -qn '
  /*
   * Measure the amount of time spent between entry and return of arc_release,
   * as well as the time spent in mutex locks while inside arc_release.
   */
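  /*
   * Technique note (illustrative): incrementing self->x[curthread] on entry
   * to arc_release acts as a per-thread flag, so the lockstat clauses below
   * only time mutexes acquired while the current thread is inside
   * arc_release.
   */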
  BEGIN {
    ts = walltimestamp;
    printf("timestamp,count.ar,count.mutex,dur.ar.us,dur.mutex.us\n");
  }
  
  self int x[kthread_t *];

  ::arc_release:entry {
    ++self->x[curthread];
    self->arstart = timestamp;
  }

  lockstat::mutex_enter:adaptive-acquire /self->x[curthread] > 0/ {
    /* arg0 is a pointer to kmutex_t struct representing the lock */
    self->start[arg0] = timestamp;
    @cnt_lck[ts] = count();
  }

  lockstat::mutex_exit:adaptive-release /self->start[arg0]/ {
    this->delta = timestamp - self->start[arg0];
    @t_lck[ts] = sum(this->delta / 1000);
    self->start[arg0] = 0;
  }
  
  ::arc_release:return /self->x[curthread] && self->arstart/ {
    this->delta = timestamp - self->arstart;
    self->x[curthread] = 0;
    self->arstart = 0;
    @t_ar[ts] = sum(this->delta / 1000);
    @cnt_ar[ts] = count();
  }
  
  tick-1sec {
    printa("%Y,%@d,%@d,%@d,%@d\n", 
          @cnt_ar, @cnt_lck, @t_ar, @t_lck);
    trunc(@cnt_lck); trunc(@t_lck); trunc(@cnt_ar); trunc(@t_ar);
    ts = walltimestamp;
  }
'
arc-evict-state-impl-len-size-csv.d
dtrace -qn '
  int data; uint64_t start;
  BEGIN {
    printf("timedelta,call.count,arc.sz,evict.sz.kb,call.dur.us\n");
    start = timestamp;
    
    }
  ::arc_evict_state_impl:entry {
    self->start = timestamp;
  }
  ::arc_evict_state_impl:return /self->start/ {
    this->delta = timestamp - self->start ;
    self->start = 0 ;
    this->offset = (timestamp - start) / 1000000000 ;
    @cnt[this->offset]    = count();
    @dur[this->offset]    = sum(this->delta/1000);
    /* total number of kilobytes evicted */
    @evsz[this->offset]   = sum(args[1] >> 10);
    @arcsz[this->offset]  = avg(`arc_stats.arcstat_size.value.ui64 >> 10);
    data = 1 ;
  }
  /* 
   * NA values to avoid gaps in time-series, so as to make it easier 
   * to plot this information.
   */
  tick-1sec /!data/ {
    printf("%d,NA,NA,NA,NA\n", (timestamp - start) / 1000000000);
  }
  tick-1sec /data/ {
    printa("%d,%@d,%@d,%@d,%@d\n", @cnt, @arcsz, @evsz, @dur);
    trunc(@cnt); trunc(@arcsz); trunc(@evsz); trunc(@dur);
    data = 0;
  }
  END {
    exit(0);
  }'
arc-evict-state-impl-len-csv.d
dtrace -qn '
  int data, last; uint64_t start;
  BEGIN {
    printf("%s,%s,%s,%s,%s,%s,%s,%s\n",
            "timedelta", "call.count", "arc.sz.kb", "arc.c.kb", "evict.kb",
            "av.evict.kb", "call.len.us", "longest.call.len.us"
    );
    start = timestamp;
    
    }
  ::arc_evict_state_impl:entry {
    self->start = timestamp;
  }
  ::arc_evict_state_impl:return /self->start/ {
    this->delta = timestamp - self->start ;
    self->start  = 0 ;
    data        = 1 ;

    @cnt        = count();
    @dur        = sum(this->delta/1000);
    @longest    = max(this->delta/1000);
    /* total number of kilobytes evicted */
    @evsz       = sum(args[1] >> 10);
    @arcszkb    = avg(`arc_stats.arcstat_size.value.ui64 >> 10);
    @arcckb     = avg(`arc_stats.arcstat_c.value.ui64 >> 10);
    @avevkb     = avg(args[1] >> 10); /* average KB per eviction */
  }
  /* 
   * NA values to avoid gaps in time-series, so as to make it easier 
   * to plot this information.
   */
  tick-1sec /!data/ {
    printf("%d,NA,NA,NA,NA,NA,NA,NA\n",
            (timestamp - start) / 1000000000);
  }
  tick-1sec /data/ {
    this->seconds = (timestamp - start) / 1000000000 ;
    /* Normalize to per-second rates over the interval since the last print. */
    this->elapsed = this->seconds > last ? this->seconds - last : 1 ;
    normalize(@cnt, this->elapsed) ;
    normalize(@evsz, this->elapsed) ;
    normalize(@dur, this->elapsed) ;

    printf("%d,", this->seconds);
    printa("%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @cnt, @arcszkb, @arcckb, @evsz, @avevkb, @dur, @longest);
    trunc(@cnt); trunc(@arcszkb); trunc(@arcckb); 
    trunc(@evsz); trunc(@dur); trunc(@avevkb); trunc(@longest);
    data = 0;
    last = this->seconds;
  }
  END {
    exit(0);
  }'
arc-evict-slow-csv.d
dtrace -qn '
    /*
     * Measure the amount of time it takes to adjust the size of the ARC
     * by a given number of bytes. We expect 1-second granularity, with
     * sizes in KBytes and durations in microseconds (us).
     */
  inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
  inline const string HEADER = 
    "timestamp,evicted.kb,slow,longest.call_t.us,free.mem.kb\n";

    BEGIN {
        printf("%s", HEADER);
        cnt = 0;
    }
    tick-1sec {
        ts = walltimestamp;
    }
    ::arc_adjust:entry /ts > 0/ {
        self->ts = timestamp;
        /* Unreserved memory in kilobytes */
        self->freemem = (`freemem << 12) >> 10;
    }
    ::arc_adjust:return /self->ts/ {
        this->delta = (timestamp-self->ts) / 1000;
        this->l = this->delta > SLOW ? 1 : 0;
        this->kb = args[1] / 1024;
        @delay_max[ts, this->kb, this->l] = max(this->delta);
        @freemem[ts, this->kb, this->l] = min(self->freemem);
        self->ts = 0;
        self->freemem = 0;
    }
    ::arc_adjust:return /this->l > 0/ {
    /* Duration and size are not necessarily linked. One should not assume
     * that more time spent means more bytes evicted. arc_adjust calls other
     * functions and spends time in them; this script measures the total time
     * spent in those calls. Depending on how much work they do, they may be
     * slower or faster, so the number of bytes evicted does not directly
     * relate to the time it takes to evict them.
     */
        printa("%Y,%d,%d,%@d,%@d\n", @delay_max, @freemem);
        trunc(@delay_max); trunc(@freemem);
    }'
arc-evict-slow-csv-2.d
#!/usr/sbin/dtrace -s
#pragma D option quiet
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2015, RackTop Systems.
 *
 * Description:
 * This script is meant to measure ARC's adjustment mechanism performance.
 * It has been observed that it takes sometimes a long period of time to
 * adjust the lists in the ARC, but what is not clear is the overall impact.
 * We can measure here how much time is spent doing adjustments and what the
 * size of adjustments is. This tool lets us see the size of the ARC when
 * adjustment happens, the number of bytes processed and the amount of time
 * it takes to complete adjustments.
 */
 
 /*
  * Measure the amount of time it takes to adjust the size of the ARC
  * by a given number of bytes. We expect 1-second granularity, with
  * sizes in KBytes and durations in microseconds (us).
  */
  inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
  inline const string HEADER = 
    "timestamp,arc.sz.kb,evict.kb,slow,longest.call_t.us\n";

BEGIN {
    printf("%s", HEADER);
    cnt = 0;
}
tick-1sec {
    ts = walltimestamp;
}
::arc_adjust:entry /ts > 0/ {
    self->ts = timestamp;
}
::arc_adjust:return /self->ts/ {
    this->delta = (timestamp-self->ts) / 1000;
    this->l = this->delta > SLOW ? 1 : 0;
    this->evicted_kb = args[1] / 1024;
    this->arc_sz_kb = `arc_stats.arcstat_size.value.ui64 >> 10;
    @delay_max[ts, this->arc_sz_kb,
            this->evicted_kb, this->l] = max(this->delta);
    
}
::arc_adjust:return /self->ts/ {
/* Duration and size are not necessarily linked. One should not assume
 * that more time spent means more bytes evicted. arc_adjust calls other
 * functions and spends time in them; this script measures the total time
 * spent in those calls. Depending on how much work they do, they may be
 * slower or faster, so the number of bytes evicted does not directly
 * relate to the time it takes to evict them.
 */
    printa("%Y,%d,%d,%d,%@d\n", @delay_max);
    trunc(@delay_max);
    self->ts = 0;
}
arc-evict-slow-csv-1.d
dtrace -qn '
    /*
     * Measure the amount of time it takes to adjust the size of the ARC
     * by a given number of bytes. We expect 1-second granularity, with
     * sizes in KBytes and durations in microseconds (us).
     */
  inline const int SLOW = 1000; /* (1ms) limit triggering logging of data */
  inline const string HEADER = 
    "timestamp,arc.sz.kb,evict.kb,slow,longest.call_t.us\n";

    BEGIN {
        printf("%s", HEADER);
        cnt = 0;
    }
    tick-1sec {
        ts = walltimestamp;
    }
    ::arc_adjust:entry /ts > 0/ {
        self->ts = timestamp;
    }
    ::arc_adjust:return /self->ts/ {
        this->delta = (timestamp-self->ts) / 1000;
        this->l = this->delta > SLOW ? 1 : 0;
        this->evicted_kb = args[1] / 1024;
        this->arc_sz_kb = `arc_stats.arcstat_size.value.ui64 >> 10;
        @delay_max[ts, this->arc_sz_kb,
                this->evicted_kb, this->l] = max(this->delta);
        
    }
    ::arc_adjust:return /self->ts/ {
    /* Duration and size are not necessarily linked. One should not assume
     * that more time spent means more bytes evicted. arc_adjust calls other
     * functions and spends time in them; this script measures the total time
     * spent in those calls. Depending on how much work they do, they may be
     * slower or faster, so the number of bytes evicted does not directly
     * relate to the time it takes to evict them.
     */
        printa("%Y,%d,%d,%d,%@d\n", @delay_max);
        trunc(@delay_max);
        self->ts = 0;
    }'
arc-evict-by-type-1.d
dtrace -qn '
  /*
   * Collect how many bytes the ARC is being asked to evict and how many bytes
   * it actually evicted, as well as the type of the buffers being evicted,
   * i.e. data, metadata, etc.
   */
   enum arc_buf_contents {
   	ARC_BUFC_DATA,				/* buffer contains data */
   	ARC_BUFC_METADATA,			/* buffer contains metadata */
   	ARC_BUFC_NUMTYPES
   };

  BEGIN {
    ts = walltimestamp;
    printf("timestamp,buf.type,kbytes.needed,kbytes.evicted\n");
  }
  ::arc_evict_state:entry /ts/ {
    self->typ = args[3];
    self->bytes = args[2] >> 10;
    }
  ::arc_evict_state:return /ts/ {
    self->evicted = args[1] >> 10;
    @bytes[ts, self->typ == ARC_BUFC_DATA ? "D" : 
      self->typ == ARC_BUFC_METADATA ? 
      "MD" : "OTHER" ] = sum(self->bytes);
    @evicted[ts, self->typ == ARC_BUFC_DATA ? "D" : 
      self->typ == ARC_BUFC_METADATA ? 
      "MD" : "OTHER" ] = sum(self->evicted);
  }
  tick-1sec {
    printa("%Y,%s,%@d,%@d\n", @bytes, @evicted);
    trunc(@bytes);
    trunc(@evicted);
    ts = walltimestamp;
  }'
arc-ev-state-hdr-duration.d
dtrace -qn '
  /* Measure length of time to complete state and header evictions */
  inline int LONG_CALL = 20000 ; /* nanoseconds */
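  /* Example (illustrative): a 25000 ns call to arc_evict_state is at or
   * above LONG_CALL, so its duration is quantized under the key
   * "arc_evict_state-long" rather than "arc_evict_state". */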
  ::arc_evict_state:entry,::arc_evict_hdr*:entry {
    self->s = timestamp ; 
  } 
  ::arc_evict_state:return,::arc_evict_hdr*:return /self->s/ {
    this->delta = timestamp - self->s ;
    /* 
     * If we are above the LONG_CALL threshold, aggregate value
     * under the key `probefunc`-long.
     */
    this->lab = this->delta >= LONG_CALL ?
                strjoin(probefunc, "-long") : probefunc ;
    @[this->lab] = quantize(this->delta) ;
    self->s = 0 ;
  }'
arc-adj-mutex-miss-csv.d
dtrace -qn '
  /* Measure number of times we missed acquiring a mutex lock in 
   * arc_evict_hdr and in arc_evict_state_impl.
   */
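  /* Technique note (illustrative): arcstat_mutex_miss is a cumulative
   * counter, so the per-call miss count below is the value read at return
   * minus the value snapshotted at entry. */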
  BEGIN {
    printf("timestamp,function,mutex.misses,call.duration.us\n");
    changed = 0;
    f["arc_evict_hdr"] = "evict_hdr";
    f["arc_evict_state_impl"]  = "evict_state";
  }
  tick-1sec {
    ts = walltimestamp;
  }
  ::arc_evict_hdr:entry, 
  ::arc_evict_state_impl:entry {
    self->start = timestamp;
    self->x[probefunc] = `arc_stats.arcstat_mutex_miss.value.ui64;
  } 
  ::arc_evict_hdr:return, 
  ::arc_evict_state_impl:return /ts && self->start/ {
    this->delta_t = (timestamp - self->start) / 1000;
    this->delta_miss = 
      `arc_stats.arcstat_mutex_miss.value.ui64 - self->x[probefunc];
    @[ts, f[probefunc]] = sum(this->delta_miss);
    @max_t[ts, f[probefunc]] = max(this->delta_t);
    changed += this->delta_miss > 0;
    self->start = 0;
    self->x[probefunc] = 0;
  }
  tick-1sec {
    printa("%Y,%s,%@d,%@d\n", @, @max_t);
    trunc(@);
    trunc(@max_t);
    changed = 0;
  }
  END {
    exit(0);
  }'

d [DTrace snippets for ZFS at the vnode (file) level] Snippets for observing core ZFS filesystem functions such as zfs_read and zfs_write for determining latency and IO sizes

[DTrace snippets for ZFS at the vnode (file) level] Snippets for observing core ZFS filesystem functions (such as zfs_read and zfs_write), used to determine filesystem latency and measure IO sizes.
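
A minimal sketch of the approach used throughout this section (assuming the fbt provider exposes zfs_read and zfs_write on the running build; function names may differ between releases):

dtrace -qn '
  /* Latency distribution, in nanoseconds, per function. */
  ::zfs_read:entry,::zfs_write:entry { self->s = timestamp; }
  ::zfs_read:return,::zfs_write:return /self->s/ {
    @[probefunc] = quantize(timestamp - self->s);
    self->s = 0;
  }
  tick-10sec { printa(@); trunc(@); }'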

zio-read-write-latency-csv.d
dtrace -qn '
/*
 * This script is intended for measuring basic characteristics of
 * the zio_{read,write} functions in ZFS. These functions are a good
 * proxy for how the underlying storage is performing under current load.
 * Resulting data is CSV-formatted for easy digestion by various command-line
 * tools and import into R and other statistical tools.
 * Totals are normalized to a per-second rate, i.e. the amount of time spent
 * in each function during a sample interval is expressed per second of real time.
 */
  uint64_t start ;
  int interval, rmin, rmax, wmin, wmax, rdata, wdata ;
  int cnt[string] ;
  int ldelta[string] ;
  signed long lastema[string] ; /* exp. moving avg. associative array */
 /* 
  * Weight for the EMA calculation is W = 2 ÷ (N + 1), where N is the number
  * of periods. Because we cannot do floating-point math in DTrace, we note
  * that a 20-period EMA gives 2 ÷ (20 + 1) ~= 0.0952 to four decimal places;
  * multiplying by 10000 yields the integer weight chosen below.
  */
  inline int WEIGHT = 952 ;
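  /*
   * Worked example (illustrative): each update computes
   *   ema = (WEIGHT * sample + (10000 - WEIGHT) * lastema) / 10000
   * so a sample of 500 against a previous EMA of 400 gives
   * (952 * 500 + 9048 * 400) / 10000 = 409 with integer division,
   * i.e. the EMA moves roughly a tenth of the way toward the new sample.
   */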
  BEGIN {
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
          "delta", "rlat", "rlat.ema", "rminlat", "rmaxlat", "rcnt", 
          "read.tot.tm", "read.range", "wlat", "wlat.ema", "wminlat", 
          "wmaxlat", "wcnt", "write.tot.tm", "write.range") ;
    start = timestamp ;
  }
  /*
   * To avoid losing information, due to lack of floating point arithmetic
   * support in dtrace, we avoid conversion from nanoseconds to microseconds
   * in the return clauses, where we compute delta between entry and return
   * triggers. This should be easy to accomplish later on, if necessary in the
   * data processing pipeline.
   */
  ::zio_read:entry,::zio_write:entry { self->s = timestamp ; }

  ::zio_read:return /self->s/ {
    this->delta = timestamp - self->s ;
    rmin = rmin == 0 ? this->delta : rmin > this->delta ? this->delta : rmin ;
    rmax = rmax == 0 ? this->delta : rmax < this->delta ? this->delta : rmax ;
    
    @rlat    = avg(this->delta) ;
    @rminlat = min(this->delta) ;
    @rmaxlat = max(this->delta) ;
    @rcnt    = count() ;
    @rtottm  = sum(this->delta) ;

    rdata = 1 ; cnt[probefunc]++ ;
  }

  ::zio_write:return /self->s/ {
    this->delta = timestamp - self->s ;
    wmin = wmin == 0 ? this->delta : wmin > this->delta ? this->delta : wmin ;
    wmax = wmax == 0 ? this->delta : wmax < this->delta ? this->delta : wmax ;

    @wlat    = avg(this->delta) ;
    @wminlat = min(this->delta) ;
    @wmaxlat = max(this->delta) ;
    @wcnt    = count() ;
    @wtottm  = sum(this->delta) ;
    
    wdata = 1 ; cnt[probefunc]++ ;
  }

  /*
   * Consider this a required first interval to seed EMA,
   * and do not use this sample for additional computations.
   */
  ::zio_read:return,::zio_write:return
  /self->s && lastema[probefunc] == 0 && this->delta/ {
  lastema[probefunc] = this->delta ;
  self->s = 0 ;
  }

  /* 1/10th of the time do EMA computation. We are purposely skipping over
   * some observations to make the data smoother.
   */
  ::zio_read:return,::zio_write:return
  /self->s && lastema[probefunc] > 0 && this->delta > 0
  && (cnt[probefunc] % 10) == 0/ {
      /* If count is > 10000, reset it to 0, start over. */
      cnt[probefunc] = cnt[probefunc] > 10000 ? 0 : cnt[probefunc] ;
    /*
     * Compute the exponential moving average and, for lack of a better means,
     * store values in an avg() aggregate, which is not ideal but better
     * than nothing at all.
     */
    this->ema           = ((WEIGHT * this->delta) + 
                          ((10000 - WEIGHT) * lastema[probefunc])) / 10000 ;
    lastema[probefunc]  = this->ema ;
    self->s = 0 ;
  }
  ::zio_read:return /this->ema/ {
    @rlatema            = avg(this->ema) ;
    this->ema           = 0 ;
  }
  ::zio_write:return /this->ema/ {
    @wlatema            = avg(this->ema) ;
    this->ema           = 0 ;

  }
  tick-5sec /!rdata && !wdata/ {
  /*
   * If the function calls did not happen, we print NA to make it clear
   * that values are not available, as opposed to printing 0s.
   */
   this->toffset = (timestamp - start) / 1000000000 ;
    printf("%d,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA\n",
            this->toffset) ;
    interval = this->toffset ;            
  }
  tick-5sec /rdata || wdata/ {
    /* 
     * We are only using a sum here because we need something that will store
     * results as an aggregated variable, which we can print with printa.
     */
    this->toffset = (timestamp - start) / 1000000000 ;
    @rrng = sum(rdata > 0 ? rmax - rmin : 0) ;
    @wrng = sum(wdata > 0 ? wmax - wmin : 0) ;
    normalize(@rtottm, this->toffset - interval) ; 
    normalize(@wtottm, this->toffset - interval) ;
    normalize(@rcnt, this->toffset - interval) ; 
    normalize(@wcnt, this->toffset - interval) ;

    printf("%d,", this->toffset) ;
    printa("%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
            @rlat, @rlatema, @rminlat, @rmaxlat, @rcnt, @rtottm, @rrng,
            @wlat, @wlatema, @wminlat, @wmaxlat, @wcnt, @wtottm, @wrng) ;

    trunc(@rlat) ; trunc(@rlatema) ; trunc(@rminlat) ; trunc(@rmaxlat) ;
    trunc(@rcnt) ; trunc(@rtottm) ; trunc(@rrng) ;
    trunc(@wlat) ; trunc(@wlatema) ; trunc(@wminlat) ; trunc(@wmaxlat) ;
    trunc(@wcnt) ; trunc(@wtottm) ; trunc(@wrng) ;
    rmin = 0 ; rmax = 0 ; rdata = 0 ; wmin = 0 ; wmax = 0 ; wdata = 0 ;
    interval = this->toffset ;
  }'
zfs-rw-lat-iosize-hists.d
dtrace -qn '
#pragma D option dynvarsize=8m
int gotdata ;

  BEGIN {
    printf("pool,io.type,lt100us,lt1000us,lt10000us,lt100ms,lt1000ms,gt1000ms,blkszle1k,blksz2k,blksz4k,blksz8k,blksz16k,blksz32k,blksz64k,blksz128k,blksz256k,blkszgt256k,io.count\n") ;
    x["zfs_read"] = "R" ;
    x["zfs_write"] = "W" ;
  }

  ::zfs_read:entry,::zfs_write:entry
  /args[0]->v_path != NULL &&
   strstr(strtok(args[0]->v_path, "/storage/"), "p") != NULL/ {
    self->b = args[1]->uio_resid ; /* Number of bytes to read or write */
    self->p = strtok(args[0]->v_path, "/storage/") ; /* Should be pool name */
    self->s = timestamp ;
    gotdata = 1 ;
}

  ::zfs_read:return,::zfs_write:return /self->s/ {
    /*
     * Delta is converted from nanoseconds to microseconds; since we
     * expect few IOs to complete in a matter of a few nanoseconds,
     * operating with microseconds instead makes sense.
     */
    this->delta = (timestamp - self->s) / 1000 ;
    this->n = self->p ;
    @cnt[stringof(this->n), x[probefunc]] = count();
    /*
     * Histogram with 6 bins representing the distribution of latencies
     * for issued IOs. Bins range from <=100us to >1000ms.
     * We use the sum() function to increment by 1 each time a value
     * falls within a bucket, and otherwise leave the bucket unchanged.
     */
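    /* Example (illustrative): a read completing in 3500us adds 1 only to
     * @lt10000us for its pool, since 1000 < 3500 <= 10000; every other
     * latency bucket receives a 0 for that IO. */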
    @lt100us[stringof(this->n), x[probefunc]] =
        sum(this->delta <= 100 ? 1 : 0) ;
    @lt1000us[stringof(this->n), x[probefunc]] =
        sum(this->delta > 100 && this->delta <= 1000 ? 1 : 0) ;
    @lt10000us[stringof(this->n), x[probefunc]] =
        sum(this->delta > 1000 && this->delta <= 10000 ? 1 : 0) ;
    @lt100ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 10000 && this->delta <= 100000 ? 1 : 0) ;
    @lt1000ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 100000 && this->delta <= 1000000 ? 1 : 0) ;
    @gt1000ms[stringof(this->n), x[probefunc]] =
        sum(this->delta > 1000000 ? 1 : 0) ;
    /*
     * Histogram with 10 bins representing the distribution of IO sizes
     * for issued IOs. Bins range from <=1K to >256K.
     * We use the sum() function to increment by 1 each time a value
     * falls within a bucket, and otherwise leave the bucket unchanged.
     */

    @blkszle1k[stringof(this->n), x[probefunc]] = 
        sum(self->b <= 0x400 ? 1 : 0) ;

    @blksz2k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x400 && self->b <= 0x800 ? 1 : 0) ;

    @blksz4k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x800 && self->b <= 0x1000 ? 1 : 0) ;
    
    @blksz8k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x1000 && self->b <= 0x2000 ? 1 : 0) ;
    
    @blksz16k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x2000 && self->b <= 0x4000 ? 1 : 0) ;
    
    @blksz32k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x4000 && self->b <= 0x8000 ? 1 : 0) ;
    
    @blksz64k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x8000 && self->b <= 0x10000 ? 1 : 0) ;

    @blksz128k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x10000 && self->b <= 0x20000 ? 1 : 0) ;

    @blksz256k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x20000 && self->b <= 0x40000 ? 1 : 0) ;

    @blkszgt256k[stringof(this->n), x[probefunc]] = 
        sum(self->b > 0x40000 ? 1 : 0) ;

    self->b  = 0 ;
    self->p = 0 ;
    self->s = 0 ;
  }

  tick-1sec /gotdata/ {
    printa("%s,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n", 
            @lt100us, @lt1000us, @lt10000us, @lt100ms, @lt1000ms, 
            @gt1000ms, @blkszle1k, @blksz2k, @blksz4k, @blksz8k, @blksz16k,
            @blksz32k, @blksz64k, @blksz128k, @blksz256k, @blkszgt256k, @cnt
            ) ;
    trunc(@lt100us) ; trunc(@lt1000us) ; trunc(@lt10000us) ;
    trunc(@lt100ms) ; trunc(@lt1000ms) ; trunc(@gt1000ms) ;
    trunc(@blkszle1k) ; trunc(@blksz2k) ; trunc(@blksz4k) ;
    trunc(@blksz8k) ; trunc(@blksz16k) ; trunc(@blksz32k) ;
    trunc(@blksz64k) ; trunc(@blksz128k) ; trunc(@blksz256k) ;
    trunc(@blkszgt256k) ; trunc(@cnt) ;
    gotdata = 0 ;
  }'
zfs-long-duration.d
dtrace -xdynvarsize=10M -qn '
    BEGIN {
        MSEC        = 1000000;
        MICROSEC    = 1000;
        threshold   = 50 * MSEC;
        div = "--------------------------------------------------------------------------------"
    }
    fbt:zfs::entry {
        self->start[probefunc]=timestamp
    } 
    /* Record the return time; only the next clause runs when we are over threshold. */
    fbt:zfs::return {
        this->now = timestamp;
    }
    fbt:zfs::return
    /this->now && self->start[probefunc] &&
     (this->now - self->start[probefunc]) > threshold/ {
        @["delta (ms)", probefunc] = quantize(
            (timestamp - self->start[probefunc]) / MSEC); 
        self->start[probefunc] = 0;
        this->now = 0;
    }
    tick-5sec {
    printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}'
run_dtrace_long_routines.sh
#!/bin/sh

dtrace -xdynvarsize=10M -qn '
    BEGIN {
        MSEC        = 1000000;
        MICROSEC    = 1000;
        threshold   = 50 * MSEC;
        div = "--------------------------------------------------------------------------------"
    }
    fbt:zfs::entry {
        self->start[probefunc]=timestamp
    }
    /* Record the return time; only the next clause runs when we are over threshold. */
    fbt:zfs::return {
        this->now = timestamp;
    }
    fbt:zfs::return
    /this->now && self->start[probefunc] &&
     (this->now - self->start[probefunc]) > threshold/ {
        @["delta (ms)", probefunc] = quantize(
            (timestamp - self->start[probefunc]) / MSEC);
        self->start[probefunc] = 0;
        this->now = 0;
    }
    tick-5sec {
    printf("%s\n", div); printf("%Y\n", walltimestamp); printa(@); trunc(@);
}' >> /root/long-running.log
fop-zfs-snippets.d
dtrace -n 'fbt::fop_read:entry /stringof(args[0]->v_op->vnop_name) == "zfs"/ {
    self->start = timestamp; } fbt::fop_read:return /self->start/ {
    @[execname, "ns"] = quantize(timestamp - self->start); self->start = 0; }'
    
dtrace -qn '
  ::fop_fsync:entry /args[0]->v_path != NULL/ {
    self->p = args[0]->v_path;
    self->tick = timestamp;
  }
  ::fop_fsync:return /self->p/ {
    this->delta = timestamp - self->tick;
    @lat[execname, stringof(self->p)] = quantize(this->delta);
    @[execname, stringof(self->p)] = count();
    self->tick = 0; self->p = 0;
  }
  END {
    printa("prog: %s path: %s count: %@d\n%@d\n", @, @lat);
  }'