#!/usr/local/bin/gawk -f
#
#@ get stats on values in columns of tables
#
#   tabstat.awk -v col=1,6 table.txt
#   tabstat.awk col=1,6 table.txt
#
# command line options
#   [-v] col=3,5,7   take values from column 3,5,7
#                    warning: value of col will be evaluated before each
#                    new file, but only the last value will be used
#
#
#   [-v] head=foo,bar,baz   headers for the selected columns
#
#   [-v] quant=5,10,25,50,75,85,95
#                    show quantiles (percentages in comma separated list)
#                    quant=""  : use default 5,10,25,50,75,85,90,95
#                    quant=" " : don't give quantiles
#
#   [-v] lineout     additionally print stats on a line
#
BEGIN {
    progname = "tabstat.awk"
    # "h"/"help" as first argument (or a non-empty help variable) prints the
    # leading comment header of this script and exits.
    if (ARGV[1] ~ /^(h|help)$/ || help != "") {
        # because of the which command, help is only available if
        # this script is in the PATH
        system("gawk '/^#/{print}/^BEGIN/{exit}' `which " progname "`")
        help = 1
        exit
    }
}

# First record of every input file: (re)process the options col, head and
# quant, which may have been assigned between file names on the command line.
FNR == 1 {
    if (col) {
        col = procrange(col)
        nc = split(col, cc, ",")
        for (i = 1; i <= nc; i++)
            invcc[cc[i]] = i
    }
    if (head) {
        nh = split(head, prhh, ",")
        for (i = 1; i <= nh; i++)
            invhead[prhh[i]] = i
    }
    if (quant != "") {
        nq = split(quant, qq, /,/)
    }
}

# Comment lines are skipped as data, but the last one seen is kept as a
# candidate header line (its first field is blanked before splitting).
/^#/ {
    print "skip:" $0
    $1 = ""
    $0 = $0
    chead = $0
    nh = split($0, hh)
    next
}

# Data record: store every field, accumulate per-column sums and track the
# widest value seen in each column.
{
    r++
    if (r == 1) {
        cmax = NF
        # no head= option given: fall back to the last comment line
        if (!head && nh) {
            nh = split(chead, prhh)
            for (i = 1; i <= nh; i++)
                invhead[hh[i]] = i
        }
    }
    if (NF > cmax)
        cmax = NF
    if (verbose > 0)
        print "proc:" $0

    for (i = 1; i <= NF; i++) {
        data[i, r] = $i
        s[i] += $i
        m[i] = (length($i) > m[i] ? length($i) : m[i])
    }

    headstr = ""
    for (i = 1; i <= NF; i++)
        if (i in invhead)
            headstr = headstr " " $i
}

END {
    if (help == 1)
        exit

    # prcol marks the columns to report: all of them unless col= was given
    for (c = 1; c <= cmax; c++)
        if (!col || (c in invcc))
            prcol[c]++

    nr = r
    print ""
    for (c = 1; c <= cmax; c++) {
        # cdata is the scratch array holding the current column
        delete cdata
        for (r = 1; r <= nr; r++)
            cdata[r] = data[c, r]

        if (c in prcol) {
            rms[c]    = stdev(cdata)
            ave[c]    = mean(cdata)
            maxx[c]   = maxarr(cdata)
            minn[c]   = minarr(cdata)
            medi[c]   = median(cdata)
            numelm[c] = array_size(cdata)
            if (quant == "") {
                qua1[c]       = perc(cdata, 5)
                qua2[c]       = perc(cdata, 10)
                qua3[c]       = perc(cdata, 25)
                qua4[c]       = perc(cdata, 50)
                qua5[c]       = perc(cdata, 75)
                ninety[c]     = perc(cdata, 90)
                ninetyfive[c] = perc(cdata, 95)
                eightyfive[c] = perc(cdata, 85)
            } else {
                for (qu = 1; qu <= nq; qu++)
                    qua[qq[qu], c] = perc(cdata, qq[qu])
            }
            if (lineout)
                print headstr "[col:" c "]: " stat(cdata)
        }
    }

    # the column sum may be wider than any single value
    for (i in s)
        m[i] = (length(s[i]) > m[i] ? length(s[i]) : m[i])

    printf "### "
    for (i = 1; i in s; i++)
        if (i in prhh)
            printf " %12s", prhh[i]
    print ""

    printf "sel "
    for (i = 1; i in s; i++)
        if (i in prcol)
            printf " %12s", substr("-----------", 1, m[i])
    print ""

    print_stat_row("n ",   numelm, " %12i")
    print_stat_row("ave ", ave,    " %12.4f")
    print_stat_row("rms ", rms,    " %12.4f")
    print_stat_row("med ", medi,   " %12s")
    print_stat_row("min ", minn,   " %12.4f")

    if (quant == "") {
        print_stat_row(" 5% ", qua1,       " %12s")
        print_stat_row("10% ", qua2,       " %12s")
        print_stat_row("25% ", qua3,       " %12s")
        print_stat_row("50% ", qua4,       " %12s")
        print_stat_row("75% ", qua5,       " %12s")
        print_stat_row("85% ", eightyfive, " %12s")
        print_stat_row("90% ", ninety,     " %12s")
        print_stat_row("95% ", ninetyfive, " %12s")
    } else if (quant ~ /[0-9]/) {
        for (qu = 1; qu <= nq; qu++) {
            printf "%-5s", qq[qu] "%"
            for (i = 1; i in s; i++)
                if (i in prcol)
                    printf " %12s", qua[qq[qu], i]
            print ""
        }
    }

    print_stat_row("max ", maxx, " %12.4f")
    print_stat_row("sum ", s,    " %12s")

    print " ### ----------- finished --------------- ###\n\n"
}

# Print one row of the result table: the label, then values[i] formatted
# with fmt for every selected column (global prcol), in column order.
function print_stat_row(label, values, fmt,    i) {
    printf "%s", label
    for (i = 1; i in s; i++)
        if (i in prcol)
            printf fmt, values[i]
    print ""
}

# absolute value
function abs(x) {
    return (x + 0 < 0 ? -x : x + 0)
}

# mean of all elements of arr; returns "" (with a message on stderr) when
# arr is empty
function mean(arr,    i, n, ave) {
    for (i in arr) {
        ave += arr[i]
        n++
    }
    if (n == 0) {
        print "divide by zero in mean()" > "/dev/stderr"
        return ""
    }
    return ave / n
}

# weighted mean: sum(arr[i]*weight[i]) / sum(weight[i]); returns "" (with a
# message on stderr) when the weights sum to zero
function wmean(arr, weight,    i, n, sumw, ave) {
    for (i in arr) {
        ave += arr[i] * weight[i]
        n++
        sumw += weight[i]
    }
    if (sumw == 0) {
        print "divide by zero in wmean()" > "/dev/stderr"
        return ""
    }
    return ave / sumw
}

# stdev [rms]: square root of the mean squared deviation from the mean
# (divides by n, not n-1); returns "" when arr is empty
function stdev(arr,    i, m, n, std) {
    m = mean(arr)
    for (i in arr)
        n++
    for (i in arr)
        std += ((arr[i] - m) ^ 2)
    if (n == 0) {
        print "divide by zero in stdev()" > "/dev/stderr"
        return ""
    }
    std = sqrt(std / n)
    return std
}

# number of elements in arr
function array_size(arr,    i, n) {
    for (i in arr)
        n++
    return n
}

# One-line summary "mean +/- stdev [n] min .. max"; each number is
# formatted with `format` (default " %.3f"). Returns "" when arr is empty.
function stat(arr, format,    i, m, n, std, min_, max_, return_value) {
    for (i in arr) {
        m += arr[i]
        n++
        if (n == 1)
            min_ = max_ = arr[i]
        max_ = (max_ > arr[i] ? max_ : arr[i])
        min_ = (min_ > arr[i] ? arr[i] : min_)
    }
    if (n == 0) {
        print "divide by zero in stat()" > "/dev/stderr"
        return ""
    }
    m = m / n
    if (!format)
        format = " %.3f"
    for (i in arr)
        std += (arr[i] - m) ^ 2
    std = sqrt(std / n)
    return_value = sprintf(format " +/- " format " [%i] " format " .. " format, m, std, n, min_, max_)
    return return_value
}

# largest element of arr (uninitialized result for an empty array).
# The first element seeds the result, so that a maximum of 0 is not
# mistaken for "nothing seen yet".
function maxarr(arr,    i, max_, seen) {
    for (i in arr)
        if (!seen++ || max_ < arr[i])
            max_ = arr[i]
    return max_
}

# smallest element of arr (uninitialized result for an empty array).
# The first element seeds the result, so that a minimum of 0 is not
# mistaken for "nothing seen yet".
function minarr(arr,    i, min_, seen) {
    for (i in arr)
        if (!seen++ || min_ > arr[i])
            min_ = arr[i]
    return min_
}

# In-place ascending bubble sort of dat[1..ni]; ni is counted from the
# array when it is not supplied.
function sort(dat, ni,    i, f) {
    if (!ni)
        for (i in dat)
            ni++
    do {
        f = 1
        for (i = 1; i < ni; i++) {
            if (dat[i] > dat[i + 1]) {
                f = 0
                swap(dat, i, i + 1)
            }
        }
    } while (f == 0)
    return f
}

# exchange the values of dat[a] and dat[b]
# (formerly named "switch", which is a reserved word in current gawk)
function swap(dat, a, b,    t) {
    t = dat[a]
    dat[a] = dat[b]
    dat[b] = t
}

# median [ middle value ]
function median(arr) {
    return perc(arr, 0, 0.5)
}

# percentile [ generalized median ]
#   p = percentile in percent [25]
#   f = p but in fraction [0.25]
#   if f is given, f is taken, otherwise p.
# The values are sorted with gawk's asort() and the result is linearly
# interpolated between the two neighbouring ranks.
function perc(arr, p, f,    l, r, atmp, n, i, lf, ddl) {
    if (f == "")
        f = p / 100
    delete atmp
    for (i in arr)
        atmp[++n] = arr[i]
    asort(atmp)
    lf = (n - 1) * f + 1     # fractional rank, 1-based
    l = int(lf)              # rank just below
    r = n - int(n - lf)      # rank just above
    ddl = lf - l
    return atmp[l] + ddl * (atmp[r] - atmp[l])
}

# NOTE(review): the remainder of procrange() lies beyond the visible part of
# this file; the visible fragment below is reproduced unchanged.
function procrange(str ,s,v1,v2,L,M,R) {
    # translate 1-5 into 1,2,3,4,5 etc.
    if (match(str,/[0-9]+\-[0-9]+/)) {
        M=substr(str,RSTART,RLENGTH)
        L=substr(str,1,RSTART-1)
        R=substr(str,RSTART+RLENGTH)
        v1=M+0
        v2=substr(M,index(M,"-")+1)+0
        if (v2-v1<0) str = L M procrange(R)
        else {
            if (v1==v2) str=L v1 R
            else if (v1+1==v2) str=L v1","v2 R
            else if (v1+1