#!/usr/local/bin/gawk -f
#
# linestat.awk
#@ perform statistic on a certain number of columns of each line
#
#
#
# linestat.awk col=1,6 table.txt
# linestat.awk col=1-6 table.txt
# linestat.awk col=1-3,5- table.txt   # columns 1,2,3,5,.. until last
# command line options
# [-v] col=3,5,7   take values from column 3,5,7
#                  warning: value of col will be evaluated before each
#                  new file, but only the last value will be used
#
#      col=2-      take columns from 2 to last field.
#                  without col, all columns of the line are used
#
# [-v] format="%.3f"  output format
#
# [-v] q=90        : additionally show 90% quantile
# [-v] countlo=1.0 : count number of values lower than 1.0
# [-v] counthi=5.0 : count number of values higher than 5.0
# [-v] echo=0|1    : echo fields from 1 to (not including) first in col
#                    (on unless set to 0)
# [-v] echoall=0|1 : echo all fields
# [-v] echoend=0|1 : append the complete input line after " # "
# [-v] verbose=1   : print every processed input line as "proc:..."
#
#
BEGIN {
    stderr="/dev/stderr"
    progname="linestat.awk"
    if ( ARGV[1]~/^(h|help)$/ || help!="" ) {
        #print ARGV[0]
        # because of the which command, help is only available if
        # this script is in the PATH
        # The help text is every line of this file that starts with '#',
        # up to the BEGIN line.
        system("gawk '/^#/{print}/^BEGIN/{exit}' `which "progname"`")
        help=1
        exit
    }
}

# Column setup.  Runs for every record because an open range such as "2-"
# depends on the NF of the current line.
#FNR==1
{
    # Without col the selection used to stay empty, which left the echo
    # loop in the main rule without a stop condition; default to all fields.
    ccol=procrange((col ? col : "1-"),NF)
    #print NR":columns",ccol > stderr
    # Start from a clean reverse map so columns selected for an earlier line
    # (or by an earlier col value) do not leak into this one.
    delete invcc
    nc=split(ccol,cc,/,/)
    for (i=1;i<=nc;i++) invcc[cc[i]]=i
    if (!format) format=" %.3f"
    #if (head) {
    #    nh=split(head,prhh,",")
    #    for (i=1;i<=nh;i++) invhead[prhh[i]]=i
    #}
}

# Comment lines of the input are reported right away and not analysed.
/^#/ { print"skip:"$0;next}

# Main rule: compute the statistics of the selected columns of one line and
# store the formatted result in kstr[]; the results are printed in END.
{
    if (verbose>0)print "proc:"$0
    delete cdata
    delete s
    delete m
    c=0
    #print nc,NF,cc[nc],col,ccol>stderr
    # Collect the selected fields; stops at the first column beyond NF.
    for(i=1;i<=nc && cc[i]<=NF;i++) {
        cdata[i]=$(cc[i])
        s[i]+=$(cc[i])      # was "+-", an expression without effect
        #print "i",i >stderr
    }
    str=""
    rms[c]=stdev(cdata)
    ave[c]=mean(cdata)
    maxx[c]=maxarr(cdata)
    minn[c]=minarr(cdata)
    medi[c]=median(cdata)
    qua1[c]=perc(cdata,5)
    qua2[c]=perc(cdata,10)
    qua3[c]=perc(cdata,25)
    qua4[c]=perc(cdata,50)
    qua5[c]=perc(cdata,75)
    ninety[c]=perc(cdata,90)
    ninetyfive[c]=perc(cdata,95)
    ncc=array_size(cdata)
    # Layout: ave +/- rms [n] min ..[ median ].. max
    fstr=format" +/- "format" [%i] "format" ..[ "format" ].. "format
    # Echo the fields in front of the first selected column.  The i<=NF
    # bound guarantees termination when no selected column is found.
    if (echo=="" ||echo>0)
        for (i=1;i<=NF && !(i in invcc);i++) str=str""sprintf(" %s",$i)
    if (echoall) {
        # continues from wherever i was left by the loop above
        for (;i<=NF;i++) str=str""sprintf(" %s",$i)
        str=str" stat:"
    }
    str=str""sprintf(fstr,ave[c],rms[c],ncc,minn[c],medi[c],maxx[c])
    if (q) str=str""sprintf(" "format" ["q+0"%%]",perc(cdata,q))
    if (countlo) str=str""sprintf(" n<%s: %i",countlo,count_lo(cdata,countlo) )
    # label used to read "n<" although values above counthi are counted
    if (counthi) str=str""sprintf(" n>%s: %i",counthi,count_hi(cdata,counthi) )
    if (echoend) { str=str" # "$0 }
    kstr[++nstr]=str
    # empty input lines stay empty in the output
    if (NF==0) kstr[nstr]=""
}

END{
    if (help==1) exit
    for(n=1;n<=nstr;n++) print kstr[n]
    exit

    # NOTE(review): everything below is unreachable because of the exit
    # above.  It reads cmax, data[] and r, none of which is assigned in the
    # code visible here.  Kept unchanged apart from layout.
    for ( c=1;c<=cmax;c++ ) {
        if (!col || (col && (c in invcc))) prcol[c]++
    }
    nr=r
    print ""
    for ( c=1;c<=cmax;c++ ) {
        delete rdata
        for ( r=1;r<=nr;r++) { cdata[r]=data[c,r] }
    }
    for (i in s) m[i]=( length(s[i]) > m[i] ? length(s[i]): m[i] )
    # for (i=1;i in s;i++)
    #     if(i in prcol) printf " %*s",m[i],"###"
    #     else printf " %*s",m[i]," "
    # print ""
    printf "### \n"
    printf "sel "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",substr("-----------",1,m[i])
    print ""
    printf "ave "
    for (i=1;i in s;i++) if(i in prcol) printf " %12.4f",ave[i]
    print ""
    printf "rms "
    for (i=1;i in s;i++) if(i in prcol) printf " %12.4f",rms[i]
    print ""
    printf "min "
    for (i=1;i in s;i++) if(i in prcol) printf " %12.4f",minn[i]
    print ""
    printf "max "
    for (i=1;i in s;i++) if(i in prcol) printf " %12.4f",maxx[i]
    print ""
    printf "med "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",medi[i]
    print ""
    printf "%s"," 5% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",qua1[i]
    print ""
    printf "%s","10% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",qua2[i]
    print ""
    printf "%s","25% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",qua3[i]
    print ""
    printf "%s","50% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",qua4[i]
    print ""
    printf "%s","75% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",qua5[i]
    print ""
    printf "%s","90% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",ninety[i]
    print ""
    printf "%s","95% "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",ninetyfive[i]
    print ""
    printf "sum "
    for (i=1;i in s;i++) if(i in prcol) printf " %12s",s[i]
    print ""
    # printf "wid "
    # for (i=1;i in s;i++) printf " %"m[i]"s",m[i];print ""
    # printf "col "
    # for (i=1;i in s;i++) printf " %"m[i]"s",i;print ""
    print " ### ----------- finished --------------- ###\n\n"
}

# absolute value
function abs(x) {return ( x+0<0 ? -x : x+0 ) }

# mean
# Returns the arithmetic mean of all elements of arr, or "" (with a message
# and the current input line on stderr) when arr is empty.
function mean(arr,   i,n,ave) {
    for ( i in arr ) { ave+=arr[i];n++ }
    if (n==0) {
        print "divide by zero in mean()" > "/dev/stderr"
        print $0 > stderr
        return ""
    }
    return ave/n
}

# weighted mean
# weight[] is indexed like arr[].  Returns "" (with a message on stderr)
# when the weights sum to zero.
function wmean(arr,weight,   i,n,sumw,ave) {
    for ( i in arr ) { ave+=arr[i]*weight[i];n++;sumw+=weight[i] }
    if (sumw==0) {
        print "divide by zero in wmean()" > "/dev/stderr"
        return ""
    }
    return ave/sumw
}

# stdev [rms]
# Root mean square deviation from the mean (divides by n, not n-1).
# Returns "" (with a message on stderr) when arr is empty.
function stdev(arr,   i,m,n,std) {
    m=mean(arr)
    for ( i in arr) n++
    for ( i in arr ) std+=((arr[i]-m)^2)
    if (n==0) {
        print "divide by zero in stdev(); n="n,$0 > "/dev/stderr"
        return ""
    }
    std=sqrt(std/n)
    return std
}

# number of elements in arr (0 for an empty array)
function array_size(arr,   i,n) {
    for ( i in arr) n++
    return n+0
}

# Returns "mean +/- stdev [n] min .. max" formatted with format
# (default " %.3f"), or "" (with a message on stderr) when arr is empty.
function stat(arr, format,i,m,n,std,min_,max_) {
    for ( i in arr ) {
        m+=arr[i];n++
        if (n==1) {min_=max_=arr[i]}
        max_=(max_>arr[i]?max_:arr[i])
        min_=(min_>arr[i]?arr[i]:min_)
    }
    if (n==0) {
        print "divide by zero in stat()" > "/dev/stderr"
        return ""
    }
    # else { print "divide by "n" in stat" }
    m=m/n
    if (!format) format=" %.3f"
    for ( i in arr ) std+=(arr[i]-m)^2
    std=sqrt(std/n)
    return sprintf(format" +/- "format" [%i] "format" .. "format,m,std,n,min_,max_)
}

# Like stat(), but returns "mean +/- stdev [n] min .. median .. max".
# The format always had a slot between min and max but no value was passed
# for it, which gawk rejects at run time; the median is supplied here.
# NOTE(review): median is the presumed intent, by analogy with the
# "min ..[ median ].. max" layout of the main rule - confirm.
function stat2(arr, format,i,m,n,std,min_,max_) {
    for ( i in arr ) {
        m+=arr[i];n++
        if (n==1) {min_=max_=arr[i]}
        max_=(max_>arr[i]?max_:arr[i])
        min_=(min_>arr[i]?arr[i]:min_)
    }
    if (n==0) {
        print "divide by zero in stat2()" > "/dev/stderr"
        return ""
    }
    # else { print "divide by "n" in stat" }
    m=m/n
    if (!format) format=" %.3f"
    for ( i in arr ) std+=(arr[i]-m)^2
    std=sqrt(std/n)
    # built inline: the global fstr belongs to the main rule
    return sprintf(format" +/- "format" [%i] "format" .. "format" .. "format,
                   m,std,n,min_,median(arr),max_)
}

# largest element of arr, "" for an empty array
# A seen flag marks the first element, so that an extreme of 0 is not
# mistaken for "not yet set".
function maxarr(arr,   i,max_,seen) {
    for (i in arr)
        if (!seen || max_<arr[i]) { max_=arr[i]; seen=1 }
    return max_
}

# smallest element of arr, "" for an empty array
function minarr(arr,   i,min_,seen) {
    for (i in arr)
        if (!seen || min_>arr[i]) { min_=arr[i]; seen=1 }
    return min_
}

# Sort dat[1..ni] in place, ascending (bubble sort).  ni is counted when it
# is not given.  Returns 1.
function sort(dat,ni,   i, f) {
    if (!ni) for ( i in dat ) ni++
    do {
        f=1
        for (i=1;i<ni;i++) {
            if (dat[i]>dat[i+1]) { f=0;swap(dat,i,i+1) }
        }
    } while (f==0)
    return f
}

# exchange the values of dat[a] and dat[b]
# Formerly named "switch"; gawk reserves that word for its switch
# statement, so a function of that name cannot be defined.
function swap(dat, a, b,   t) { t=dat[a];dat[a]=dat[b];dat[b]=t}

# median [ middle value ]
function median(arr,   atmp,n,i) {
    #delete atmp
    #for ( i in arr) { atmp[++n]=arr[i] }
    #sort(atmp,n)
    #print "sorting"
    #asort(arr)
    #print "sorting finished"
    return perc(arr,0,0.5)
}

# count number of elements < lo
function count_lo(arr,lo,   n,i) {
    for (i in arr) if (arr[i]<lo) n++
    return n+0
}

# count number of elements > hi
function count_hi(arr,hi,   n,i) {
    for (i in arr) if (arr[i]>hi) n++
    return n+0
}

# percentile [ generalized median ]
# Interpolates linearly between the two neighbouring values of the sorted
# copy of arr; arr itself is left untouched.
function perc(arr,p   ,f,l,r,atmp,n,i,lf,ddl) {
    # p = percentile in percent [25]
    # f = p but in fraction [0.25]
    # if f is given , f is taken, otherwise p.
    if (f=="") f=p/100
    delete atmp
    for ( i in arr) { atmp[++n]=arr[i] }
    asort(atmp)
    # lf is the fractional 1-based rank; l and r are the ranks below and
    # above it, ddl the fractional part used as interpolation weight
    lf=(n-1)*f+1; l=int(lf); r=n-int(n-lf)
    ddl=lf-l
    #if (verbose) print l,lf,r,ddl
    return atmp[l]+ddl*(atmp[r]-atmp[l])
}

# NOTE(review): only the head of procrange() lies inside the reviewed
# section; it is reproduced token for token and continues on the next line.
function procrange(str   ,cmax,s,v1,v2,L,M,R) {
    # translate 1-5 into 1,2,3,4,5 etc.
    if (cmax=="") cmax=NF
    if (str~/[-]$/) str=str""cmax
    if (match(str,/[0-9]+[-][0-9]*/)) {
        M=substr(str,RSTART,RLENGTH)
        L=substr(str,1,RSTART-1)
        R=substr(str,RSTART+RLENGTH)
        v1=M+0
        v2=substr(M,index(M,"-")+1)+0
        v2 = ( v2 == "" ?
cmax : v2 ) #if (v1==v2 ) print L"|"M"|"R,v1,v2,$v1,$v2>stderr if (v2-v1<0) str = L M procrange(R) else { if (v1==v2) str=L v1 R else if (v1+1==v2) str=L v1","v2 R else if (v1+1