#!/usr/local/bin/gawk -f # #@ filter peaklist for diagonal,water,lowest/highest intensities # Usage: # filterpeaks.awk diagHskip=0.8 \ # waterHskip=1 waterskip=0.02 hH_2.peaks > hH_2-f.peaks # # Command line options: # diagHskip=0.03 - skip proton diagonal # diagCskip=0.3 - skip carbon diagonal # # diagHoffsetskip=0.24,0.38 # - skip peaks that have 0.24<|ppmx-ppmy|<0.38 # for removal of sidebands # # waterHskip=0.3 - skip protons closer than 0.3 to water ppm in acquisition # waterskip=0.02 - skip protons closer than 0.03 to water ppm in any proton dimension. # # The script uses # waterppm=4.7 : set the ppm-value of water # keepass=1 [def=1] : do not filter out fully assigned peaks # keepass=2 [def=1] : do not filter out partially assigned peaks # keepass=0 [def=1] : ignore assignments # # Experimental: # skiplo=10% : skip 10% lowest peaks # skiphi=10% : skip 10% lowest peaks # skiplo=10 : skip peaks with int/height <= 10 # skiphi=10 : skip peaks with int/height >= 10 # # BEGIN { stderr="/dev/stderr" if(waterppm=="") waterppm=4.7 if(keepass=="") keepass=1 progname="filterpeaks.awk" if ( ARGV[1]~/^(h|help)$/ || help!="" || ARGC<2) { help=1 exit print ARGV[0] # because of the which command, help is only available if # this script is in the PATH system("gawk '/^#/{print}/^BEGIN/{exit}' `which "progname"`") } } /^#CYANAFORMAT/ { if (!format) format=$2 else print "... read ("$0") but keeping format:",format > stderr dim_H=index(format,"H") dim_h=index(format,"h") dim_C=index(format,"C") dim_c=index(format,"c") dim_N=index(format,"N") dim_n=index(format,"n") } /^# *Number of dimensions/ { ndim=$NF } $1~/^#/ && peakline==""{ print;next } { p=$1 peakline++ if ($1~/^#/) { outstr[peakline]=$0 ; next } # store ppm values for(c=2;c<=ndim+1;c++) ppm[p,c]=$c # checkfor assignment c+=2 hei[peakline]+=$c #;print "####",hei[peakline]>stderr pkid[peakline]=p for(c=1;c<=ndim;c++) { ac=ndim+7+c if ($ac != 0) { isassigned[p]++ ; #print "asg",$ac >stderr } } if (dim_H) ppmH=$(dim_H+1) if (dim_h) ppmh=$(dim_h+1) if (dim_C) ppmC=$(dim_C+1) if (dim_c) ppmc=$(dim_c+1) if (keepass==1 && (p in isassigned) && isassigned[p]==ndim) { outstr[peakline]=sprintf("%s",$0) next } if (keepass==2 && (p in isassigned) ) { outstr[peakline]=sprintf("%s",$0) next } if (diagHskip!="" && dim_H && dim_h) { if ( abs( ppmH - ppmh ) <= diagHskip ) { outstr[peakline]=sprintf("#diagonalH %s",$0) ndiagHskip++ next } } if (diagCskip!="" && dim_C && dim_c) { if ( abs( ppmC - ppmc ) <= diagCskip ) { outstr[peakline]=sprintf("#diagonalC %s",$0) ndiagCskip++ next } } if (diagNskip!="" && dim_N && dim_n) { if ( abs( ppmN - ppmn ) <= diagNskip ) { outstr[peakline]=sprintf("#diagonalN %s",$0) ndiagNskip++ next } } if (waterHskip!="" && dim_H) { #print "waterHskip",abs( ppmH - waterppm ) if ( abs( ppmH - waterppm ) <= waterHskip ) { dpw=abs( ppmH - waterppm ) outstr[peakline]=sprintf("#close to water ppm (acq) %s <= %s %s",dpw,waterHskip,$0) nwaterHskip++ next } } if (waterskip!="" && dim_H) { #print "waterskip",abs( ppmH - waterppm ) if ( abs(ppmH - waterppm) <= waterskip ) { outstr[peakline]=sprintf("#close to water ppmH <=%s %s",waterskip,$0) nwaterskip++ next } if ( abs(ppmh - waterppm) <= waterskip ) { outstr[peakline]=sprintf("#close to water ppmh <=%s %s",waterskip,$0) nwaterskip++ next } } if (diagHoffsetskip!="" && dim_H && dim_h) { split(diagHoffsetskip,diagHoff,",") if ( diagHoff[1]>diagHoff[2] ) { print "# order of numbers wrong in diagHoffsetskip:", diagHoffsetskip > stderr } if ( abs( ppmH - ppmh ) > diagHoff[1] && abs( ppmH - ppmh ) < diagHoff[2] ) { outstr[peakline]=sprintf("#diagHoffset %s",$0) ndiagHoffsetskip++ next } } outstr[peakline]=sprintf("%s",$0) next } END { if (help==1) { print ARGV[0] # for ( i in ARGV) print i,ARGV[i] # because of the which command, help is only available if # this script is in the PATH system("gawk '/^#/{print}/^BEGIN/{exit}' `which "progname"`") exit } if (skiplo!=""||skiphi!="") { asort(hei,sorthei) for (i=1;i in sorthei;i++) { #print "sorted",i,sorthei[i],pkid[i] } #print "elements",i-1 nelm=i-1 } if (skiplo) { if (skiplo~/%$/) { oskiplo=skiplo skiplo=skiplo/100 ind=int((nelm*skiplo)+0.5) skiplo=sorthei[ind] print "# skipping lower "oskiplo" of intensities:" } print "# skipping intensities lower than "skiplo for (l=1;l<=peakline;l++) if (l in hei && hei[l]<=skiplo) { outstr[l]="# intensity"hei[l]"<="skiplo" "outstr[l] nskiplo++ } } if (skiphi) { if (skiphi~/%$/) { oskiphi=skiphi skiphi=skiphi/100 ind=int((nelm*skiphi)+0.5) skiphi=sorthei[nelm-ind] print "# skipping higher "oskiphi" of intensities:" } print "# skipping intensities higher than "skiphi for (l=1;l<=peakline;l++) if (l in hei && hei[l]>=skiphi) { outstr[l]="# intensity>="skiphi" "outstr[l] nskiphi++ } } for(l=1;l<=peakline;l++) print outstr[l] sumskip=ndiagHskip+ndiagCskip+ndiagNskip+nwaterHskip+nwaterskip+nskiplo+nskiphi printf "...skipped peaks in %s %i %s",FILENAME,sumskip,(sumskip?"due to:":"") >stderr if (ndiagHskip) printf " Hdiag:%i ",ndiagHskip>stderr if (ndiagCskip) printf " Cdiag:%i ",ndiagCskip>stderr if (ndiagNskip) printf " Ndiag:%i ",ndiagNskip>stderr if (waterHskip) printf " waterH:%i",nwaterHskip>stderr if (waterskip) printf " water(indirect dim):%i",nwaterskip>stderr if (nskiplo) printf " low-Int:%i",nskiplo>stderr if (nskiphi) printf " high-Int:%i",nskiphi>stderr printf "\n">stderr } function abs(x) { return ( x<0 ? -x : x ) }