#!/usr/local/bin/gawk -f # # SYNTAX: # # xpk2peaks.awk [ options ] file[s] # # Written by: Eiso AB # # 18-12-2003 # # convert xpk files to xeasy peakslist # # Use: # xpk2peaks.awk -v f=Hh 2d.prot 2d.xpk > 2d.peaks # xpk2peaks.awk -v f=ChH c13.prot c13u2.xpk > c13.peaks # xpk2peaks.awk -v f=ChH protein.seq c13.prot c13u2.xpk > c13.peaks # ## # commandline options # # -v f=HhN : cyana type format description [ defines *input* data] # -v pkinc=1 [def=1] : peak number increment (xeasy can't have 0] # -v verbose=0,1,2 : be verbose # -v compact=0 : compact {HB1 HB2} assignments into {HB#} # -v prec=3 : print precision for ppm-values # -v dom=1,2,3,4 : read in domains differently [doesn't seem to work] # # -v takehei=0,1 : if 1 take height instead of volume # BEGIN { resstr="ALA ARG ARG+ ASN ASP ASP- CYS CYSS GLN GLU GLU- " resstr=resstr "GLY HIS HIST HIS+ ILE LEU LYS LYS+ MET PHE " resstr=resstr "PRO SER THR TRP TYR VAL ADE RADE CYT RCYT " resstr=resstr "GUA RGUA THY URA PL NL LL LL2 LL5 LP LN PLM " resstr=resstr "NLM LLM LLM2 LLM5 LPM LNM LGLY " resstr=tolower(resstr) NRES=split(resstr,RES) for ( i in RES ) RRES[RES[i]]++ stderr="/dev/stderr" progname="xpk2peaks.awk" #for ( i in ARGV ) if (ARGV[i]=="help") help=1 if ( ARGV[1]~/(^(h|help)$)/ || help!="" ) { print ARGV[0] # because of the which command, help is only available if # this script is in the PATH system("gawk '/^#/{print}/^BEGIN/{exit}' `which "progname"`") help=1 exit } if (pkinc=="") pkinc=1 if (prec=="") prec=3 if (f!="") { cyanaformat=f ; f="" } } FNR==1 { if (FILENAME ~ /[.]seq$/) filetype = "seq" else if (FILENAME ~ /[.]prot$/) filetype = "prot" else if (FILENAME ~ /[.]xpk$/) filetype = "xpk" else filetype="" } FILENAME ~ /[.]seq$/ { seq = FILENAME if (NF==2) resnum=$2 ; else resnum++ residue[resnum]=toupper($1) next } filetype == "prot" { if ($1~/^#/) next resnum=$5 atnam=toupper($4) atid=$5"."$4 atnum[atid]=$1 atppm[atid]=$2 atdppm[atid]=$3 # print atid if (resnum in residue) { if ($6=="#" && NF>6) { tres=$7 if (residue[resnum]!=tres) { print "## warning: different residues names for",resnum":",residue[resnum],tres } } } else { if ($6=="#" && NF>6) residue[resnum]=$7 } } filetype=="prot" && $1!~/#/ { if ($2-900>0) { $2="unassigned" ;if (!keepunassigned) next } tid=$5 " - " $4 if ($6=="#" && ( tolower($7) in RRES )) residue[$5]=$7 atomshift[tid,++nmeas[tid]]=$2 if ($3+0!=0) atomshifterr[tid,nmeas[tid]]=$3 #print tid,atomshift[tid,nmeas[tid]],"-",FILENAME # store freq with filename pkey=tid SUBSEP atomshift[tid,nmeas[tid]] ppmsource[pkey]=ppmsource[pkey] " " FILENAME next } # read NRMView peak files [.xpk] only new format filetype == "xpk" { old=$0 peaksource=FILENAME if (FNR==3) datasource=$0 if (FNR<=6) { if (FNR==1) { print "# reading .xpk peaklist :",FILENAME >stderr # re-evaluate dom (be)for(e) every new file if (!dom) dom="1,2,3,4" else print "## setting dom to:",dom > stderr split(dom,dd,",") olddom=dom } if (FNR==2) ndim=split($0,dimname) if (FNR==3) spec=$1 if (FNR==4 || FNR==5) { gsub(/[{}]/,"") if (FNR==4) nf=split($0,specwid) else if (FNR==5) nf=split($0,specfreq) if (nf!=ndim) print "##error:",$0,";"ndim,"dimensions" > stderr } if (FNR==6) { expectfields=NF+1 d=0;for (i=1;i<=NF;i++) if ($i~/[.]L$/) labelcol[++d]=i+1 d=0;for (i=1;i<=NF;i++) if ($i~/[.]P$/) ppmcol[++d]=i+1 d=0;for (i=1;i<=NF;i++) if ($i~/[.]W$/) ppmwidcol[++d]=i+1 d=0;for (i=1;i<=NF;i++) if ($i~/[.]B$/) ppmboxcol[++d]=i+1 # d=0;for (i=1;i<=NF;i++) if ($i~/[.]E$/) ppmerrcol[++d]=i+1 # d=0;for (i=1;i<=NF;i++) if ($i~/[.]J$/) ppmjcocol[++d]=i+1 for (i=1;i<=NF;i++) if ($i=="vol") intcol=i+1 for (i=1;i<=NF;i++) if ($i=="int") volcol=i+1 for (i=1;i<=NF;i++) if ($i=="stat") statcol=i+1 for (i=1;i<=NF;i++) if ($i=="comment") commentcol=i+1 } next } # remove spaces from assignments $0=unspaceass($0);$1=$1 # $0 changed, so NF is no longer correct!!!! (not updated) nf=split($0,ss) if (expectfields!=nf && nf>0 ) { print "## error ",expectfields,"fields expected",nf"[",NF"]","found in line",FNR,":\n"$0 > stderr error++ exit } pknum=$1 if (verbose>1) print " peak nr:",pknum >stderr ppkk[++pk]=pknum for ( d=1;d<=ndim;d++) if (!seldom || seldom==d) { if (d in labelcol) col=labelcol[d] else { print "## no label for dim",d > stderr for (e in labelcol) print "dim",e,labelcol[e] > stderr print "something wrong; exit" > stderr exit } # rtid=$col ;sub(/^[{]/,"",rtid);sub(/[}]$/,"",rtid) if (expandwildcards) { if (rtid != expwc(rtid)) { if (!outexp++) { print " expanding wildcards like in:" >stderr print rtid,expwc(rtid) >stderr } rtid = expwc(rtid) } } # now split different assignments nt=split(rtid,ttid,"|") # count nr of *different* assignments (there could be doubles) delete invttid for (m in ttid) { tid=ttid[m] invttid[tid]++ gsub(/NULL/,"?",tid) } dcountass=0 ; for (m in invttid) tttid[++dcountass]=m # treat case like {HB1 HB2} in xpk files -> {HB#} hnew="" if (compact>0 && dcountass==2) { hxa1=tttid[1];hxa=substr(hxa1,1,length(hxa1)-1) hxb2=tttid[2];hxb=substr(hxb2,1,length(hxb2)-1) if ( hxa==hxb && hxa1~/1$/ && hxb2~/2$/ ) hnew=hxa"#" else if ( hxa==hxb && hxa1~/2$/ && hxb2~/1$/ ) hnew=hxa"#" #if (verbose>1) print "| ",pknum,hxa,hxb,hxa1,hxb2,":"hnew > stderr if (compact>0 && hnew!="") { delete invttid invttid[hnew]=2 if (verbose>1) print "##",rtid,"->"hnew >stderr } #count again dcountass=0 ; for (m in invttid) dcountass++ } if (verbose>1) print pknum,"res id",rtid,":",dcountass,"assignment(s)" > stderr # keepambiguous can be set to !="" # hnew is e.g. hb# if asg. is {hb1 hb2} if (dcountass==1 ) { for (atid in invttid) { if (atid ~ /[0-9]+[.][a-zA-Z0-0]+[#*%]?/) { nasg[pknum,d]=atid resnum= ( atid+0"" in residue ? residue[atid+0""] : "-" ) sub(/[.]/," "resnum" ",atid) # "59.hn" to "59 - hn" tt=atid if ( gsub(/[{}]/,"",atid) > 0 ) { print "# removing '{}' : strange, should be done already" > stderr print "#["tt"]",rtid > stderr } atomshift[atid,++nmeas[atid]]=$(ppmcol[d]) # print atid,atomshift[atid,nmeas[atid]],"#",FILENAME,"pknr:",pknum # store freq with filename pkey=tid SUBSEP atomshift[atid,nmeas[atid]] ppmsource[pkey]=ppmsource[pkey] " " FILENAME "[nr:" pknum "]" } else if (atid!~"[{]*?[}]*" ) { print "##warn: skipping assignment! not sure how to interpret: \""atid"\"" > stderr print "## in ",FILENAME,":\n## ["old"]" > stderr } } } else if (rtid!="?") { if (verbose) print "not processed ---",pknum,rtid,d,dcountass > stderr } ppm=pkppm[pknum,d]=$(ppmcol[d]) hei=pkhei[pknum]=$(intcol) vol=pkvol[pknum]=$(volcol) linewidth[pknum,d]=$(ppmwidcol[d]) boxwidth[pknum,d]=$(ppmboxcol[d]) #if (pknum % 20 == 0 ) print pknum,d,linewidth[pknum,d]>stderr if (d in maxppm) maxppm[d] = ( ppm > maxppm[d] ? ppm : maxppm[d] ) else maxppm[d]=ppm if (d in minppm) minppm[d] = ( ppm > minppm[d] ? ppm : minppm[d] ) else minppm[d]=ppm if (dcountass==1) { resnam="" atid=( pknum SUBSEP d in nasg ? nasg[pknum,d]: "") #print pknum,"###",atid >stderr ns=split(atid,aa,"[.]") ; nres=aa[1];atnam=aa[2] if (nres in residue) resnam=residue[nres] #print atid,nres,":",resnam if (resnam) { newnam=atnam newnam=to_pseudo(resnam,newnam) newnam=to_cyana(resnam,newnam) newnam=wc_to_pseudo(resnam,newnam) if (atnam!=newnam && verbose>1) print "##",nres,atnam," -> ",newnam > stderr protnam[atid]=nres "." newnam } # remove asg[pknum,d]=atid } } for (d=1;d<=ndim;d++) { if (verbose>1) print "##",pknum,d,asg[pknum,d] > stderr } } END { if (help) exit if (error) exit # re-evaluate dom (be)for(e) every new file if (dom==olddom) {} else if (dom=="") dom="1,2,3,4" else print "## setting dom to:",dom > stderr split(dom,dd,",") for ( i in nmeas ) nnmm++ if (!nnmm) { print "# no shift information found" > stderr } if (f!="") { cyanaformat=f ; f="" } # determine cyanaformat if (!cyanaformat) { rNH=0.101329120 rCH=0.251450201 rNC=rNH/rCH dec=10 iNH=int(dec/rNH)/dec iCH=int(dec/rCH)/dec iNC=int(dec/rNC)/dec cyafstr="" for (d=1;d<=ndim;d++) { if (dimname[d]~/H/) nucname[d]="H" else if (dimname[d]~/D/) nucname[d]="H" else if (dimname[d]~/N/) nucname[d]="N" else if (dimname[d]~/C/) nucname[d]="C" sw=specwid[d] if (sw>350 && sw<1100 ) nucname[d]="H" if (sw>350*rCH && sw<1100*rCH) nucname[d]="C" if (sw>350*rNH && sw<1100*rNH) nucname[d]="N" names[nucname[d]]++ if (d>1 && nucname[d]=="H" && names["H"]>1) nucname[d]=tolower(nucname[d]) cyafstr=cyafstr nucname[d] } # determine format; not used now lo["H"] = 0 ; up["H"] = 10 lo["HC"] = 0 ; up["HC"] = 6 lo["HN"] = 6 ; up["HN"] = 10 lo["C"] = 10 ; up["C"] = 60 lo["CO"] = 167 ; up["CO"] = 182 lo["N"] = 100 ; up["N"] = 142 for (d=1;d<=ndim;d++) { if (dimname[d]~/H/) nucname[d]="H" else if (dimname[d]~/D/) nucname[d]="H" else if (dimname[d]~/N/) nucname[d]="N" else if (dimname[d]~/C/) nucname[d]="C" } cyanaformat=cyafstr } print "# Number of dimensions ",ndim print "#CYANAFORMAT",cyanaformat if (peaksource) print "#PEAKSOURCE",peaksource if (datasource) print "#DATASOURCE",datasource af="" ; for (d=1;d<=ndim;d++) af=af " %4i" pf="%4i %s 6 T %9.3e %8.2e m 0 %s\n" #for ( i in nasg) print i,nasg[i] specdim=ndim for (d=1;d<=ndim;d++) { delete data for (p=1;p<=pknum;p++) { if (p SUBSEP d in linewidth) data[++i]=linewidth[p,d] #print p,d,i,data[i]>stderr } acc[d]=median(data)+0 printf "#LINEWIDTH %i MIN MEDIAN MAX %.3f %.3f %.3f %s[n=%i]\n", d,minarr(data)+0,median(data)+0,maxarr(data)+0,lwunit,array_size(data) delete data } printf "#LINEWIDTH PPM" for (d=1;d<=ndim;d++) printf " %.3f",acc[d] ; print "" for (d=1;d<=ndim;d++) { if ((d in acc) && acc[d]!=0.0) ppmtol[d]=acc[d]/2 else if (dimname[d]~/H|D/) { ppmtol[d]=defppmHtol #print "HHH" } else if (d in acc) { ppmtol[d]=defppmXtol #print "XXX" } else ppmtol[d]=999 } printf "#TOLERANCE" for (d=1;d<=ndim;d++) printf " %.3f",ppmtol[d] ; print "" for (i=1;i<=pk;i++) { pknum=ppkk[i] sasgs=asgs=ppms="" for (d=1;d<=ndim;d++) { protid=atid="" ppms=ppms sprintf(" %6.*f",prec,pkppm[pknum,d]) if (!(pknum SUBSEP d in nasg)) { if (verbose>1) print "peak",pknum,"not assigned in dom=",d >stderr } else if (nasg[pknum,d]=="") { if (verbose) print "peak",pknum,"assigned to NULL in dom=",d >stderr } else { if (verbose>1) print "nasg["pknum","d"]=",nasg[pknum,d] >stderr atid = nasg[pknum,d] } #if (pknum SUBSEP d in nasg) atid = nasg[pknum,d] if ( atid in protnam && atid!="") protid = protnam[atid] if ( atid=="") { if (verbose>1) print "atomname undefined for peak",pknum,"dom=",d >stderr } else if ( protid in atnum ) { atid=protid } else if (atid in atnum) { print "peak",pknum,"changed name ",protid,"not in protlist, but", atid,"is["atnum[atid]"]" > stderr } else { if (atid!="") print "peak",pknum,", atomnumber not found ["atid"]",protid >stderr } asgs=asgs sprintf(" %4i",(atid in atnum ? atnum[atid]: 0)) sasgs = sasgs " " residue[atid+0] "-"\ (pknum SUBSEP d in nasg ? nasg[pknum,d] : "" ) #print nasg[pknum,d] } asgs=asgs " 0" #" " sasgs if (takehei) { pksize=pkhei[pknum] } else { pksize=pkvol[pknum] } printf pf,pknum+pkinc,ppms,pksize,0,asgs } } function mean(dat, i, n, m) { for ( i in dat ) { m+=dat[i];n++ } if (n==0) { print "#", m, n;return m} return m/n } # median [ middle value ] function median(arr) { return perc(arr,0,0.5) } # percentile [ generalized median ] function perc(arr,p ,f,l,r,atmp,n,i) { # p = percentile in percent [25] # f = p but in fraction [0.25] # if f is given , f is taken, otherwise p. if (f=="") f=p/100 delete atmp for ( i in arr) { atmp[++n]=arr[i] } asort(atmp) lf=(n-1)*f+1; l=int(lf); r=n-int(n-lf) ddl=lf-l #if (verbose) print l,lf,r,ddl return atmp[l]+ddl*(atmp[r]-atmp[l]) } function sort(dat,ni, i, j) { if (ni!=1) do { j=1; for (i=1;idat[i+1]) { j=0;switch(dat,i, i+1) } } } while (j==0) } function switch(dat, a, b, t) { t=dat[a];dat[a]=dat[b];dat[b]=t } function rmsdev(dat,m, i,out) { # returns root-mean-square deviation of dat from m ; # returns -1 if dat has no elements. for ( i in dat ) out+=sqr(m-dat[i]) return (size(dat)>0 ? sqrt(out/size(dat)) : -1) } function size(dat, i,ni) { # return number of elements of arr : -1 if no elements for ( i in dat ) ni++ return ( ni ? ni : -1) } function sqr(a) { return a*a } function unspaceass(str, lstr,mstr,rstr) { if (AMBISEP=="") AMBISEP="|" while (match(str,/[{][^{}]*[ \t]+[^{}]*[}]/)) { lstr=substr(str,1,RSTART-1) mstr=substr(str,RSTART,RLENGTH) rstr=substr(str,RSTART+RLENGTH) gsub(/[ \t]+/,AMBISEP,mstr) str=lstr mstr rstr if (verbose>1) print "mstr",mstr > stderr } return str } function expwc(str) { if (AMBISEP=="") AMBISEP="|" oldstr=str while ( match(str,/H[NABGDEZH1-3]+[#]/)>0 ) { lstr=substr(str,1,RSTART-1) mstr=substr(str,RSTART,RLENGTH) rstr=substr(str,RSTART+RLENGTH) if (mstr~/#$/) { sub(/#$/,"",mstr) if (expandwildcards=="to1") mstr = mstr"1" else if (expandwildcards=="to2") mstr = mstr"1" else if (expandwildcards) mstr = mstr"1" sepchar mstr"2" } str = lstr mstr rstr #exit } if (verbose && oldstr!=str) print oldstr"->"str >stderr return str } function to_cyana(res,atom, outatom) { # substitute hb1->hb2 hb2->hb3 etc res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom #hb1 hb2 #resrxp="(ARG|AS[NP]|CY[PS]|GL[NU]|MET|PHE|" #resrxp=resrxp "PRO|SER|TYR|TRP|LEU|LYS|HIS|UNK|PRU)([-0-9+]?)" if (atom=="HN") outatom="H" if (res == "GLY" ) { if (atom=="HA1") outatom="HA3" } if (res != "ALA" ) { if (atom=="HB1") outatom="HB3" } #hg1 hg2 resrxp="(ARG|GL[NU]|MET|PRO|LYS|UNK|PRU)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG1") outatom="HG3" } #hg11 hg12 (ile) resrxp="(ILE|UNK|PRU)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG11") outatom="HG13" } #hd1 hd2 resrxp="(ARG|PRO|LYS|UNK|PRU)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HD1") outatom="HD3" } #he1 he2 resrxp="(LYS)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HE1") outatom="HE3" } if (outatom!=atom && verbose>1) print "## to_cyana("atom")->"outatom return outatom } function wc_to_pseudo(res,atom, outatom) { # substitute ALA HB1/2->QB res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom if (atom~/#$/) { outatom="Q" substr(atom,2,length(atom)-2) } if (outatom!=atom && verbose>1) print "#wc_to_pseudo("atom")->"outatom > stderr return outatom } function to_pseudo(res,atom, outatom) { # substitute ALA HB1/2->QB # PHE HD1/2->QD # # use only for equivalent CH3 and aromatic ring protons # res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom #if (resat~/ALA/)print "########",resnum,resat > stderr if (res ~ /^ALA/ && atom ~ /^HB[123#]*$/ ) outatom="QB" else if (res ~ /^VAL/ && atom ~ /^HG1[123#]?$/) outatom="QG1" else if (res ~ /^VAL/ && atom ~ /^HG2[123#]?$/) outatom="QG2" else if (res ~ /^LEU/ && atom ~ /^HD1[123#]?$/) outatom="QD1" else if (res ~ /^LEU/ && atom ~ /^HD2[123#]?$/) outatom="QD2" else if (res ~ /^ILE/ && atom ~ /^HG2[123#]?$/) outatom="QG2" else if (res ~ /^ILE/ && atom ~ /^HD1[123#]?$/) outatom="QD1" else if (res ~ /^THR/ && atom ~ /^HG2[123#]?$/) outatom="QG2" else if (res ~ /^MET/ && atom ~ /^HE[123#]?$/ ) outatom="QE" else if (res ~ /^LYS/ && atom ~ /^HZ[123#]?$/ ) outatom="QZ" else if (res ~ /^(TYR|PHE)/ && atom ~ /^HD[12#]?$/ ) outatom="QD" else if (res ~ /^(TYR|PHE)/ && atom ~ /^HE[12#]?$/ ) outatom="QE" if (outatom!=atom && verbose>1) print "#_to_pseudo("atom")->"outatom > stderr atom"->"outatom return outatom } function maxarr(arr, i,max_) { for (i in arr) max_ = (max_arr[i] || !min_ ? arr[i]:min_) ;return min_ } function array_size(arr, i,n) { for ( i in arr) n++ return n }