#!/usr/local/bin/gawk -f # # put assignments from cycle*.ass in *.xpk file # # use: # addass.awk cycle6.ass -v xpkdom=1:Hh,2:CHh" \ # -v mincontr=0.99 -v maxass=2 \ # -v # 2d.xpk c13.xpk > cycle6ass.xpk # # after that: # ass2shift sh3.seq cycle6ass.xpk > cycle6ass.shifts # # and check _bad.shifts & _ass2shift.tmp for outliers # # # -v xpkdom=... [""] : use to define the order of nuclei for the xpk files # consists of comma-separated xpkfilenr:format strings # # the formatstrings can consist of a string of the following chars # H (protons acq dimension) # N (hetero atom for H ) # C (hetero atom for H ) # h (protons indirect dimension) # h (protons indirect dimension) # N (hetero atom for h ) # C (hetero atom for h ) # 'H' and 'h' should be present , only one of 'N' and 'C' and only one # of 'n' and 'c' should be present # # For example: # xpkdom="1:HNh,4:HNch" # # means: # # xpk-peaklist 1 has format HhN # thus is a 3D peaklist with # - aqc-proton(H) is in dom 1 # - het-nucleus for acq-proton is in dom 3 ( and is Nitrogen) # - indirect proton is in dom 2 # # xpk-peaklist 4 has format HNch # thus is a 4D peaklist with # - aqc-proton(H) is in dom 1 # - het-nucleus for acq-proton is in dom 2 ( and is Nitrogen) # - indirect proton is in dom 4 # - het-nucleus for indirect proton is in dom 3 ( and is Carbon) # # -v ov=0 : if defined and >1 overwrite assignments present in the xpk file # ov=0 sets keepass=1 [don't overwrite] # ov=1 sets keepass=0 [overwrite] # # -v unassign = all : if set remove all assignments present in the xpk file # -v unassign = 1 [1-4] : as above but only remove assignments in dom 1 # # # -v keepass = 1 : if defined don't overwrite assignments present in the xpk file # default : keepass=1 [don't overwrite] # # -v mincontr=0.999 : minimum of total volume contribution # required for acceptance # [ the last percentage on each assignment line # in cycle[n].ass # default : keepcharge="" # # -v maxass=1 : maximum nuber of different assignments # that will be put in the *.xpk file # default : maxass=1 # # -v keepnames=1 : if defined do not change names # from HB3 [iupac] to HB2 [pdb] # default : keepnames="" # # -v keepcharge=1 : if defined do keep charges # -v keeppseudo=0 : don't keep pseudoatoms of cyana [ convert QG2 -> HG21 ] # default : keeppseudo=0 # # -v seqrange=0 : assign only pairs with residuenumber difference # within seqrange # seqrange="" : accept all # seqrange=0 : only intra # seqrange=1 : only intra & sequential # default: seqrange="" # BEGIN { IGNORECASE=1 SUBSEP=":" stderr="/dev/stderr" if ( ARGV[1]~/(^h|help)$/ || help!="" ) { print ARGV[0] system("gawk '/^#/{print}/^BEGIN/{exit}' `which addass.awk`") help=1 exit } # pkincrement used to make *.peaks file # defaults if (pkinc=="") pkinc=1 if (mincontr=="") mincontr= 0.999 if (maxass=="") maxass=1 if (keepass=="") keepass=1 if (ov=="") { ov=0 ; keepass = (!ov) } if (AMBISEP=="") AMBISEP="|" # separator for ambiguous assignments if (xpkdom=="") xpkdom="" # strange, this should not benecessary # bug in gawk 3.1.3 use 3.1.3a1 in # /home/eiso/local/bin/gawk [27/11/2003] #if (seqrange=="") seqrange=-1 } #asserterror==1 { error++; exit } # cyana .ass files FILENAME ~/[.]ass$/ && FNR==1 && mdist=="" { if (dref=="") dref=3.4 for (pkl in peaklists) { mdist[pkl]=sdist[pkl]/ndist[pkl] for ( sp in dist ) if (index(sp,pkl SUBSEP)==1) { dist[sp] /= ( mdist[pkl] / dref ) dist[sp] = sprintf("%.3f",dist[sp]) #print pkl,sp,dist[sp] } print "## peaklist",pkl,"scale",mdist[pkl],sdist[pkl],"/",ndist[pkl] } cyc_code = FILENAME #sub(/^cycle/,"c",cyc_code) sub(/[.]ass$/,"",cyc_code) sub(/.*[/]/,"",cyc_code) } #{if ($0!=s0)print FILENAME,$0;s0=$0} FILENAME ~/[.]ass$/ && (NF==0 || FNR==1){ readnewpeak=1 #if (outfile) print "##",keeppeak,np > outfile # name assname=FILENAME sub(/[.]ass$/,"",assname);sub(/.*[/]/,"",assname) colcode=1 if (nta>1) colcode+=1 if (pkid in rejected) colcode+=2 if (disupl!="") upl=disupl if (outtype!="noes" && pstr!="" && astr!="") { # write p ... line pstr = sprintf("p %s %i %s %s %s %s %s %s", specname[spec],np,"0",qual,noevol,noevolerr,noeint,noeinterr) pstr=pstr sprintf(" %s - %s - %s - %s -",ppmX,ppmH,ppmx,ppmh) # write 'a ...' line fom=supp"|"rsup astr = sprintf("a %s %s %s %7.3f %7.3f - - - - -", colcode,fom,pweight,lol,upl) } if (keeppeak==pkid && outfile && outstr && astr ) if ( (np in sumvolcontr) && sumvolcontr[np]/100>=mincontr) { print "##",sumvolcontr[np] > outfile print pstr > outfile print astr,kept > outfile printf "%s",outstr > outfile keeppeak=0 outstr=pstr=astr="" } if (NF==0)next } FILENAME ~/[.]ass$/ && /^ +Distance limit [0-9.]+ A violated in / && readnewpeak==0 { disupl=$3 nstrviol=$7 aveviol=$10 eliminated = ( /eliminated/ ) } FILENAME ~/[.]ass$/ && /^ +Peak unassigned. *$/ && readnewpeak==0 { rejected[pkid]++ } FILENAME ~/[.]ass$/ && /^ +Peak unassigned. *$/ && readnewpeak==0 && keepall<2 { keeppeak=0 readnewpeak=1 next } FILENAME ~/[.]ass$/ && NF==1 && $1=="Peaks:" { prevnp="" } FILENAME ~/[.]ass$/ && $1=="Peak" && $NF=="ppm):" && readnewpeak==1 { readnewpeak=0 pstr=astr=outstr="" if (!firstassline++) printf "## reading %s\n",FILENAME > stderr gsub(/[(,):]/," ") $1=$1; np=$2 keeppeak=pkid q=$0 if (!prevnp || prevnp-np>0) { spec++ ass_file[spec]=FILENAME printf "\n# - %12s peaklist:%i\n",FILENAME,spec >stderr if (spec in specname) spnam=specname[spec] else spnam = "spec"spec # best guess for peaknumber difference pkincr[spec]=np } #print "SPEC:",spec >stderr # check if ppm's are the same if peaksnumbers are pkid= spec SUBSEP np pkdb[++sppk]=pkid assfilecountpeaks[spec]++ ndim=NF-3 # assdim[pkid]=ndim # keep the dimension for each peak ppmh = sprintf("%.3f",$3) ppmH = sprintf("%.3f",$4) ppmX = ( ndim>2 ? sprintf("%.3f",$5) : "-" ) ppmx = ( ndim>3 ? sprintf("%.3f",$6) : "-" ) #print "ppms",ppmh,ppmH,ppmX,ppmx,ndim > stderr allppms[pkid]= ppmH " "ppmh " "ppmX " " ppmx assppmH[pkid]=ppmH assppmh[pkid]=ppmh assppmX[pkid]=ppmX assppmx[pkid]=ppmx #print (pkid in pkppmH && pkid in pkppmh),(spec in peaklists) if (pkid in pkppmH && pkid in pkppmh) { if ( abs(pkppmH[spec,np]-ppmH)<0.01 ) ppmH=pkppmH[spec,np] else { print "ERR-H",spec,np,ppmH,$0 > stderr} if ( abs(pkppmh[spec,np]-ppmh)<0.01 ) ppmh=pkppmh[spec,np] else { print "ERR-h",spec,np,ppmH,$0 > stderr} if (ppmX!="-") if ( abs(pkppmX[spec,np]-ppmX)<0.01 ) ppmX=pkppmX[spec,np] else { print "ERR-X",spec,np,ppmX,$0> stderr } if (ppmx!="-") if ( abs(pkppmx[spec,np]-ppmx)<0.01 ) ppmx=pkppmx[spec,np] else { print "ERR-x",spec,np,ppmx,$0> stderr } } else { if (spec in peaklists) print " skipping spec:",specname[spec],np,"vol='"vol[pkid]"'", $0 > stderr } getline qual=supp=rsup=diag="" volelim=0 if (/chemical-shift based assignment/ ) { nass = $1 if (nass!=0) { if (match($0,/quality = ([0-9.]+)/,aa)>0) qual=aa[1]+0 if (match($0,/support = ([0-9.]+)/,aa)>0) supp=aa[1]+0 if (match($0,/residual support = ([0-9.]+)/,aa)>0) rsup=aa[1]+0 #if (!qual && !supp && !rsup) # print $0"\nn",nass,"q",qual,"s",supp,"r",rsup } } else if (/diagonal assignment/){ diag=1 nass=$1 keeppeak=0 } else if (/Eliminated by volume filter/){ getline if (/No tentative assignment possible/) {} else { print "line",FNR,"unexpected",$0 > stderr} volelim=1 keeppeak=0 } else { print "???",$0 > stderr next } #print "outtype",outtype > stderr if (outtype!="noes") { if (spnam=="") outfile = "spec"spec".list" else outfile = spnam"-"assname".list" } else outfile = "spec"spec".noes" #outfile="/dev/stdout" #print "#2#",$0 > outfile outstr="" pstr="" astr="" # for ( i in vol ) if (np==i) print "i",i,np,vol[i] if (np in vol) { #print np" in vol" noevol=vol[np] ; noevolerr=volerr[np] } else if ((spec SUBSEP np) in vol) { # print spec np" in vol" noevol=vol[spec , np] ; noevolerr=volerr[spec,np] } else { noevol="1.0" noevolerr="-" } if ( (spec SUBSEP np) in dist) { noeint=noevol noeinterr=noevolerr fom="-" pweight="-" upl=dist[spec,np] lol=1.8 if (outtype!="noes") { # write p ... line pstr = sprintf("p spec%i %i - - %s %s %s %s", spec,np,noevol,noevolerr,noeint,noeinterr) pstr=pstr sprintf(" %s - %s - %s - %s -",ppmX,ppmH,ppmx,ppmh) # write 'a ...' line astr = sprintf("a %s %s %s %.3f %.3f - - - - -", 6,fom,pweight,lol,upl) } } else { peaknotinpeaklist=1 } kept=0 obs=0 trans=0 nodis=0 for (l=1;l<=nass;l++) { getline assstr[l]=$0 if (/ O /) obs++ if (/ T /) trans++ if (/ * /) prevass++ } nta=0 for (l=1;l<=nass;l++) { $0=assstr[l] if (/% +kept/ || keepall) { nta++ # non trivial assignments kept++ if ( $(NF-1) ~ /^[0-9]+[.][0-9]+[%]$/ ) sumvolcontr[pkid]+=$(NF-1) else { print "#ERROR: premature exit" > stderr exit } #ass=substr($0,10,32) a1=substr($0,11,14) ; r1=substr(a1,7,4) if (!keepcharge) sub(/[-+]$/,"",r1) gsub(/ /,"",r1) n1=substr(a1,11,4) ; at1=substr(a1,1,4) if (!keepnames) at1 = from_iupac(r1,at1) xat1 = to_heavy(r1,at1) p1=n1 " " r1 " " at1 h1=n1 " " r1 " " xat1 a2=substr($0,28,14) ; r2=substr(a2,7,4) if (!keepcharge) gsub(/[-+]$/,"",r2) gsub(/[ ]+/,"",r2) n2=substr(a2,11,4) ; at2=substr(a2,1,4) if (!keepnames) at2 = from_iupac(r2,at2) xat2 = to_heavy(r2,at2) p2=n2 " " r2 " " at2 h2=n2 " " r2 " " xat2 if (!volelim) {# NB order of shifts is reversed in *.ass assignmentp1[pkid]= assignmentp1[pkid]" " (nvp1=n2"."at2) assignmenth1[pkid]= assignmenth1[pkid]" " (nvh1=n2"."xat2) assignmentp2[pkid]= assignmentp2[pkid]" " (nvp2=n1"."at1) assignmenth2[pkid]= assignmenth2[pkid]" " (nvh2=n1"."xat1) } # Quality factors qual,support,rsup,obs,trans match($0,/[(](([0-9.]+) *)+[)]/) qstr=substr($0,RSTART+1,RLENGTH-2) nq=split(qstr,qq) if ( ( nq<3 || nq>5) && !(nq==1 && diag) ) { print "ERROR (nq)",nq,diag > stderr print "$0",$0 > stderr exit } else if (nq==1 && diag) { assQ=qq[1] } else { #def obsQ=transQ=1 assQ=qq[1] supQ=qq[nq-1] rsupQ=qq[nq] if (nq==4) { if (obs>0) obsQ=qq[3] if (trans>0) transQ=qq[3] if (obs && trans) { print "ERROR obs+trans"> stderr ;exit } }OorT=qq[3] if (nq==5) { obsQ=qq[3] ; transQ=qq[4] } } # distance if ( match($0,/([0-9.]+) *[+][/][-] *([0-9.]+)/,disrms) ) { #print dstr=substr($0,RSTART,RLENGTH) if ("1" in disrms) dis=disrms[1] if (2 in disrms) rms=disrms[2] #print dis,"+/-",rms } else { nodis=1 #; print "nodis",nodis,$0 } # volume con delete vcvc if ( match($0,/([0-9.]+)% +[*] *([0-9.]+)%/,vcvc) ) { #print vcstr=substr($0,RSTART,RLENGTH) if (1 in vcvc) vcdis=vcvc[1] if (2 in vcvc) vcadd=vcvc[2] #print vcdis,"*",vcadd } else if (diag) { vcdis=vcadd=0 } else { print "["$0"]"> stderr for ( i in strass ) print "str_ass",sstrass[i] > stderr print "ERROR novcadd" > stderr ; exit } # tot vol con [ qual+dis ] vcadd*vcdis/vsum # volume con delete vcvc if ( match($0,/= *([0-9.]+)% *(kept)? *$/,vcvc) ) { #print vcstr=substr($0,RSTART,RLENGTH) if (1 in vcvc) vctot=vcvc[1] #print vctot } else if (diag) { vctot="" } else { print "["$0"]"> stderr print "ERROR novctot" > stderr exit } totQ = assQ * supQ * rsupQ * obsQ * transQ #printf "Qfac %7.2f (%4s %5s %7s %4s %4s) volcon %8s%% * %8s%% %8s%%\n", # totQ,assQ,supQ,rsupQ,obsQ,transQ,vcdis,vcadd,vctot #print "volcon",vcdis,vcadd,vctot #exit if ( (spec SUBSEP np) in dist) { if (outtype=="noes") { outstr=outstr sprintf("noe spec%i %s %s %s %s %s", spec,np,p1,ppmh,p2,ppmH) if (ndim>2) outstr=outstr " " h2 " " ppmX if (ndim>3) outstr=outstr " " h1 " " ppmx outstr=outstr sprintf(" 1.0 # %.3f",vctot) outstr=outstr sprintf(" d=%.3f",dist[spec,np]) } else { # fom,contr,dist,ddist,dback,std, outstr=outstr sprintf("c %7.4f %7.3f %5.2f %5.2f - -",totQ,vctot,dis,rms) # segid1,het1,pro1 outstr=outstr sprintf(" - %3s %3s",n2,r2) if (ndim>2) outstr=outstr " " xat2 else outstr=outstr " -" outstr=outstr " - - "at2" - - " # seqgid2,het2,pro2 outstr=outstr sprintf(" - %3s %3s",n1,r1) if (ndim>3) outstr=outstr " " xat1 else outstr=outstr " -" outstr=outstr " - - "at1" - -\n" } } else { } } # print l,kept,$0 > outfile } if (verbose>1) print "peak",pkid,"["np"]",nass,"assignments; kept",kept > stderr nassignments[pkid] = nass nasskept[pkid] = kept # gsub(/ +/," ",assignmentp1[pkid]) gsub(/ +/," ",assignmenth1[pkid]) gsub(/ +/," ",assignmentp2[pkid]) gsub(/ +/," ",assignmenth2[pkid]) gsub(/^ /,"",assignmentp1[pkid]) gsub(/^ /,"",assignmenth1[pkid]) gsub(/^ /,"",assignmentp2[pkid]) gsub(/^ /,"",assignmenth2[pkid]) if (verbose>1) print "####",assignmentp1[pkid],assignmentp2[pkid] >stderr #if (!(np % 500))print " # processing peak",np,"spec",spec > stderr npread[spec]++ if (++dnpread[spec] >= 500) { printf "# processing peak #%-5s spec:pknr %s\n", npread[spec],pkid > stderr dnpread[spec]=0 } if (verbose>1) { print "pro1",spec,np,assignmentp1[pkid]>stderr print "het1",spec,np,assignmenth1[pkid]>stderr print "pro2",spec,np,assignmentp2[pkid]>stderr print "het2",spec,np,assignmenth2[pkid]>stderr } # printf " %i>>\n%s",np,outstr > outfile prevnp=np prevspec=spec } FILENAME ~ /[.]xpk$/ { if (FNR==1) { xpk++ # this is the xpk'th xpk file xpk_code = FILENAME xpk_file[xpk]=FILENAME sub(/[.]xpk$/,"",xpk_code) outxpk = xpk_code "_" cyc_code ".xpk" printf "\n# output for spectrum %i[%s] goes to %s\n",xpk,FILENAME,outxpk > stderr } else if (FNR==2) ndim= split($0,domname) else if (FNR==6) { # get the columns with the labels c=0 for ( i=1 ;i<=NF;i++) { if ($i ~ /[.]L$/) collabel[++c]=i+1 if ($i == "comment" ) colcomment=i } assert( (c==ndim),"label dimensionality??"c"!="ndim) # format (cyanaformat) for this xpk file xpkform=getthisformat(xpkdom,xpk) if (xpkform=="") xpkform=substr("HhNn",1,ndim) printf "# reading %i-D xpk file %i: %-10s format:%-4s (from=%s)\n", ndim,xpk,FILENAME,"'"xpkform"'",xpkdom > stderr cyanaformat2dom(xpkform,"",xdom) for (i=1;i<=ndim;i++) { printf "# dom%i=%s label=%-5s \n", i,substr(xpkform,i,1),domname[i],"'"xpkdom"'" > stderr } } if (FNR<=6) { print > outxpk ; next } # if FNR>6 expect only peak lines if (NF>0) xpkfilecountpeaks[xpk]++ $0=unspaceass($0);$1=$1 s0=$0 if (unassign=="all") { for ( d=1;d<=ndim;d++) { #print $(collabel[d]),"-> {?}" $(collabel[d])="{?}" } } else if (unassign~/^[1-4]$/) { $(collabel[unassign])="{?}" } if (s0!=$0) { print ">>1 --",s0 print ">>2 --",$0 } s0=$0 xpkid=xpk SUBSEP $1 for ( i=1;i<=ndim;i++) # print "label",i,"col",collabel[i] xallppms[xpkid]=xallppms[xpkid] " " $(collabel[i]+1) xeasypknr=xpk SUBSEP ( $1+pkincr[xpk] ) if (debug) { if (xeasypknr in allppms) print xeasypknr,allppms[xeasypknr],xallppms[xpkid] > stderr else print "peak",xeasypknr,"not found" >stderr } #print if (debug) { printf "\n## --- xpkfile %i peak nr: %i <--> %s peak nr %s\n",xpk,$1,ass_file[xpk],xeasypknr > stderr printf " %i assignments (%i kept)\n",nassignments[xeasypknr],nasskept[xeasypknr] > stderr } if (!(xeasypknr in rejected)) { if ( (xeasypknr in assignmentp1) && nasskept[xeasypknr] <= maxass ) if (xeasypknr in sumvolcontr) if (sumvolcontr[xeasypknr]/100>=mincontr) { inrange=1 if (seqrange!="" && xeasypknr in assignmentp1 && xeasypknr in assignmentp2) { rn1=assignmentp1[xeasypknr]+0 rn2=assignmentp2[xeasypknr]+0 dseq=abs(rn1-rn2) inrange = ( dseq <= seqrange ) } for (d=1;d<=ndim;d++) { # print "d",d> stderr col=collabel[xdom[d]] fill="" if (d==1 && assignmentp1[xeasypknr]!="") fill = assignmentp1[xeasypknr] if (d==2 && assignmentp2[xeasypknr]!="") fill = assignmentp2[xeasypknr] if (d==3 && assignmenth1[xeasypknr]!="") fill = assignmenth1[xeasypknr] if (d==4 && assignmenth2[xeasypknr]!="") fill = assignmenth2[xeasypknr] if (fill!="" && inrange) if (ov || ($col=="{?}") ) { $col = "{"fill"}" if ($colcomment=="{?}") { $colcomment="{candid}" } else if ($colcomment~/^[{].*[}]$/) { sub(/[}]$/," candid",$col) } } assppm="--" if (d==1 && xeasypknr in assppmH) assppm=assppmH[xeasypknr] if (d==2 && xeasypknr in assppmh) assppm=assppmh[xeasypknr] if (d==3 && xeasypknr in assppmX) assppm=assppmX[xeasypknr] if (d==4 && xeasypknr in assppmx) assppm=assppmx[xeasypknr] if (assppm!="--") { # .... xppm=$(col+1) daxppm=xppm-assppm if (abs(daxppm)>0.01) { if (!debug) # if true already done fstr="\n## --- xpkfile %i peak nr: %i <--> %s peak nr %s" printf fstr,xpk,$1,ass_file[xpk],xeasypknr > stderr printf "\n## ERROR: ppm values do not match: %8.3f != %8.3f [exiting]\n", xppm,assppm > stderr print "## spectrum:xeasypeaknr =",xeasypknr > stderr print "## .ass ppms ",allppms[xeasypknr] > stderr print "## .xpk ppms ",xallppms[xpkid] > stderr error++ exit } else if (debug) { if (d==1) { printf " .ass ppms %s\n",allppms[xeasypknr] > stderr printf " .xpk ppms %s\n",xallppms[xpkid] > stderr } printf " match in dom=%2i : %8.3f ~ %8.3f\n", d,xppm,assppm > stderr } } } } } gsub(/[{] +/,"{") gsub(/ +[}]/,"}") if (verbose && s0!=$0) { #print " < ",s0 > stderr print " > ",$0 > stderr } #gsub(/[?]?[}]/,xeasypknr"}",$(NF-1)) gsub("["AMBISEP"]"," ") print > outxpk } END { if (error) exit for(i=1;i<=spec;i++) { printf "\n\nCANDID assfile spec #%i: %i peaks\n", i,na=assfilecountpeaks[i] printf "nmrview xpkfile spec #%i: %i peaks\n", i,nx=xpkfilecountpeaks[i] if (na!=nx) { form="ERROR!:spectrum %i [in %s] and peaklist %s," form=form " have diffent amount of peaks (%i!=%i)\n" printf form,i,ass_file[i],xpk_file[i],na,nx } } for(i=1;i<=xpk;i++) { } } function proc_xpkpeakline(str, lstr,mstr,rstr) { # data lines gsub(/[{][ \t]+/,"{",str) gsub(/[ \t]+[}]/,"}",str) str=toupper(str) while (match(str,/[{][^{}]*[ \t]+[^{}]*[}]/)) { lstr=substr(str,1,RSTART-1) mstr=substr(str,RSTART,RLENGTH) rstr=substr(str,RSTART+RLENGTH) if (verbose) print mstr gsub(/[ \t]+/,"|",mstr) str=lstr mstr rstr if (verbose) print mstr } return str } function cyanaformat2dom(str,label,ddom) { # # transform e.g. NHh etc into ddom[1]=2 ddom[2]=3 ddom[3]=1 # oldIGNORECASE=IGNORECASE IGNORECASE=0 if (debug) print "#cyanaformat2dom",str, index(str,"H"),index(str,"h"),index(str,"N"),index(str,"n"),index(str,"C"),index(str,"c") > stderr label1=(label ? label SUBSEP 1 : 1 ) label2=(label ? label SUBSEP 2 : 2 ) label3=(label ? label SUBSEP 3 : 3 ) label4=(label ? label SUBSEP 4 : 4 ) ddom[label1]=index(str,"H") ddom[label2]=index(str,"h") ddom[label3]=index(str,"N") ddom[label4]=index(str,"n") if ( index(str,"C") ) ddom[label3]=index(str,"C") if ( index(str,"c") ) ddom[label4]=index(str,"c") IGNORECASE=oldIGNORECASE } function proccyaf(str,sspecs,ccyafs,ddoms ,nspec,ns,n,ss,rr,cyaf) { # process e.g. str="1:Hh,2:HhN,3:Hh,4:CHh,5:CHh" ns=split(str,ss,",") for ( n in sspecs) delete sspecs[n] for ( n in ccyafs) delete ccyafs[n] for (n=1;n<=ns;n++) { if (split(ss[n],rr,":")!=2) { print "ERROR processing domains [exit] ",ss[n],"from",str exit } if ( rr[1] in sspecs ) { print "ERROR processing domains [exit] ",rr[1]"(:"rr[2]")","already found",str exit } lab=rr[1] print "lab",lab >stderr sspecs[lab]=n cyaf=ccyafs[lab]=rr[2] cyanaformat2dom(cyaf,lab,ddoms) # if (verbose) printf "#proccyaf %10s %4s %s %s,%s,%s,%s\n",str,rr[1],rr[2],ddoms[lab,1],ddoms[lab,2],ddoms[lab,3],ddoms[lab,4] >stderr } } function getthisformat(str,xpk, retval,ns,ss,n,rr) { # str = list of xpk formats "2:HN,4CHh" # xpk = number of xpkfile ns=split(str,ss,",") for (n=1;n<=ns;n++) { if (split(ss[n],rr,":")!=2) { print "ERROR processing domains [exit] ",ss[n],"from",str exit } if (rr[1] == xpk) retval=rr[2] } return retval } function to_heavy(res,atom, hetero) { # for NH's gsub(/ /,"",res) gsub(/ /,"",atom) if (atom~/^[HQ]N?[123]?$/) hetero="N" else if (res~/^ASN/ && atom ~ /^[HQ]D/ ) hetero="ND2" else if (res~/^GLN/ && atom ~ /^[HQ]E/ ) hetero="NE2" else if (res~/^ARG/ && atom == "HE" ) hetero="NE" else if (res~/^ARG/ && atom ~ /^HH/ ) hetero="NH"substr(atom,3,1) else if (res~/^LYS/ && atom ~ /^HZ/ ) hetero="NZ" else if (res~/^HIS/ && atom ~ /^H(D1|E2)$/ ) hetero="N"substr(atom,2,2) else if (res~/^TRP/ && atom ~ "HE1" ) hetero="N"substr(atom,2,2) else { # for CH's hetero=atom sub(/^[HQ]/,"C",hetero) ho=hetero if (hetero~/^C[AB]/) hetero=substr(hetero,1,2) else if (hetero~/^CG/ && res!~/VAL|ILE|THR/ ) hetero="CG" else if (hetero~/^CD/ && res !~ /(HIS|PHE|TYR|ILE|LEU|TRP)/ ) hetero="CD" else if (hetero~/^CE/ && res !~ /^(HIS|PHE|TYR|TRP)/ ) hetero="CE" else if (hetero~/^C[ZH]/ && res !~ /^TRP/ ) hetero=substr(hetero,1,2) else hetero=substr(hetero,1,3) #if (ho!=hetero) print "##",ho,hetero } #print "##",atom,hetero return hetero } function to_iupac(res,atom) { # substitute hb1->hb2 hb2->hb3 etc res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom #if (res=="GLY") { # if (atom=="HA1") outatom="HA2" # if (atom=="HA2") outatom="HA3" #} #hb1 hb2 resrxp="(arg|as[np]|cy[ps]|gl[nu]|met|phe|" resrxp=resrxp "pro|ser|tyr|trp|leu|lys|his|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HB1") outatom="HB2" if (atom=="HB2") outatom="HB3" } #hg1 hg2 resrxp="(arg|gl[nu]|met|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG1") outatom="HG2" if (atom=="HG2") outatom="HG3" } #hg11 hg12 (ile) resrxp="(ile|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG11") outatom="HG12" if (atom=="HG12") outatom="HG13" } #hd1 hd2 resrxp="(arg|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HD1") outatom="HD2" if (atom=="HD2") outatom="HD3" } #he1 he2 resrxp="(lys)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HE1") outatom="HE2" if (atom=="HE2") outatom="HE3" } return outatom } function from_iupac(res,atom, outatom) { # substitute hb2->hb1 hb3->hb2 etc res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom #if (res=="GLY") { # if (atom=="HA2") outatom="HA1" # if (atom=="HA3") outatom="HA2" #} #hb1 hb2 resrxp="(arg|as[np]|cy[ps]|gl[nu]|met|phe|" resrxp=resrxp "pro|ser|tyr|trp|leu|lys|his|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HB2") outatom="HB1" if (atom=="HB3") outatom="HB2" } #hg1 hg2 resrxp="(arg|gl[nu]|met|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG2") outatom="HG1" if (atom=="HG3") outatom="HG2" } #hg11 hg12 (ile) resrxp="(ile|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HG12") outatom="HG11" if (atom=="HG13") outatom="HG12" } #hd1 hd2 resrxp="(arg|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HD2") outatom="HD1" if (atom=="HD3") outatom="HD2" # print res,resrxp,"{" atom ">" outatom "}",(atom=="HD3") } #he1 he2 resrxp="(lys)([-0-9+]?)" if (res ~ resrxp) { if (atom=="HE2") outatom="HE1" if (atom=="HE3") outatom="HE2" } if ( (!keeppseudo) && atom ~ /^Q/ ) outatom = "H" substr(atom,2) "1" return outatom } function from_pseudo_to_nvpdb(res,atom, outresatom) { # substitute hb2->hb1 hb3->hb2 etc res=toupper(res) atom=toupper(atom) gsub(/ /,"",atom) outatom=atom #if (res=="GLY") { # if (atom=="HA2") outatom="HA1" # if (atom=="HA3") outatom="HA2" #} #qa -> ha1 ha2 - hb# resrxp="gly" if (res ~ resrxp) { if (atom=="QA") outatom="HA1 HA2" } #qb -> hb1 hb2 hb3 - hb# resrxp="(arg|as[np]|cy[ps]|gl[nu]|met|phe|" resrxp=resrxp "pro|ser|tyr|trp|leu|lys|his|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="QB") outatom="HB1 HB2" } #hg1 hg2 resrxp="(arg|gl[nu]|met|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="QG") outatom="HG1 HG2" } #hg11 hg12 (ile) resrxp="(ile|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="QG1") outatom="HG11 HG12" } #hd1 hd2 resrxp="(arg|pro|lys|unk|pru)([-0-9+]?)" if (res ~ resrxp) { if (atom=="QD") outatom="HD1 HD2" } #qe -> he1 he2 resrxp="(lys)([-0-9+]?)" if (res ~ resrxp) { if (atom=="QE") outatom="HE1 HE2" } #all other Q's should be methyls, transform like QB -> HB1 if (atom~/^Q/) outatom = "H" substr(atom,2) "1" return outatom } function abs(x) {return x<0?-x:x} function unspaceass(str, lstr,mstr,rstr) { while (match(str,/[{][^{}]*[ \t]+[^{}]*[}]/)) { lstr=substr(str,1,RSTART-1) mstr=substr(str,RSTART,RLENGTH) rstr=substr(str,RSTART+RLENGTH) if (verbose) print mstr gsub(/[ \t]+/,AMBISEP,mstr) str=lstr mstr rstr if (verbose) print "mstr",mstr } return str } function assert(cond,text) { if (!cond) print "\nassertion failed:",text asserterror=1 }