#!/usr/local/bin/gawk -f # #@ make a table with numbers of ambiguous and unambiguous #@ intra/seq/medium/long/inter-dom restraints #@ it needs a2ps to format the output. # # command-line options # # [-v] domains=200 [0] # for dimers: give a residue number in between the monomers # # no support for n-mers with n>2 # [-v] dresdimer=200 [0] # same as domains=.. # [-v] full=1 [""] # print float format ("%10.2f") counts for ambiguous restraints, # default is "%10i", which truncates to integer # [-v] cols=45 # rows of 45 residues # # # # tbl format assumed: (can be on consecutive lines) # # ASSIGN ... resid 10 ... resid 20 ... OR ... resid 10 ... resid 20 ... OR .... # # so the file is split on "ASSI" strings (case ignored) to get the restraints, # and the records are split on "OR" (case ignored) to get the pairs contributing # to the restraint. the first residue of the pair will be taken from the first # number after the first "resid" and the second residue of the pair will be # taken from the first number after the second "resid". # BEGIN { stderr="/dev/stderr" IGNORECASE=1 RS="ASSI(GN?)?" progname="tblcount.awk" if ( ARGV[1]~/^(h|help)$/ || help!="" ) { print ARGV[0] # because of the which command, help is only available if # this script is in the PATH system("gawk '/^#/{print}/^BEGIN/{exit}' `which "progname"`") help=1 exit } if (ARGV[ARGC-1]~ /[.]upl$/) { upl=ARGV[ARGC-1] tbl=gensub(/[.]upl$/,".tbl","",upl) com = "upl2tbl.awk " upl ">" tbl print com system(com) ARGV[ARGC-1]=tbl } } FNR==1 { f++ } #FILENAME ~ /[.]upl$/ && !tbl { # print FILENAME,f,ARGV[f],ARGC,ARGV[ARGC-1] # tbl=gensub(/[.]upl$/,".tbl","",FILENAME) #} # #{print FNR,NR,FILENAME} NR==1 { if (domains!="") dresdimer=domains if (dresdimer!="") dresdimer=int(dresdimer) if (full!="") full=1 if (zc!="") zc="." if (twid=="") twid=4 hwid=11 if (cols=="") cols=40 } FILENAME ~ /[.]seq$/ { if (verbose)print "FILENAME",FILENAME > stderr seq = FILENAME nr=split($0,rrll,"\n") for (l=1;l<=nr;l++) { nf=split(rrll[l],an," ") if (toupper(an[1])~/^(LL|LP|PL)/) {} else { if (nf==2) resnum=an[2] ; else resnum++ residue[resnum]=toupper(an[1]) if (verbose) print "#",resnum,residue[resnum] > stderr if (minres=="") minres=resnum minres = (minres-resnum>0 ? resnum : minres ) if (maxres=="") maxres=resnum maxres = (maxres-resnum<0 ? resnum : maxres ) } } next } FILENAME ~ /[.]pdb$/ { if (verbose) print "FILENAME",FILENAME > stderr seq = FILENAME nr=split($0,rrll,"\n") for (l=1;l<=nr;l++) { nf=split(rrll[l],an," ") if (an[1]=="ATOM") { resnum=an[5] ; resnam=an[4] residue[resnum]=toupper(resnam) } if (verbose) print "#",resnum,residue[resnum] > stderr if (minres=="") minres=resnum minres = (minres-resnum>0 ? resnum : minres ) if (maxres=="") maxres=resnum maxres = (maxres-resnum<0 ? resnum : maxres ) } next } FILENAME ~ /[.]tbl$/ { tbl=FILENAME o=$0 gsub(/[()]/," & ") gsub(/\n/," ") #print RT,$0 regex=".* resid +([0-9]+) .*" rsub="\\1" na=split($0,rr,/OR/) minresdif="" for (a=1;a<=na;a++) { res1=res2="" ro=rr[a] match(rr[a],/.*resid? +([0-9]+) .* resid? +([0-9]+) /,rres) if (1 in rres) res1=rres[1]+0 if (2 in rres) res2=rres[2]+0 if (minres=="") minres=res1 minres = (minres-res1>0 ? res1 : minres ) minres = (minres-res2>0 ? res2 : minres ) if (maxres=="") maxres=res1 maxres = (maxres-res1<0 ? res1 : maxres ) maxres = (maxres-res2<0 ? res2 : maxres ) if (res1=="" || res2=="") { print "???",FNR":[",ro"]" ; next } resdif=abs(res1-res2) dclass="" if (resdif==0) dclass="intra" else if (resdif==1) dclass="sequential" else if (resdif>1 && resdif<5) dclass="medium" else if (dresdimer!="" && (res1-dresdimer)*(res2-dresdimer)<0 ) dclass="inter-domain" else if (resdif>=5) dclass="long" else { print "## error: could not interpret", "res1="res1,"res2="res2,"resdif="resdif >stderr } if (minresdif=="") minresdif=resdif else minresdif = ( minresdif>resdif ? resdif : minresdif ) if (verbose) printf "#### %-8s d=%-2i %4s\n",res1"-"res2,resdif,a"/"na > stderr if (na==1) { if ( dclass == "inter-domain" ) { inter++ ; out = dclass"-"dresdimer".txt" } else if ( dclass=="long" ) { long++ ; out = dclass"-5+.txt" } else if ( dclass=="medium" ) { medium++ ; out = dclass"-234.txt" } else if ( dclass=="sequential") { sequential++ ; out = dclass"-1.txt" } else if ( dclass=="intra" ) { intra++ ; out = dclass"-0.txt" } } else if (na>1) { if ( dclass=="inter-domain" ) { xambinter++ } else if ( dclass=="long" ) { xamblong+=1/na } else if ( dclass=="medium" ) { xambmedium+=1/na } else if ( dclass=="sequential" ) { xambsequential+=1/na } else if ( dclass=="intra" ) { xambintra+=1/na } } if ( dclass=="inter-domain" ) type="inter-dom" else if ( dclass=="long" ) type="long" else if ( dclass=="medium" ) type="medium" else if ( dclass=="sequential" ) type="sequential" else if ( dclass=="intra" ) type="intra-res" if (!(type in typeslist)) typeslist[type]++ if (na==1) { reslist[res1]++ reslist[res2]++ count_unamb[res1,type]+=1 count_unamb[res2,type]+=1 } else if (na>1) { count_amb[res1,type]+=1/na count_amb[res2,type]+=1/na reslist[res1]+=1/na reslist[res2]+=1/na resunamb++ } } if (na==1) { #print resdif,minresdif print RT,$0> "restraints-"out } else if (na>1) { #print "#",resdif,minresdif if (dclass=="inter-domain" ) { ambinter++ ; out = "ambi_inter-"dresdimer".txt" } else if (dclass=="long" ) { amblong+=1 ; out = "ambi_long-5+.txt" } else if (dclass=="medium" ) { ambmedium+=1 ; out = "ambi_medium-234.txt" } else if (dclass=="sequential" ) { ambsequential+=1 ; out = "ambi_sequential-1.txt" } else if (dclass=="intra" ) { ambintra+=1 ; out = "ambi_intra-0.txt" } print "ASSI",$0> "restraints-"out } else { if (FNR!=1 || !/^ *$/) { print "##STRANGE:na=",na > stderr print "##STRANGE:for lines:"FNR"["$0"]" > stderr } } next } END { if(help)exit print "# command line options:" > stderr print "# twid="twid > stderr print "# cols="cols > stderr print "# dresdimer="dresdimer > stderr format="%-14s %10s %10s %10s\n" printf format,"restaints","unambig","ambig","ambig" printf format," "," ","min-range","all-range" printf format,"--------------","-------","---------","--------" format="%-14s %10i %10i %10.2f\n" printf format,"intra-res (0)",intra,ambintra,xambintra printf format,"sequential (1)",sequential,ambsequential,xambsequential printf format,"medium (2-4)",medium,ambmedium,xambmedium printf format,"long (5+)",long,amblong,xamblong if (dresdimer) printf format,"inter-domain (-)",inter,ambinter,xambinter format="%-14s %10s %10s %10s\n" printf format,"---------- ","----","----","----" format="%-14s %10i %10i %10.2f\n" printf format,"total ", intra+sequential+medium+long+inter, ambintra+ambsequential+ambmedium+amblong+ambinter, xambintra+xambsequential+xambmedium+axmblong+xambinter # # split over different types # types="intra-res sequential medium long" ntyp=split(types,ttype) for (i in ttype) invttype[ttype[i]]=i for (t in typeslist) { if (!(t in invttype)) types=types" "t if (length(t)>hwid+1) hwid=length(t)+1 } ntyp=split(types,ttype) print "#TYPES",types format="res %5s %5i %5i %5i %5i\n" printf "\n\nresidue stats for residues between %i and %i\n",minres,maxres printf "--------------------------------------------------\n" printf "%5s","" for (t=1;t<=ntyp;t++) printf " %10s",ttype[t] ; printf " %10s","total" if (resunamb) { for (t=1;t<=ntyp;t++) printf " %10s",ttype[t] printf " %10s","total"; printf " %10s","total" } printf "\n" printf "%7s","resnr" for (t=1;t<=ntyp+1;t++) printf " %10s","unambig" ; if (resunamb) { for (t=1;t<=ntyp+1;t++) printf " %10s","ambig" printf " %10s","un+amb" } printf " \n" for ( r=minres ; r<=maxres ; r++ ) if (r in reslist) { if(prevr+10) { ambiform= ( full ? " %10.2f" : " %10i" ) for (t=1;t<=ntyp;t++) { cam=(r SUBSEP ttype[t] in count_amb ? count_amb[r SUBSEP ttype[t]] : "-") printf ambiform,cam totamb+=cam } printf ambiform,totamb printf ambiform,totamb+totunamb } printf " \n" prevr=r } print "# residues range from",minres,"to",maxres> stderr MINRES=minres; MAXRES=maxres cres=maxres-minres nbin=int(((cres-1)/cols)+1) if (seq) tabfile=gensub("(^.*[/]|[.]seq$)","","g",seq) "-" tabfile=tabfile gensub("(^.*[/]|[.]tbl$)","","g",tbl) tabfile=tabfile "-tblcount.out" print "\n\n printing transposed table to",tabfile > stderr for ( b=1 ; b<=nbin ; b++ ) { minres= (b-1)*cols+MINRES maxres= minres+cols-1 if (maxres>MAXRES) maxres=MAXRES for ( r=minres ; r<=maxres ; r++ ) { if (r in residue) if (length(residue[r])>=twid) { twid++ ; cols=int(cols/twid*(twid-1)) print "#increasing width to:",twid>stderr print "#decreasing cols to:",cols>stderr } } printf "\n%-*s ",hwid," number" > tabfile for ( r=minres ; r<=maxres ; r++ ) { if (!(r % 2) && length(r"")>=twid) { printf "%*i",twid,"" > tabfile } else if (r in residue){ printf "%*i",twid,r > tabfile } } printf " \n" > tabfile printf "%-*s ",hwid," residue" > tabfile for ( r=minres ; r<=maxres ; r++ ) if (r in residue) { printf "%*s",twid,AAoneletter(residue[r]) > tabfile } printf " \n" > tabfile printf "%-*s ",hwid,"" > tabfile for ( r=minres ; r<=maxres ; r++ ) if (r in residue) printf "%*s",twid,"--" > tabfile printf " \n" > tabfile for (t=1;t<=ntyp;t++) { #if (r==minres) printf "%-*s ",hwid," "ttype[t] > tabfile for ( r=minres ; r<=maxres ; r++ ) if (r in residue) { if (r SUBSEP ttype[t] in count_unamb) printf "%*i",twid,count_unamb[r SUBSEP ttype[t]] > tabfile else printf "%*s",twid,zc > tabfile } printf " \n" > tabfile #if (r==minres) printf "%-*s ",hwid," ''-ambi" > tabfile for ( r=minres ; r<=maxres ; r++ ) if (r in residue) { if (r SUBSEP ttype[t] in count_amb) printf "%*i",twid,count_amb[r SUBSEP ttype[t]] > tabfile else printf "%*s",twid,zc > tabfile } printf " \n\n" > tabfile } } close(tabfile) nc=cols*twid+10+4 nl=nbin*((ntyp+1)*3+1) if ( (nc/nl) > (151/57) ) scaleopt="-l"nc else scaleopt="-L"nl com=sprintf("a2ps ./%s -1 -Pcolor -MA4 -r -s2 %s -o restraints-table.ps\n", tabfile,scaleopt) print "#print with: ",com >stderr system(com) } function abs(x) { return int( x<0 ? -x : x ) } function AAtranslate(str,res) { res=str if (str=="A") res="ALA" else if (str=="C") res="CYS" else if (str=="D") res="ASP" else if (str=="E") res="GLU" else if (str=="F") res="PHE" else if (str=="G") res="GLY" else if (str=="H") res="HIS" else if (str=="I") res="ILE" else if (str=="K") res="LYS" else if (str=="L") res="LEU" else if (str=="M") res="MET" else if (str=="N") res="ASN" else if (str=="P") res="PRO" else if (str=="Q") res="GLN" else if (str=="R") res="ARG" else if (str=="S") res="SER" else if (str=="T") res="THR" else if (str=="V") res="VAL" else if (str=="W") res="TRP" else if (str=="Y") res="TYR" else if (str=="U") res="UNK" # unknown residue else if (str=="B") res="UPR" # unknown i-1 neighbour return res } function AAoneletter(res,str) { str=res if (res=="ALA") str="A" else if (res=="CYS") str="C" else if (res=="ASP") str="D" else if (res=="GLU") str="E" else if (res=="PHE") str="F" else if (res=="GLY") str="G" else if (res=="HIS") str="H" else if (res=="ILE") str="I" else if (res=="LYS") str="K" else if (res=="LEU") str="L" else if (res=="MET") str="M" else if (res=="ASN") str="N" else if (res=="PRO") str="P" else if (res=="GLN") str="Q" else if (res=="ARG") str="R" else if (res=="SER") str="S" else if (res=="THR") str="T" else if (res=="VAL") str="V" else if (res=="TRP") str="W" else if (res=="TYR") str="Y" else if (res~/^LYS/) str="K" else if (res~/^ARG/) str="R" else if (res~/^HIS/) str="H" else if (res~/^ASP/) str="D" else if (res~/^GLU/) str="E" else if (res==/UNK/) str="U" else if (res==/UPR/) str="B" return str }