#!/usr/local/bin/gawk -f
#
#@ convert protein aminoacid sequence files from
#@ oneletter to threeletter format and vice versa.
#
# (c) Eiso AB
#
# command line options:
#   [-v] mode=3to1     # convert from three to one letter format
#   [-v] mode=1to3     # convert from one to three letter format [ABCDE...]
#   [-v] mode=1nto3    # convert from one to three letter format [A1, one
#                      # letter+number entry per line]
#   h | help           # as first argument (or -v help=1): print this header
#
# If no mode is given, seq2seq.awk will try to determine the type of file
# from the first line that has a non-comment first field:
#   - the whole line is one capital followed by digits       -> 1nto3
#   - the first word is 3 or 4 capitals/digits (starting with
#     two capitals), optionally followed by + or -            -> 3to1
#   - the first word has one character, or is a capital
#     followed by digits, or has 4 or more characters         -> 1to3
#   - anything else: unknown; lines are echoed prefixed "#??"
#
# possible input:
#   - words of one letter codes            (ACDEFG ...)
#   - three letter codes, numbers ignored  (ALA 1 CYS 2 ...)
#   - one letter code plus residue number  (A12)
#   - sparky project files (*.proj): the second field of every line
#     whose first field is "sequence" is converted to three letter codes
#

BEGIN {
    # mode is intentionally not reset here: a -v assignment is applied
    # before BEGIN runs, so clearing it would discard "-v mode=...".
    # An unset awk variable already compares equal to "".
    progname = "seq2seq.awk"

    if (ARGV[1] ~ /^(h|help)$/ || help != "") {
        print ARGV[0]
        # because of the which command, help is only available if
        # this script is in the PATH
        system("gawk '/^#/{print}/^BEGIN/{exit}' `which " progname "`")
        help = 1
        exit
    }
}

# Stripping of trailing carriage returns is disabled:
#{ sub(/[\r]$/,"") }

# Determine the mode once, from the first line that actually carries data.
# Blank lines and lines whose first field starts a comment are not used,
# so a leading header cannot lock the script into the wrong mode.
mode == "" && NF > 0 && $1 !~ /^#/ {
    if ($0 ~ /^[A-Z][0-9]+[\r]?$/) {
        mode = "1nto3"                      # pasta sequence
    } else if ($1 ~ /^([A-Z][A-Z][A-Z0-9][A-Z0-9]?)[-+]?$/) {
        mode = "3to1"
    } else if (length($1) == 1 || $1 ~ /^[A-Z][0-9]+$/ || length($1) >= 4) {
        mode = "1to3"
    } else {
        mode = "?"
    }
}

# get sequence from sparky project file:
# only "sequence <letters>" lines are used, every other line is dropped.
FILENAME ~ /[.]proj$/ {
    if ($1 != "sequence") next
    mode = "1to3"
    ww = $2                                 # the one letter sequence string
    for (i = 1; i <= length(ww); i++) {
        aa = substr(ww, i, 1)
        # r numbers the residues; it keeps counting across sequence lines
        if (aa ~ /^[A-Z]/) {
            print to_3_letter(aa), ++r, (i == 1 ? "# " ww : "")
        }
    }
    next
}

# Convert every remaining input line according to mode.
{
    # drop trailing comments, then normalise whitespace and re-split
    sub(/#.*/, ""); $1 = $1; $0 = $0

    if (mode == "3to1") {
        ww = $0
        for (f = 1; f <= NF; f++) {
            # purely numeric fields (residue numbers) are skipped;
            # the first field is annotated with the source line
            if ($f !~ /^[0-9]+$/) {
                printf "%s\n", to_1_letter($f) (f == 1 ? " # " ww : "")
            }
        }
    } else if (mode == "1to3") {
        for (f = 1; f <= NF; f++) {
            ww = $f
            for (i = 1; i <= length(ww); i++) {
                aa = substr(ww, i, 1)
                # the first residue of each word is annotated with the word
                if (aa ~ /^[A-Z]/) {
                    print to_3_letter(aa), (i == 1 ? "# " ww : "")
                }
            }
        }
    } else if (mode == "1nto3") {
        # split "A12" into "A 12" by inserting a blank before the first digit
        sub(/[0-9]/, " &")
        $0 = $0
        if (NF == 2) {
            print to_3_letter($1), $2 + 0
        }
    } else if (mode != "") {
        # unknown mode: echo the line, marked as not understood
        print "#??", $0
    }
    # mode == "" here means no data line has been seen yet: nothing to do
}

# Fill the lookup tables AA3 (one letter -> three letter) and
# AA1 (three letter -> one letter) for the 20 standard amino acids,
# plus the charged three letter forms accepted by to_1_letter().
function init_tables(    spec, pairs, n, k, kv) {
    spec = "A:ALA C:CYS D:ASP E:GLU F:PHE G:GLY H:HIS I:ILE K:LYS L:LEU"
    spec = spec " M:MET N:ASN P:PRO Q:GLN R:ARG S:SER T:THR V:VAL W:TRP Y:TYR"
    n = split(spec, pairs, " ")
    for (k = 1; k <= n; k++) {
        split(pairs[k], kv, ":")
        AA3[kv[1]] = kv[2]
        AA1[kv[2]] = kv[1]
    }
    AA1["LYS+"] = "K"
    AA1["ARG+"] = "R"
    AA1["HIS+"] = "H"
    AA1["ASP-"] = "D"
    AA1["GLU-"] = "E"
    _aa_ready = 1
}

# to_3_letter(str): three letter code for the one letter code str.
# Unknown codes are returned unchanged. res is a local variable.
function to_3_letter(str,    res) {
    if (!_aa_ready) init_tables()
    res = (str in AA3) ? AA3[str] : str
    return res
}

# to_1_letter(res): one letter code for the three letter code res.
# Anything starting with CYS maps to C; LYS+, ARG+, HIS+, ASP- and GLU-
# map like their neutral forms. Unknown names are returned unchanged.
# str is a local variable.
function to_1_letter(res,    str) {
    if (!_aa_ready) init_tables()
    if (res ~ /^CYS/) return "C"
    str = (res in AA1) ? AA1[res] : res
    return str
}