#ifndef N2P_H
#define N2P_H

/*
Original file name: n2p.h

The N2P 1.0 header library for translating coding sequences

Copyright (c) 2005 Reed A. Cartwright and Douglas L. Theobald

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Version 1.0 can be downloaded from http://www.genetics.uga.edu/sw/.
All later versions can be found at http://scit.us/n2p/.

Functions:
  Basic Translation:
    Translate(char *csOut, const char *csIn, const char *csCode)
  Reverse Complement Translation:
    TranslateRC(char *csOut, const char *csIn, const char *csCode)
  Translation With Gaps:
    TranslateG(char *csOut, const char *csIn, const char *csCode)
  Reverse Complement Translation With Gaps:
    TranslateRCG(char *csOut, const char *csIn, const char *csCode)
  Translate All Six Frames (C++ only):
    class Translate6

Global Variables:
  Standard Genetic Code:
    const char csStd[]
  NCBI Genetic Codes:
    const char csCodes[][65]
    const char csCodesLC[][65]
    const char csCodeNames[][75]

Implementation Notes:
  1) The ^0x2A used in the reverse complement algorithms can be removed if
     csCode has already been reverse complemented.
  2) Some processor-compiler combinations will produce a faster executable if
     the left shifts and inclusive ors are replaced by arithmetic operators:
     e.g. '<< 3' by '*8', '<< 1' by '*2', and '|' by '+'.  Intel processors
     and compilers tend to prefer this method.  Define N2P_ARITHMETIC before
     including this header to select the arithmetic form of CODON64.
  3) The genetic code strings can be removed if not in use.
  4) The functions are 'static inline' and the tables 'static const' so the
     header can be included from several translation units in both C and C++.
*/

/*
 * CODON64(x,y,z) maps three nucleotide characters to an index in 0..63.
 * Only bits 1-2 of each ASCII character are used, which gives
 * A -> 0, C -> 1, T/U -> 2, G -> 3 for upper and lower case alike.
 * Complementing a base flips the high bit of its 2-bit code, hence the
 * ^0x2A (binary 101010) used by the reverse complement routines.
 */
#ifdef N2P_ARITHMETIC
#  define CODON64(x,y,z) ((((x)&6)*8)+(((y)&6)*2)+(((z)&6)>>1))
#else
#  define CODON64(x,y,z) ((((x)&6)<<3)|(((y)&6)<<1)|(((z)&6)>>1))
#endif

/**********************************************************************************
    BASIC TRANSLATION
    csOut  - string buffer where the translated sequence is sent;
             must be large enough to hold the output plus terminating null
    csIn   - string buffer containing the coding sequence; should contain a
             whole number of codons (be well formed), i.e. strlen(csIn)%3 == 0.
             A trailing incomplete codon is ignored.
    csCode - the genetic code (64 residues indexed by CODON64)
**********************************************************************************/

static inline void Translate(char *csOut, const char *csIn, const char *csCode)
{
    /* Require three live characters so we never step over the terminator. */
    for (; csIn[0] && csIn[1] && csIn[2]; csIn += 3)
        *csOut++ = csCode[CODON64(csIn[0], csIn[1], csIn[2])];
    *csOut = '\0';
}

/**********************************************************************************
    REVERSE COMPLEMENT TRANSLATION
    Translates the reverse complement of csIn: codons are read from the end of
    the sequence towards the start, each one reversed and complemented.
    Parameters are as for Translate.
**********************************************************************************/

static inline void TranslateRC(char *csOut, const char *csIn, const char *csCode)
{
    const char *cs = csIn;

    /* Advance to one past the last whole codon. */
    while (cs[0] && cs[1] && cs[2])
        cs += 3;

    /* Walk back one codon at a time, reading each codon right-to-left. */
    for (; cs != csIn; cs -= 3)
        *csOut++ = csCode[CODON64(cs[-1], cs[-2], cs[-3]) ^ 0x2A];
    *csOut = '\0';
}

/**********************************************************************************
    BASIC TRANSLATION WITH GAPS
    As Translate, except that a codon whose first character sorts before 'A'
    (e.g. '-', '.', '?') is treated as a gap and that character is copied to
    the output instead of a residue.
**********************************************************************************/

static inline void TranslateG(char *csOut, const char *csIn, const char *csCode)
{
    for (; csIn[0] && csIn[1] && csIn[2]; csIn += 3) {
        if (csIn[0] < 'A')
            *csOut++ = csIn[0];
        else
            *csOut++ = csCode[CODON64(csIn[0], csIn[1], csIn[2])];
    }
    *csOut = '\0';
}

/**********************************************************************************
    REVERSE COMPLEMENT TRANSLATION WITH GAPS
    As TranslateRC, except that a codon whose last character in csIn (the first
    one in reverse orientation) sorts before 'A' is treated as a gap and that
    character is copied to the output.
**********************************************************************************/

static inline void TranslateRCG(char *csOut, const char *csIn, const char *csCode)
{
    const char *cs = csIn;

    /* Advance to one past the last whole codon. */
    while (cs[0] && cs[1] && cs[2])
        cs += 3;

    for (; cs != csIn; cs -= 3) {
        if (cs[-1] < 'A')
            *csOut++ = cs[-1];
        else
            *csOut++ = csCode[CODON64(cs[-1], cs[-2], cs[-3]) ^ 0x2A];
    }
    *csOut = '\0';
}

/**********************************************************************************
    TRANSLATE ALL SIX FRAMES (C++ only)
    Frames 0-2 are the forward frames starting at offsets 0, 1 and 2 of csIn.
    Frames 3-5 are the reverse complement frames starting at the last,
    second to last and third to last character of csIn.
**********************************************************************************/

#ifdef __cplusplus

#include <cstddef>

class Translate6
{
public:
    // Constructor: len is the capacity of each of the six frame buffers,
    // including the terminating null.  To hold every frame of a sequence of
    // L characters, len must be at least L/3 + 1.
    explicit Translate6(std::size_t len) : nLen(len)
    {
        // One allocation for all frames, so nothing leaks if it throws.
        char *base = new char[6 * len];
        for (int f = 0; f < 6; ++f)
            csOut[f] = base + f * len;
    }

    // Destructor
    virtual ~Translate6()
    {
        delete[] csOut[0];
    }

    // Translate all six frames of csIn using the genetic code csCode.
    // Sequences shorter than one codon give six empty strings.  Output that
    // does not fit in a frame buffer is truncated.
    void Translate(const char *csIn, const char *csCode)
    {
        std::size_t cnt[6] = {0, 0, 0, 0, 0, 0};

        if (csIn[0] && csIn[1] && csIn[2]) {
            // Forward strand: n holds the last three bases as a 6-bit codon
            // index; each new base completes a codon in the next frame.
            int n = ((csIn[0]&6) << 1) | ((csIn[1]&6) >> 1);
            const char *cs = csIn + 2;
            int f = 0;
            for (; *cs; ++cs) {
                n = Push(n, *cs);
                Put(f, cnt, csCode[n]);
                f = (f == 2) ? 0 : f + 1;
            }

            // Reverse strand: cs is at the terminator.  Seed n with the last
            // two bases and slide back towards the start of the sequence.
            n = ((cs[-1]&6) << 1) | ((cs[-2]&6) >> 1);
            cs -= 2;
            f = 3;
            while (cs != csIn) {
                --cs;
                n = Push(n, *cs);
                Put(f, cnt, csCode[n ^ 0x2A]);
                f = (f == 5) ? 3 : f + 1;
            }
        }

        if (nLen != 0) {
            for (int k = 0; k < 6; ++k)
                csOut[k][cnt[k]] = '\0';
        }
    }

    // access frame strings
    const char* operator [] (std::size_t t) const { return csOut[t]; }
    char* operator [] (std::size_t t) { return csOut[t]; }

private:
    // Shift base c into the 6-bit codon window n, dropping the oldest base.
    static int Push(int n, char c)
    {
        return ((n&0xF) << 2) | ((c&6) >> 1);
    }

    // Append residue c to frame f, always leaving room for the terminator.
    void Put(int f, std::size_t *cnt, char c)
    {
        if (cnt[f] + 1 < nLen)
            csOut[f][cnt[f]++] = c;
    }

    char * csOut[6];
    std::size_t nLen;

private:
    // Not default constructible; not copyable, because the object owns its
    // buffers and a copy would delete them twice.
    Translate6();
    Translate6(const Translate6 &);
    Translate6 & operator = (const Translate6 &);
};

#endif /* __cplusplus */

/**********************************************************************************
    GENETIC CODES

    Below are the genetic codes taken from the NCBI database,
    <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.

    csStd       - Standard Genetic Code
    csCodes     - All the genetic codes in a single table.
                  csCodes[i] corresponds to NCBI table i.
    csCodesLC   - same as csCodes, except lowercase
    csCodeNames - the name associated with each genetic code
**********************************************************************************/

static const char csStd[] = "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSSLFFL*CCWEDDEAAAAVVVVGGGG"; // 1 Standard

static const char csCodes[][65] = {
    "0000000000000000000000000000000000000000000000000000000000000000", // 0 Unused
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 1 Standard
    "KNNKTTTTMIIM*SS*QHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 2 Vertebrate Mitochondrial
    "KNNKTTTTMIIMRSSRQHHQPPPPTTTTRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 3 Yeast Mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 4 Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma
    "KNNKTTTTMIIMSSSSQHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 5 Invertebrate Mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRRQYYQSSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 6 Ciliate, Dasycladacean and Hexamita Nuclear
    "0000000000000000000000000000000000000000000000000000000000000000", // 7 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 8 Unused
    "NNNKTTTTIIIMSSSSQHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 9 Echinoderm and Flatworm Mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSSLFFLCCCWEDDEAAAAVVVVGGGG", // 10 Euplotid Nuclear
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 11 Bacterial and Plant Plastid
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLSRRRR*YY*SSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 12 Alternative Yeast Nuclear
    "KNNKTTTTMIIMGSSGQHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 13 Ascidian Mitochondrial
    "NNNKTTTTIIIMSSSSQHHQPPPPLLLLRRRRYYY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 14 Alternative Flatworm Mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YYQSSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 15 Blepharisma Nuclear
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YYLSSSSLFFL*CCWEDDEAAAAVVVVGGGG", // 16 Chlorophycean Mitochondrial
    "0000000000000000000000000000000000000000000000000000000000000000", // 17 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 18 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 19 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 20 Unused
    "NNNKTTTTMIIMSSSSQHHQPPPPLLLLRRRR*YY*SSSSLFFLWCCWEDDEAAAAVVVVGGGG", // 21 Trematode Mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YYL*SSSLFFL*CCWEDDEAAAAVVVVGGGG", // 22 Scenedesmus obliquus mitochondrial
    "KNNKTTTTIIIMRSSRQHHQPPPPLLLLRRRR*YY*SSSS*FFL*CCWEDDEAAAAVVVVGGGG"  // 23 Thraustochytrium Mitochondrial
};

static const char csCodesLC[][65] = {
    "0000000000000000000000000000000000000000000000000000000000000000", // 0 Unused
    "knnkttttiiimrssrqhhqppppllllrrrr*yy*sssslffl*ccweddeaaaavvvvgggg", // 1 Standard
    "knnkttttmiim*ss*qhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 2 Vertebrate Mitochondrial
    "knnkttttmiimrssrqhhqppppttttrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 3 Yeast Mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 4 Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma
    "knnkttttmiimssssqhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 5 Invertebrate Mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrrqyyqsssslffl*ccweddeaaaavvvvgggg", // 6 Ciliate, Dasycladacean and Hexamita Nuclear
    "0000000000000000000000000000000000000000000000000000000000000000", // 7 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 8 Unused
    "nnnkttttiiimssssqhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 9 Echinoderm and Flatworm Mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrr*yy*sssslfflcccweddeaaaavvvvgggg", // 10 Euplotid Nuclear
    "knnkttttiiimrssrqhhqppppllllrrrr*yy*sssslffl*ccweddeaaaavvvvgggg", // 11 Bacterial and Plant Plastid
    "knnkttttiiimrssrqhhqpppplllsrrrr*yy*sssslffl*ccweddeaaaavvvvgggg", // 12 Alternative Yeast Nuclear
    "knnkttttmiimgssgqhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 13 Ascidian Mitochondrial
    "nnnkttttiiimssssqhhqppppllllrrrryyy*sssslfflwccweddeaaaavvvvgggg", // 14 Alternative Flatworm Mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrr*yyqsssslffl*ccweddeaaaavvvvgggg", // 15 Blepharisma Nuclear
    "knnkttttiiimrssrqhhqppppllllrrrr*yylsssslffl*ccweddeaaaavvvvgggg", // 16 Chlorophycean Mitochondrial
    "0000000000000000000000000000000000000000000000000000000000000000", // 17 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 18 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 19 Unused
    "0000000000000000000000000000000000000000000000000000000000000000", // 20 Unused
    "nnnkttttmiimssssqhhqppppllllrrrr*yy*sssslfflwccweddeaaaavvvvgggg", // 21 Trematode Mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrr*yyl*ssslffl*ccweddeaaaavvvvgggg", // 22 Scenedesmus obliquus mitochondrial
    "knnkttttiiimrssrqhhqppppllllrrrr*yy*ssss*ffl*ccweddeaaaavvvvgggg"  // 23 Thraustochytrium Mitochondrial
};

static const char csCodeNames[][75] = {
    "Unused",
    "Standard",
    "Vertebrate Mitochondrial",
    "Yeast Mitochondrial",
    "Mold, Protozoan, and Coelenterate Mitochondrial and Mycoplasma/Spiroplasma",
    "Invertebrate Mitochondrial",
    "Ciliate, Dasycladacean and Hexamita Nuclear",
    "Unused",
    "Unused",
    "Echinoderm and Flatworm Mitochondrial",
    "Euplotid Nuclear",
    "Bacterial and Plant Plastid",
    "Alternative Yeast Nuclear",
    "Ascidian Mitochondrial",
    "Alternative Flatworm Mitochondrial",
    "Blepharisma Nuclear",
    "Chlorophycean Mitochondrial",
    "Unused",
    "Unused",
    "Unused",
    "Unused",
    "Trematode Mitochondrial",
    "Scenedesmus obliquus mitochondrial",
    "Thraustochytrium Mitochondrial"
};

#endif