io/genbank/genbank.go
/*
Package genbank provides genbank parsers and writers.
GenBank is a flat text file format developed in the 1980s to annotate genetic
sequences, and has since become the standard for sharing annotated genetic
sequences.
This package provides a parser and writer to convert between the GenBank file
format and the more general Genbank struct.
*/
package genbank
import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/TimothyStiles/poly/transform"
	"github.com/lunny/log"
	"github.com/mitchellh/go-wordwrap"
)
/******************************************************************************
GBK specific IO related things begin here.
******************************************************************************/
// Package-level function variables. ReadMultiNth, Parse and ParseMulti call
// parseMultiNthFn, and ParseMultiNth calls parseReferencesFn, instead of
// calling the underlying functions directly.
// NOTE(review): presumably these exist so tests can swap in stubs — confirm.
var (
	readFileFn        = ioutil.ReadFile
	parseMultiNthFn   = ParseMultiNth
	parseReferencesFn = parseReferences
)
// Genbank is the main struct for the Genbank file format.
// It holds one record: its metadata, its features and its sequence.
type Genbank struct {
	Meta     Meta      // header metadata (LOCUS, DEFINITION, REFERENCE, ...)
	Features []Feature // features in the order they were added with AddFeature
	Sequence string    // will be changed and include reader, writer, and byte slice.
}
// Meta holds the meta data for Genbank and other annotated sequence files.
// ParseMultiNth fills Definition, Accession, Version, Keywords, Source,
// Organism, Taxonomy, Locus, References and Other; the remaining fields are
// not assigned by the parser in this file.
type Meta struct {
	Date                 string            `json:"date"`
	Definition           string            `json:"definition"` // text of the DEFINITION entry
	Accession            string            `json:"accession"`  // text of the ACCESSION entry
	Version              string            `json:"version"`    // text of the VERSION entry
	Keywords             string            `json:"keywords"`   // text of the KEYWORDS entry
	Organism             string            `json:"organism"`   // text following ORGANISM inside the SOURCE entry
	Source               string            `json:"source"`     // first line of the SOURCE entry
	Taxonomy             []string          `json:"taxonomy"`   // ";"-separated lineage entries of the SOURCE entry
	Origin               string            `json:"origin"`
	Locus                Locus             `json:"locus"`      // parsed LOCUS line
	References           []Reference       `json:"references"` // one element per REFERENCE entry
	Other                map[string]string `json:"other"`      // any other metadata tag mapped to its text
	Name                 string            `json:"name"`
	SequenceHash         string            `json:"sequence_hash"`
	SequenceHashFunction string            `json:"hash_function"`
}
// Feature holds the information for a feature in a Genbank file and other annotated sequence files.
type Feature struct {
	Type                 string            `json:"type"` // feature key, the first word of the feature line
	Description          string            `json:"description"`
	Attributes           map[string]string `json:"attributes"` // qualifier name (without "/") mapped to its value with quotes removed
	SequenceHash         string            `json:"sequence_hash"`
	SequenceHashFunction string            `json:"hash_function"`
	Sequence             string            `json:"sequence"`
	Location             Location          `json:"location"` // where the feature sits on the parent sequence
	ParentSequence       *Genbank          `json:"-"`        // set by AddFeature; read by GetSequence; excluded from JSON
}
// Reference holds information for one reference in a Meta struct.
// Authors, Title, Journal, PubMed and Remark are filled from the sub-entries
// of the same name (AUTHORS, TITLE, JOURNAL, PUBMED, REMARK).
type Reference struct {
	Authors string `json:"authors"`
	Title   string `json:"title"`
	Journal string `json:"journal"`
	PubMed  string `json:"pub_med"`
	Remark  string `json:"remark"`
	Range   string `json:"range"` // text of the REFERENCE line from its first "(" onwards
}
// Locus holds Locus information in a Meta struct.
// All fields are filled by parseLocus from the LOCUS line.
type Locus struct {
	Name             string `json:"name"`              // second whitespace-separated field of the LOCUS line
	SequenceLength   string `json:"sequence_length"`   // digits of the length field, kept as text
	MoleculeType     string `json:"molecule_type"`     // entry of genBankMoleculeTypes found on the line
	GenbankDivision  string `json:"genbank_division"`  // entry of genbankDivisions found on the line
	ModificationDate string `json:"modification_date"` // date matching dd-MMM-yyyy
	SequenceCoding   string `json:"sequence_coding"`   // two-character unit following the length (e.g. "bp")
	Circular         bool   `json:"circular"`          // the line contains " circular "
	Linear           bool   `json:"linear"`            // the line contains " linear "
}
// Location is a struct that holds the location of a feature.
// For an "a..b" range parseLocation stores Start = a-1 and End = b, and
// getFeatureSequence slices the parent sequence as [Start:End].
type Location struct {
	Start             int        `json:"start"`
	End               int        `json:"end"`
	Complement        bool       `json:"complement"`          // the extracted sequence is reverse complemented
	Join              bool       `json:"join"`                // the location came from a join(...) expression
	FivePrimePartial  bool       `json:"five_prime_partial"`  // the location text contained "<"
	ThreePrimePartial bool       `json:"three_prime_partial"` // the location text contained ">"
	GbkLocationString string     `json:"gbk_location_string"` // location text as read; preferred over rebuilding when non-empty
	SubLocations      []Location `json:"sub_locations"`       // operands of join(...); concatenated in order
}
// AddFeature adds a feature to a Genbank struct.
// The feature passed in has its ParentSequence pointed at sequence, and a copy
// of it is then appended to sequence.Features. The returned error is always nil.
func (sequence *Genbank) AddFeature(feature *Feature) error {
	feature.ParentSequence = sequence
	extended := append(sequence.Features, *feature)
	sequence.Features = extended
	return nil
}
// GetSequence returns the sequence of a feature.
// It resolves the feature's own Location against its parent sequence.
func (feature Feature) GetSequence() (string, error) {
	sequence, err := getFeatureSequence(feature, feature.Location)
	return sequence, err
}
// getFeatureSequence takes a feature and location object and returns a sequence string.
//
// A location without sub-locations is cut out of the parent sequence as
// [Start:End]. A location with sub-locations is resolved recursively and the
// pieces are concatenated in order. If location.Complement is set the result
// is reverse complemented.
//
// An error is returned, instead of panicking, when the feature has no parent
// sequence or when the location does not fit inside the parent sequence.
func getFeatureSequence(feature Feature, location Location) (string, error) {
	if feature.ParentSequence == nil {
		return "", fmt.Errorf("feature %q has no parent sequence", feature.Type)
	}
	parentSequence := feature.ParentSequence.Sequence

	var sequenceBuffer bytes.Buffer
	if len(location.SubLocations) == 0 {
		if location.Start < 0 || location.End > len(parentSequence) || location.Start > location.End {
			return "", fmt.Errorf("location [%d:%d] is out of range for a parent sequence of length %d", location.Start, location.End, len(parentSequence))
		}
		sequenceBuffer.WriteString(parentSequence[location.Start:location.End])
	} else {
		for _, subLocation := range location.SubLocations {
			sequence, err := getFeatureSequence(feature, subLocation)
			if err != nil {
				return "", err
			}
			sequenceBuffer.WriteString(sequence)
		}
	}

	// reverse complements resulting string if needed.
	if location.Complement {
		return transform.ReverseComplement(sequenceBuffer.String()), nil
	}
	return sequenceBuffer.String(), nil
}
// Read reads a GBK file from path and returns a Genbank struct.
// Only the first record of the file is returned. An error is returned when the
// file cannot be read or parsed, or when it contains no complete record.
func Read(path string) (Genbank, error) {
	genbankSlice, err := ReadMultiNth(path, 1)
	if err != nil {
		return Genbank{}, err
	}
	// A file without any complete record yields an empty slice and a nil
	// error; indexing it would panic.
	if len(genbankSlice) == 0 {
		return Genbank{}, fmt.Errorf("no genbank record found in %s", path)
	}
	return genbankSlice[0], nil
}
// ReadMulti reads a multi Gbk from path and parses it into a slice of Genbank structs.
// It delegates to ReadMultiNth with a count of -1.
func ReadMulti(path string) ([]Genbank, error) {
	const allRecords = -1
	return ReadMultiNth(path, allRecords)
}
// ReadMultiNth reads a multi Gbk from path and parses N entries into a slice of Genbank structs.
// The file is opened read-only and closed before the function returns. On any
// error an empty, non-nil slice is returned together with the error.
func ReadMultiNth(path string, count int) ([]Genbank, error) {
	file, err := os.Open(path)
	if err != nil {
		return []Genbank{}, err
	}
	// The file is only read from, so a Close error carries no information.
	defer file.Close()

	sequence, err := parseMultiNthFn(file, count)
	if err != nil {
		return []Genbank{}, err
	}
	return sequence, nil
}
// Write takes a Genbank struct and a path string and writes out a genbank record to that path.
// The file is written with permissions 0644. Any error from building the
// record or from writing the file is returned.
func Write(sequences Genbank, path string) error {
	// Build currently always returns a nil error, but the error is checked so
	// that a future failure is not silently written out as a partial file.
	gbk, err := Build(sequences)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, gbk, 0644)
}
// WriteMulti takes a slice of Genbank structs and a path string and writes out a multi genbank record to that path.
// The file is written with permissions 0644. Any error from building the
// records or from writing the file is returned.
func WriteMulti(sequences []Genbank, path string) error {
	// BuildMulti currently always returns a nil error, but the error is
	// checked so that a future failure is not silently written out.
	gbk, err := BuildMulti(sequences)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, gbk, 0644)
}
// Build builds a GBK byte slice to be written out to db or file.
// The single record is wrapped in a one-element slice and handed to buildMultiNth.
func Build(gbk Genbank) ([]byte, error) {
	return buildMultiNth([]Genbank{gbk}, -1)
}
// BuildMulti builds a MultiGBK byte slice to be written out to db or file.
// Every record in sequence is serialised, in order, by buildMultiNth.
func BuildMulti(sequence []Genbank) ([]byte, error) {
	return buildMultiNth(sequence, -1)
}
// buildMultiNth builds a MultiGBK byte slice to be written out to db or file.
//
// For every record it writes, in order: the LOCUS line, the standard metadata
// entries, the taxonomy, the references, the remaining Meta.Other entries, the
// FEATURES table, the ORIGIN sequence block and the terminating "//" line.
//
// Meta.Other entries are written in sorted key order so that building the same
// record twice yields identical bytes; ranging over the map directly would
// give an unspecified order.
//
// count is accepted for API symmetry but is not used: every record in
// sequences is written. The returned error is always nil.
func buildMultiNth(sequences []Genbank, count int) ([]byte, error) {
	var gbkString bytes.Buffer
	for _, sequence := range sequences {
		locus := sequence.Meta.Locus

		// shape stays empty when the locus is flagged neither circular nor linear.
		var shape string
		if locus.Circular {
			shape = "circular"
		} else if locus.Linear {
			shape = "linear"
		}

		fivespace := generateWhiteSpace(subMetaIndex)

		// building locus
		locusData := locus.Name + fivespace + locus.SequenceLength + " bp" + fivespace + locus.MoleculeType + fivespace + shape + fivespace + locus.GenbankDivision + fivespace + locus.ModificationDate
		gbkString.WriteString("LOCUS " + locusData + "\n")

		// building other standard meta features
		gbkString.WriteString(buildMetaString("DEFINITION", sequence.Meta.Definition))
		gbkString.WriteString(buildMetaString("ACCESSION", sequence.Meta.Accession))
		gbkString.WriteString(buildMetaString("VERSION", sequence.Meta.Version))
		gbkString.WriteString(buildMetaString("KEYWORDS", sequence.Meta.Keywords))
		gbkString.WriteString(buildMetaString("SOURCE", sequence.Meta.Source))
		gbkString.WriteString(buildMetaString(" ORGANISM", sequence.Meta.Organism))

		// taxonomy entries are separated by "; " and the last one ends with "."
		if len(sequence.Meta.Taxonomy) > 0 {
			var taxonomyString strings.Builder
			for i, taxonomyData := range sequence.Meta.Taxonomy {
				taxonomyString.WriteString(taxonomyData)
				if len(sequence.Meta.Taxonomy) == i+1 {
					taxonomyString.WriteString(".")
				} else {
					taxonomyString.WriteString("; ")
				}
			}
			gbkString.WriteString(buildMetaString("", taxonomyString.String()))
		}

		// building references
		// TODO: could use reflection to get keys and make more general.
		// NOTE(review): reference.Remark is parsed but never written here.
		for referenceIndex, reference := range sequence.Meta.References {
			gbkString.WriteString(buildMetaString("REFERENCE", fmt.Sprintf("%d %s", referenceIndex+1, reference.Range)))
			if reference.Authors != "" {
				gbkString.WriteString(buildMetaString(" AUTHORS", reference.Authors))
			}
			if reference.Title != "" {
				gbkString.WriteString(buildMetaString(" TITLE", reference.Title))
			}
			if reference.Journal != "" {
				gbkString.WriteString(buildMetaString(" JOURNAL", reference.Journal))
			}
			if reference.PubMed != "" {
				gbkString.WriteString(buildMetaString(" PUBMED", reference.PubMed))
			}
		}

		// building other meta fields that are catch all
		otherKeys := make([]string, 0, len(sequence.Meta.Other))
		for key := range sequence.Meta.Other {
			otherKeys = append(otherKeys, key)
		}
		sort.Strings(otherKeys) // deterministic output order
		for _, otherKey := range otherKeys {
			gbkString.WriteString(buildMetaString(otherKey, sequence.Meta.Other[otherKey]))
		}

		// start writing features section.
		gbkString.WriteString("FEATURES Location/Qualifiers\n")
		for _, feature := range sequence.Features {
			gbkString.WriteString(BuildFeatureString(feature))
		}

		// start writing sequence section.
		gbkString.WriteString("ORIGIN\n")

		// iterate over every character in sequence range.
		for index, base := range sequence.Sequence {
			switch {
			case index%60 == 0:
				// every 60th base starts a new line that begins with the
				// right-aligned 1-based index of its first base.
				if index != 0 {
					gbkString.WriteString("\n")
				}
				lineNumberString := strconv.Itoa(index + 1)          // genbank indexes at 1
				leadingWhiteSpaceLength := 9 - len(lineNumberString) // index is right-aligned in 9 columns
				for i := 0; i < leadingWhiteSpaceLength; i++ {
					gbkString.WriteString(" ")
				}
				gbkString.WriteString(lineNumberString + " ")
				gbkString.WriteRune(base)
			case index%10 == 0:
				// if base index is divisible by ten add a space (genbank convention)
				gbkString.WriteString(" ")
				gbkString.WriteRune(base)
			default:
				gbkString.WriteRune(base)
			}
		}

		// finish genbank file with "//" on newline (again a genbank convention)
		gbkString.WriteString("\n//\n")
	}
	return gbkString.Bytes(), nil
}
// Parse takes in a reader representing a single gbk/gb/genbank file and parses it into a Genbank struct.
// Only the first record is returned. An error is returned when parsing fails
// or when the input contains no complete record.
func Parse(r io.Reader) (Genbank, error) {
	genbankSlice, err := parseMultiNthFn(r, 1)
	if err != nil {
		return Genbank{}, err
	}
	// Input without any complete record yields an empty slice and a nil
	// error; indexing it would panic.
	if len(genbankSlice) == 0 {
		return Genbank{}, fmt.Errorf("no genbank record found in input")
	}
	return genbankSlice[0], nil
}
// ParseMulti takes in a reader representing a multi gbk/gb/genbank file and parses it into a slice of Genbank structs.
// On failure an empty, non-nil slice is returned together with the error.
func ParseMulti(r io.Reader) ([]Genbank, error) {
	parsed, err := parseMultiNthFn(r, -1)
	if err == nil {
		return parsed, nil
	}
	return []Genbank{}, err
}
// parseLoopParameters carries the state ParseMultiNth keeps between lines
// while it scans a genbank file.
type parseLoopParameters struct {
	newLocation      bool            // set to true by init and when the FEATURES section starts
	quoteActive      bool            // not read or written by the code in this file
	attribute        string          // name of the qualifier currently being collected
	attributeValue   string          // value collected so far for that qualifier
	sequenceBuilder  strings.Builder // accumulates the letters of the ORIGIN section
	parseStep        string          // "metadata", "features" or "sequence"
	genbank          Genbank         // since we are scanning lines we need a Genbank struct to store the data outside the loop.
	feature          Feature         // feature currently being collected
	features         []Feature       // features completed so far for the current record
	metadataTag      string          // metadata keyword currently being collected
	metadataData     []string        //this stutters but will remain to make it easier to batch rename variables when compared to parameters.metadataTag.
	genbankStarted   bool            // true between a LOCUS line and the record's closing "//"
	currentLine      string          // line being processed
	prevline         string          // line processed before currentLine
	multiLineFeature bool            // true while a feature location continues over several lines
}
// init puts the loop parameters into the state expected at the start of a
// record: metadata parsing, no record started yet, and empty (non-nil) maps
// for the feature attributes and the catch-all metadata. Other fields are left
// untouched.
func (params *parseLoopParameters) init() {
	params.genbankStarted = false
	params.newLocation = true
	params.parseStep = "metadata"
	params.genbank.Meta.Other = map[string]string{}
	params.feature.Attributes = map[string]string{}
}
// ParseMultiNth takes in a reader representing a multi gbk/gb/genbank file and parses the first n records into a slice of Genbank structs.
//
// Parsing stops as soon as count records have been completed; a count of zero
// or less parses every record. Lines before the first line containing "LOCUS"
// are skipped. Each record is read in three steps: "metadata" (everything up
// to FEATURES), "features" (up to ORIGIN) and "sequence" (up to "//").
//
// An error is returned for an empty metadata line, a malformed reference, a
// malformed feature line, an unparsable feature location, a sequence line
// shorter than two characters, or a read error reported by the scanner.
//
// Every feature's ParentSequence points at a copy of its own record that is
// not reused for later records, so Feature.GetSequence keeps working for all
// records of a multi-record input.
func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) {
	scanner := bufio.NewScanner(r)
	var genbanks []Genbank

	// Strips everything but letters from sequence lines. Compiled once here
	// rather than once per sequence line.
	nonLetterRegex := regexp.MustCompile("[^a-zA-Z]+")

	// Sequence setup
	var parameters parseLoopParameters
	parameters.init()

	// Loop through each line of the file
	for lineNum := 0; scanner.Scan(); lineNum++ {
		// get line from scanner and split it
		line := scanner.Text()
		splitLine := strings.Split(strings.TrimSpace(line), " ")

		prevline := parameters.currentLine
		parameters.currentLine = line
		parameters.prevline = prevline

		// keep scanning until we find the start of the first record
		if !parameters.genbankStarted {
			// We detect the beginning of a new genbank file with "LOCUS"
			if strings.Contains(line, "LOCUS") {
				parameters = parseLoopParameters{}
				parameters.init()
				parameters.genbank.Meta.Locus = parseLocus(line)
				parameters.genbankStarted = true
			}
			continue
		}

		switch parameters.parseStep {
		case "metadata":
			// Handle empty lines
			if len(line) == 0 {
				return genbanks, fmt.Errorf("Empty metadata line on line %d", lineNum)
			}

			// If we are currently reading a line, we need to figure out if it is a new meta line.
			if string(line[0]) != " " || parameters.metadataTag == "FEATURES" {
				// If this is true, it means we are beginning a new meta tag. In that case, let's save
				// the older data, and then continue along.
				switch parameters.metadataTag {
				case "DEFINITION":
					parameters.genbank.Meta.Definition = parseMetadata(parameters.metadataData)
				case "ACCESSION":
					parameters.genbank.Meta.Accession = parseMetadata(parameters.metadataData)
				case "VERSION":
					parameters.genbank.Meta.Version = parseMetadata(parameters.metadataData)
				case "KEYWORDS":
					parameters.genbank.Meta.Keywords = parseMetadata(parameters.metadataData)
				case "SOURCE":
					parameters.genbank.Meta.Source, parameters.genbank.Meta.Organism, parameters.genbank.Meta.Taxonomy = getSourceOrganism(parameters.metadataData)
				case "REFERENCE":
					reference, err := parseReferencesFn(parameters.metadataData)
					if err != nil {
						return []Genbank{}, fmt.Errorf("Failed in parsing reference above line %d. Got error: %w", lineNum, err)
					}
					parameters.genbank.Meta.References = append(parameters.genbank.Meta.References, reference)
				case "FEATURES":
					parameters.parseStep = "features"

					// We know that we are now parsing features, so lets initialize our first feature
					parameters.feature.Type = strings.TrimSpace(splitLine[0])
					parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1])
					parameters.newLocation = true
					continue
				default:
					if parameters.metadataTag != "" {
						parameters.genbank.Meta.Other[parameters.metadataTag] = parseMetadata(parameters.metadataData)
					}
				}

				parameters.metadataTag = strings.TrimSpace(splitLine[0])
				parameters.metadataData = []string{strings.TrimSpace(line[len(parameters.metadataTag):])}
			} else {
				parameters.metadataData = append(parameters.metadataData, line)
			}

		case "features":
			// Switch to sequence parsing
			originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN"
			if originFlag {
				parameters.parseStep = "sequence"

				// save our completed attribute / qualifier string to the current feature
				if parameters.attributeValue != "" {
					parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue
					parameters.features = append(parameters.features, parameters.feature)
					parameters.attributeValue = ""
					parameters.attribute = ""
					parameters.feature = Feature{}
					parameters.feature.Attributes = make(map[string]string)
				} else {
					parameters.features = append(parameters.features, parameters.feature)
				}

				// add our features to the genbank
				for _, feature := range parameters.features {
					location, err := parseLocation(feature.Location.GbkLocationString)
					if err != nil {
						return []Genbank{}, err
					}
					feature.Location = location
					err = parameters.genbank.AddFeature(&feature)
					if err != nil {
						return []Genbank{}, err
					}
				}
				continue
			} // end sequence parsing flag logic

			// check if current line contains anything but whitespace
			trimmedLine := strings.TrimSpace(line)
			if len(trimmedLine) < 1 {
				continue
			}

			// determine if current line is a new top level feature
			if countLeadingSpaces(parameters.currentLine) < countLeadingSpaces(parameters.prevline) || parameters.prevline == "FEATURES" {
				// save our completed attribute / qualifier string to the current feature
				if parameters.attributeValue != "" {
					parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue
					parameters.features = append(parameters.features, parameters.feature)
					parameters.attributeValue = ""
					parameters.attribute = ""
					parameters.feature = Feature{}
					parameters.feature.Attributes = make(map[string]string)
				}

				// checks for empty types
				if parameters.feature.Type != "" {
					parameters.features = append(parameters.features, parameters.feature)
				}

				parameters.feature = Feature{}
				parameters.feature.Attributes = make(map[string]string)

				// An initial feature line looks like this: `source 1..2686` with a type separated by its location
				if len(splitLine) < 2 {
					return genbanks, fmt.Errorf("Feature line malformed on line %d. Got line: %s", lineNum, line)
				}
				parameters.feature.Type = strings.TrimSpace(splitLine[0])
				parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1])
				parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier
			} else if !strings.Contains(parameters.currentLine, "/") { // current line is continuation of a feature or qualifier (sub-constituent of a feature)
				// if it's a continuation of the current feature, add it to the location
				if !strings.Contains(parameters.currentLine, "\"") && (countLeadingSpaces(parameters.currentLine) > countLeadingSpaces(parameters.prevline) || parameters.multiLineFeature) {
					parameters.feature.Location.GbkLocationString += strings.TrimSpace(line)
					parameters.multiLineFeature = true // without this we can't tell if something is a multiline feature or multiline qualifier
				} else { // it's a continued line of a qualifier
					removeAttributeValueQuotes := strings.Replace(trimmedLine, "\"", "", -1)
					parameters.attributeValue = parameters.attributeValue + removeAttributeValueQuotes
				}
			} else { // current line contains "/"
				// Split on the first "=" only, so that a value which itself
				// contains "=" is kept whole.
				splitAttribute := strings.SplitN(line, "=", 2)

				if len(splitAttribute) < 2 && !strings.HasPrefix(trimmedLine, "/") {
					// No "=" and no leading "/": this is continued qualifier
					// text that merely contains a slash, not a new qualifier.
					parameters.attributeValue += strings.Replace(trimmedLine, "\"", "", -1)
				} else { // current line is a new qualifier
					// save our completed attribute / qualifier string to the current feature
					if parameters.attributeValue != "" {
						parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue
					}
					parameters.attributeValue = ""

					trimmedSpaceAttribute := strings.TrimSpace(splitAttribute[0])
					parameters.attribute = strings.Replace(trimmedSpaceAttribute, "/", "", 1)

					if len(splitAttribute) == 2 {
						parameters.attributeValue = strings.Replace(splitAttribute[1], "\"", "", -1)
					} else {
						// A qualifier without a value (for example /pseudo).
						// Pending values are only saved when non-empty, so
						// record the flag right away with an empty value.
						parameters.feature.Attributes[parameters.attribute] = ""
					}
					parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier
				}
			}

		case "sequence":
			if len(line) < 2 { // throw error if line is malformed
				return genbanks, fmt.Errorf("Too short line found while parsing genbank sequence on line %d. Got line: %s", lineNum, line)
			} else if line[0:2] == "//" { // end of sequence
				parameters.genbank.Sequence = parameters.sequenceBuilder.String()

				// AddFeature pointed every feature at parameters.genbank,
				// which is overwritten when the next record starts. Give the
				// finished record its own storage and repoint its features.
				record := new(Genbank)
				*record = parameters.genbank
				for featureIndex := range record.Features {
					record.Features[featureIndex].ParentSequence = record
				}
				genbanks = append(genbanks, *record)

				parameters.genbankStarted = false
				parameters.sequenceBuilder.Reset()

				// stop once the requested number of records has been read
				if count > 0 && len(genbanks) >= count {
					return genbanks, nil
				}
			} else { // add line to total sequence
				parameters.sequenceBuilder.WriteString(nonLetterRegex.ReplaceAllString(line, ""))
			}

		default:
			log.Warnf("Unknown parse step: %s", parameters.parseStep)
			parameters.genbankStarted = false
		}
	}

	// Scan returns false both at end of input and on failure (for example a
	// line longer than the scanner's buffer); only Err tells them apart.
	if err := scanner.Err(); err != nil {
		return genbanks, err
	}
	return genbanks, nil
}
// countLeadingSpaces returns the number of space characters at the start of
// line. Only ' ' counts; tabs and other whitespace do not.
func countLeadingSpaces(line string) int {
	firstNonSpace := strings.IndexFunc(line, func(r rune) bool { return r != ' ' })
	if firstNonSpace < 0 {
		// the line is empty or consists of spaces only
		return len(line)
	}
	return firstNonSpace
}
// parseMetadata joins the lines of one metadata entry into a single string.
// Every line is trimmed of surrounding whitespace and the lines are joined
// with a single space. An empty input yields ".".
func parseMetadata(metadataData []string) string {
	if len(metadataData) == 0 {
		return "."
	}
	trimmedLines := make([]string, len(metadataData))
	for i, data := range metadataData {
		trimmedLines[i] = strings.TrimSpace(data)
	}
	return strings.Join(trimmedLines, " ")
}
// parseReferences turns the lines of one REFERENCE entry into a Reference.
//
// metadataData[0] is the text of the REFERENCE line itself; everything from
// its first "(" onwards is stored as the Range. Each following line either
// starts a new sub-entry (its fourth character is not a space) or continues
// the value of the current one. Lines of three characters or fewer are
// skipped.
//
// An error is returned when there are no lines at all, when the REFERENCE
// line is not followed by any sub-entry, or when a sub-entry key is not one
// that Reference.addKey accepts.
func parseReferences(metadataData []string) (Reference, error) {
	var reference Reference
	if len(metadataData) == 0 {
		return Reference{}, fmt.Errorf("Got reference with no information")
	}
	if rangeIndex := strings.Index(metadataData[0], "("); rangeIndex != -1 {
		reference.Range = metadataData[0][rangeIndex:]
	}
	if len(metadataData) == 1 {
		return Reference{}, fmt.Errorf("Got reference with no additional information")
	}

	// splitKeyValue separates "  KEY   value" into "KEY" and "value". The
	// value is located relative to the trimmed line, so it does not depend on
	// how far the key is indented and cannot slice past the end of the line.
	splitKeyValue := func(line string) (string, string) {
		trimmed := strings.TrimSpace(line)
		key := strings.Split(trimmed, " ")[0]
		return key, strings.TrimSpace(trimmed[len(key):])
	}

	referenceKey, referenceValue := splitKeyValue(metadataData[1])
	for index := 2; index < len(metadataData); index++ {
		line := metadataData[index]
		if len(line) <= 3 {
			continue
		}
		if line[3] == ' ' {
			// Continuation line: append it to the current value.
			referenceValue = referenceValue + " " + strings.TrimSpace(line)
			continue
		}
		// A new sub-entry starts: store the finished one first.
		if err := reference.addKey(referenceKey, referenceValue); err != nil {
			return reference, err
		}
		referenceKey, referenceValue = splitKeyValue(line)
	}

	if err := reference.addKey(referenceKey, referenceValue); err != nil {
		return reference, err
	}
	return reference, nil
}
// addKey stores referenceValue in the field of reference selected by
// referenceKey (AUTHORS, TITLE, JOURNAL, PUBMED or REMARK). Any other key
// leaves reference unchanged and returns an error.
func (reference *Reference) addKey(referenceKey string, referenceValue string) error {
	var field *string
	switch referenceKey {
	case "AUTHORS":
		field = &reference.Authors
	case "TITLE":
		field = &reference.Title
	case "JOURNAL":
		field = &reference.Journal
	case "PUBMED":
		field = &reference.PubMed
	case "REMARK":
		field = &reference.Remark
	}
	if field == nil {
		return fmt.Errorf("ReferenceKey not in [AUTHORS, TITLE, JOURNAL, PUBMED, REMARK]. Got: %s", referenceKey)
	}
	*field = referenceValue
	return nil
}
// genBankMoleculeTypes lists the molecule type strings that parseLocus looks
// for on a LOCUS line.
var genBankMoleculeTypes = []string{
	"DNA",
	"genomic DNA",
	"genomic RNA",
	"mRNA",
	"tRNA",
	"rRNA",
	"other RNA",
	"other DNA",
	"transcribed RNA",
	"viral cRNA",
	"unassigned DNA",
	"unassigned RNA",
}
// genbankDivisions lists the three-letter division codes that parseLocus looks
// for on a LOCUS line. It is used in parseLocus, though it could be useful
// elsewhere.
var genbankDivisions = []string{
	"PRI", //primate sequences
	"ROD", //rodent sequences
	"MAM", //other mammalian sequences
	"VRT", //other vertebrate sequences
	"INV", //invertebrate sequences
	"PLN", //plant, fungal, and algal sequences
	"BCT", //bacterial sequences
	"VRL", //viral sequences
	"PHG", //bacteriophage sequences
	"SYN", //synthetic sequences
	"UNA", //unannotated sequences
	"EST", //EST sequences (expressed sequence tags)
	"PAT", //patent sequences
	"STS", //STS sequences (sequence tagged sites)
	"GSS", //GSS sequences (genome survey sequences)
	"HTG", //HTG sequences (high-throughput genomic sequences)
	"HTC", //unfinished high-throughput cDNA sequencing
	"ENV", //environmental sampling sequences
}
// TODO rewrite with proper error handling.
// parseLocus parses locus from provided string.
//
// Every field is looked up independently and is left at its zero value when
// it cannot be found, so a short or unusual LOCUS line never panics:
//   - Name is the second whitespace-separated field.
//   - SequenceLength and SequenceCoding come from the first " <digits> <two
//     word characters> " match (for example " 2686 bp ").
//   - MoleculeType is the longest entry of genBankMoleculeTypes contained in
//     the line, so "genomic DNA" is not reported as plain "DNA".
//   - Circular and Linear are set when " circular " / " linear " occur.
//   - GenbankDivision is the first entry of genbankDivisions contained in the line.
//   - ModificationDate is the first dd-MMM-yyyy match.
func parseLocus(locusString string) Locus {
	locus := Locus{}

	// Both patterns are constants, so compiling them cannot fail.
	basePairRegex := regexp.MustCompile(` \d* \w{2} `)
	modificationDateRegex := regexp.MustCompile(`\d{2}-[A-Z]{3}-\d{4}`)

	// The first field is the LOCUS keyword itself; the name follows it.
	if fields := strings.Fields(locusString); len(fields) > 1 {
		locus.Name = fields[1]
	}

	// sequence length and coding
	if baseSequenceLength := basePairRegex.FindString(locusString); baseSequenceLength != "" {
		splitBaseSequenceLength := strings.Split(strings.TrimSpace(baseSequenceLength), " ")
		if len(splitBaseSequenceLength) == 2 {
			locus.SequenceLength = splitBaseSequenceLength[0]
			locus.SequenceCoding = splitBaseSequenceLength[1]
		}
	}

	// molecule type: prefer the most specific (longest) match; among matches
	// of equal length the one listed first wins.
	for _, moleculeType := range genBankMoleculeTypes {
		if len(moleculeType) > len(locus.MoleculeType) && strings.Contains(locusString, moleculeType) {
			locus.MoleculeType = moleculeType
		}
	}

	// circularity flag
	locus.Circular = strings.Contains(locusString, " circular ")
	locus.Linear = strings.Contains(locusString, " linear ")

	// genbank division
	for _, genbankDivision := range genbankDivisions {
		if strings.Contains(locusString, genbankDivision) {
			locus.GenbankDivision = genbankDivision
			break
		}
	}

	// ModificationDate
	locus.ModificationDate = modificationDateRegex.FindString(locusString)

	return locus
}
// Indices for points of interest on a gbk line.

// subMetaIndex is the number of spaces written before a feature type and
// between the fields of the LOCUS line.
const subMetaIndex = 5

// qualifierIndex is the number of spaces written before a feature qualifier.
const qualifierIndex = 21
// getSourceOrganism splits the lines of a SOURCE entry into the source text,
// the organism name and the taxonomy.
//
// The first line, trimmed, is the source. A later line whose first word is
// ORGANISM supplies the organism (the text after that word). Every other line
// is split on ";" into taxonomy entries; entries of one character or less are
// dropped and a single trailing "." is removed from each entry.
func getSourceOrganism(metadataData []string) (string, string, []string) {
	var (
		organism string
		taxonomy []string
	)
	source := strings.TrimSpace(metadataData[0])

	for _, dataLine := range metadataData[1:] {
		trimmedLine := strings.TrimSpace(dataLine)

		if strings.Split(trimmedLine, " ")[0] == "ORGANISM" {
			afterKeyword := strings.Index(dataLine, `ORGANISM`) + len("ORGANISM")
			organism = strings.TrimSpace(dataLine[afterKeyword:])
			continue
		}

		for _, entry := range strings.Split(trimmedLine, ";") {
			entry = strings.TrimSpace(entry)
			if len(entry) <= 1 {
				continue
			}
			// Taxonomy ends with a ".", which is not part of the entry.
			taxonomy = append(taxonomy, strings.TrimSuffix(entry, "."))
		}
	}
	return source, organism, taxonomy
}
// parseLocation parses a genbank location string into a Location.
//
// Supported forms:
//   - "x"               a single position, stored as Start = End = x
//   - "a..b"            a range, stored as Start = a-1, End = b; "<" and ">"
//     markers are stripped before conversion
//   - "join(p,q,...)"   Join is set and every operand becomes a sub-location;
//     operands may themselves be complement(...) or join(...)
//   - "complement(e)"   e is parsed and returned with Complement set and the
//     full original text as its GbkLocationString
//
// FivePrimePartial / ThreePrimePartial are set when the string contains "<" /
// ">". An error, rather than a panic, is returned for a number that cannot be
// converted, a malformed range, unbalanced parentheses or an operator other
// than join and complement.
//
// NOTE(review): the single-position form stores Start = End = x, which is an
// empty slice in getFeatureSequence and is written back as "x+1..x" by
// BuildLocationString. Kept as is; confirm whether Start should be x-1.
func parseLocation(locationString string) (Location, error) {
	var location Location
	location.GbkLocationString = locationString

	if !strings.Contains(locationString, "(") { // Case checks for simple expression of x..x
		if !strings.Contains(locationString, ".") { // Case checks for simple expression x
			position, err := strconv.Atoi(locationString)
			if err != nil {
				return Location{}, err
			}
			location = Location{Start: position, End: position}
		} else {
			startEndSplit := strings.Split(locationString, "..")
			if len(startEndSplit) < 2 {
				return Location{}, fmt.Errorf("malformed location range %q", locationString)
			}
			// to remove FivePrimePartial and ThreePrimePartial indicators from start and end before converting to int.
			stripPartial := strings.NewReplacer("<", "", ">", "")
			start, err := strconv.Atoi(stripPartial.Replace(startEndSplit[0]))
			if err != nil {
				return Location{}, err
			}
			end, err := strconv.Atoi(stripPartial.Replace(startEndSplit[1]))
			if err != nil {
				return Location{}, err
			}
			location = Location{Start: start - 1, End: end}
		}
	} else {
		firstOuterParentheses := strings.Index(locationString, "(")
		lastOuterParentheses := strings.LastIndex(locationString, ")")
		if lastOuterParentheses < firstOuterParentheses {
			return Location{}, fmt.Errorf("unbalanced parentheses in location %q", locationString)
		}
		expression := locationString[firstOuterParentheses+1 : lastOuterParentheses]

		switch command := locationString[:firstOuterParentheses]; command {
		case "join":
			location.Join = true
			// Split the operands on commas that are not nested inside
			// parentheses, so operands such as complement(1..2) or
			// join(1..2,3..4) stay in one piece wherever they appear.
			depth, operandStart := 0, 0
			for i := 0; i <= len(expression); i++ {
				if i < len(expression) {
					switch expression[i] {
					case '(':
						depth++
					case ')':
						depth--
					}
					if expression[i] != ',' || depth != 0 {
						continue
					}
				}
				// i is either a top-level comma or the end of the expression.
				subLocation, err := parseLocation(expression[operandStart:i])
				if err != nil {
					return Location{}, err
				}
				location.SubLocations = append(location.SubLocations, subLocation)
				operandStart = i + 1
			}
		case "complement":
			subLocation, err := parseLocation(expression)
			if err != nil {
				return Location{}, err
			}
			subLocation.Complement = true
			subLocation.GbkLocationString = locationString
			location.SubLocations = append(location.SubLocations, subLocation)
		default:
			return Location{}, fmt.Errorf("unsupported location operator %q in %q", command, locationString)
		}
	}

	if strings.Contains(locationString, "<") {
		location.FivePrimePartial = true
	}
	if strings.Contains(locationString, ">") {
		location.ThreePrimePartial = true
	}

	// if excess root node then trim node. Maybe should just be handled with second arg?
	// This unwraps the placeholder built for complement(...); the length check
	// keeps a plain location with Start = End = 0 from indexing an empty slice.
	if location.Start == 0 && location.End == 0 && !location.Join && !location.Complement && len(location.SubLocations) > 0 {
		location = location.SubLocations[0]
	}

	return location, nil
}
// buildMetaString is a helper function to build the meta section of genbank files.
func buildMetaString(name string, data string) string {
keyWhitespaceTrailLength := 12 - len(name) // I wish I was kidding.
var keyWhitespaceTrail string
for i := 0; i < keyWhitespaceTrailLength; i++ {
keyWhitespaceTrail += " "
}
name += keyWhitespaceTrail
wrappedData := wordwrap.WrapString(data, 68)
splitData := strings.Split(wrappedData, "\n")
var returnData string
for index, datum := range splitData {
if index == 0 {
returnData = name + datum + "\n"
} else {
returnData += generateWhiteSpace(12) + datum + "\n"
}
}
return returnData
}
// BuildLocationString is a recursive function that takes a location object and creates a gbk location string for Build()
//
//   - A location with Complement set is written as complement(...) around the
//     same location without the flag.
//   - Otherwise a location with Join set is written as join(...) around its
//     comma-separated sub-locations.
//   - Anything else is written as the range "Start+1..End". A 5' partial
//     location gets "<" in front of the start and a 3' partial location gets
//     ">" in front of the end (for example "<1..>200"), which is where the
//     feature table format places the markers.
func BuildLocationString(location Location) string {
	switch {
	case location.Complement:
		// location is a copy, so clearing the flag does not affect the caller.
		location.Complement = false
		return "complement(" + BuildLocationString(location) + ")"
	case location.Join:
		subLocationStrings := make([]string, 0, len(location.SubLocations))
		for _, sublocation := range location.SubLocations {
			subLocationStrings = append(subLocationStrings, BuildLocationString(sublocation))
		}
		return "join(" + strings.Join(subLocationStrings, ",") + ")"
	default:
		start := strconv.Itoa(location.Start + 1)
		end := strconv.Itoa(location.End)
		if location.FivePrimePartial {
			start = "<" + start
		}
		if location.ThreePrimePartial {
			end = ">" + end
		}
		return start + ".." + end
	}
}
// BuildFeatureString is a helper function to build gbk feature strings for Build()
//
// The first line holds the feature type, padded to 16 characters, followed by
// the location. The location is the feature's GbkLocationString when that is
// set and is otherwise rebuilt with BuildLocationString. One line per
// qualifier follows, in the form /name="value".
//
// Qualifiers are written in sorted name order so that the same feature always
// produces the same text; ranging over the map directly would give an
// unspecified order.
func BuildFeatureString(feature Feature) string {
	whiteSpaceTrailLength := 16 - len(feature.Type) // feature types are padded to 16 columns
	whiteSpaceTrail := generateWhiteSpace(whiteSpaceTrailLength)

	location := feature.Location.GbkLocationString
	if location == "" {
		location = BuildLocationString(feature.Location)
	}

	var featureString strings.Builder
	featureString.WriteString(generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n")

	qualifierKeys := make([]string, 0, len(feature.Attributes))
	for key := range feature.Attributes {
		qualifierKeys = append(qualifierKeys, key)
	}
	sort.Strings(qualifierKeys) // deterministic output order

	qualifierIndent := generateWhiteSpace(qualifierIndex)
	for _, qualifier := range qualifierKeys {
		featureString.WriteString(qualifierIndent + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n")
	}
	return featureString.String()
}
// generateWhiteSpace returns a string of length spaces. A length of zero or
// less yields the empty string.
func generateWhiteSpace(length int) string {
	if length <= 0 {
		return ""
	}
	return strings.Repeat(" ", length)
}
/******************************************************************************
GBK specific IO related things end here.
******************************************************************************/