TimothyStiles/poly

View on GitHub
io/fasta/fasta.go

Summary

Maintainability
A
0 mins
Test Coverage
/*
Package fasta contains fasta parsers and writers.

Fasta is a flat text file format developed in 1985 to store nucleotide and
amino acid sequences. It is extremely simple and well-supported across many
languages. However, this simplicity means that annotation of genetic objects
is not supported.

This package provides a parser and writer for working with Fasta formatted
genetic sequences.
*/
package fasta

import (
    "bufio"
    "bytes"
    "compress/gzip"
    "io"
    "io/ioutil"
    "os"
    "strings"
)

/******************************************************************************
Apr 25, 2021

Fasta Parser begins here

Many thanks to Jordan Campbell (https://github.com/0x106) for building the first
parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles)
for helping complete that PR. This work expands on the previous work by allowing
for concurrent FASTA parsing and giving Poly a specific FASTA parser subpackage,
as well as a few bug fixes.

Fasta is a very simple file format for working with DNA, RNA, or protein sequences.
It was first released in 1985 and is still widely used in bioinformatics.

https://en.wikipedia.org/wiki/FASTA_format

One interesting use of the concurrent FASTA parser is working with the Uniprot
fasta dump files, which are far too large to fit into RAM. This parser is able
to easily handle those files by doing computation actively while the data dump
is getting parsed.

https://www.uniprot.org/downloads

I have removed the FASTA Parsers from the io.go file and moved them into this
subpackage.

Hack the Planet,

Keoni

******************************************************************************/

// Package-level function variables through which the read and write helpers in
// this file reach their dependencies.
// NOTE(review): presumably these exist so tests can substitute failing
// implementations — confirm against the test suite.
var (
    // gzipReaderFn wraps an opened file in a gzip decompressor; called by
    // ReadGz and ReadGzConcurrent.
    gzipReaderFn = gzip.NewReader
    // openFn opens the file at a path; called by Read and ReadGz.
    openFn       = os.Open
    // buildFn serializes records to bytes; called by Write.
    buildFn      = Build
)

// Fasta is a struct representing a single Fasta file element with a Name and its corresponding Sequence.
// The struct tags set the key names used when a record is encoded as JSON.
type Fasta struct {
    // Name is the record's header line with the leading '>' removed.
    Name     string `json:"name"`
    // Sequence is the record's sequence lines joined with no separator.
    Sequence string `json:"sequence"`
}

// Parse reads every Fasta record from r and returns them in the order they
// were received.
//
// The actual parsing is done by ParseConcurrent, which is started in its own
// goroutine and feeds a buffered channel; this function drains that channel
// until it is closed. The returned slice is nil when no records are received,
// and the returned error is always nil.
func Parse(r io.Reader) ([]Fasta, error) {
    // The buffer lets the parsing goroutine run ahead of the append loop below.
    records := make(chan Fasta, 1000)
    go ParseConcurrent(r, records)

    var parsed []Fasta
    for {
        record, open := <-records
        if !open {
            return parsed, nil
        }
        parsed = append(parsed, record)
    }
}

// ParseConcurrent parses Fasta-formatted text from r and sends each record on
// sequences as soon as it is complete, closing the channel once the input is
// exhausted.
//
// Blank lines and lines starting with ';' (comments) are skipped. A line
// starting with '>' begins a new record named by the rest of that line; every
// other line is appended to the current record's sequence. Lines may be of any
// length, and both "\n" and "\r\n" line endings are accepted.
//
// Input that contains neither a header nor a sequence line produces no
// records. Read errors cannot be reported through this signature: parsing
// stops at the first error and whatever was collected so far is still sent.
//
// Sends block while the channel is full, so this function is normally started
// in its own goroutine while the caller ranges over the channel.
func ParseConcurrent(r io.Reader, sequences chan<- Fasta) {
    // Closing on every exit path guarantees that a receiver ranging over the
    // channel always terminates.
    defer close(sequences)

    var (
        name          string
        sequenceLines []string
        seenHeader    bool
    )

    // bufio.Reader is used instead of bufio.Scanner because Scanner stops at
    // the first line longer than 64 KiB, which silently cut the parse short
    // for files that keep a whole sequence on one line.
    reader := bufio.NewReader(r)
    for {
        raw, err := reader.ReadString('\n')
        line := strings.TrimSuffix(strings.TrimSuffix(raw, "\n"), "\r")

        switch {
        case line == "" || line[0] == ';':
            // Blank line or comment: nothing to record.
        case line[0] != '>':
            // Sequence data belonging to the current record.
            sequenceLines = append(sequenceLines, line)
        case seenHeader:
            // A new header terminates the record collected so far.
            sequences <- Fasta{Name: name, Sequence: strings.Join(sequenceLines, "")}
            sequenceLines = nil
            name = line[1:]
        default:
            // First header of the input. Sequence lines that appeared before
            // it are kept and become part of this first record.
            name = line[1:]
            seenHeader = true
        }

        if err != nil {
            // io.EOF or a read failure: either way there is nothing more to read.
            break
        }
    }

    // Flush the last record, unless the input held nothing at all.
    if seenHeader || len(sequenceLines) > 0 {
        sequences <- Fasta{Name: name, Sequence: strings.Join(sequenceLines, "")}
    }
}

/******************************************************************************

Start of FASTA Read functions

******************************************************************************/

// ReadGzConcurrent opens the gzipped Fasta file at path and streams its records
// into sequences from a background goroutine, returning immediately.
//
// The channel is always closed: by ParseConcurrent once parsing finishes, or by
// this function, without sending any record, when the file cannot be opened or
// does not start with a valid gzip header. The signature has no error return,
// so those failures are only observable as an empty, closed channel.
func ReadGzConcurrent(path string, sequences chan<- Fasta) {
    file, err := os.Open(path)
    if err != nil {
        close(sequences)
        return
    }

    reader, err := gzipReaderFn(file)
    if err != nil {
        file.Close()
        close(sequences)
        return
    }

    go func() {
        // Release both handles only after parsing is done; closing them when
        // this function returns would race with the goroutine's reads.
        defer file.Close()
        defer reader.Close()
        ParseConcurrent(reader, sequences)
    }()
}

// ReadConcurrent opens the flat Fasta file at path and streams its records into
// sequences from a background goroutine, returning immediately.
//
// The channel is always closed: by ParseConcurrent once parsing finishes, or by
// this function, without sending any record, when the file cannot be opened.
// The signature has no error return, so an open failure is only observable as
// an empty, closed channel.
func ReadConcurrent(path string, sequences chan<- Fasta) {
    file, err := os.Open(path)
    if err != nil {
        close(sequences)
        return
    }

    go func() {
        // The handle is released once parsing has consumed the file.
        defer file.Close()
        ParseConcurrent(file, sequences)
    }()
}

// ReadGz reads the gzipped Fasta file at path into a slice of Fasta structs.
//
// It returns the error from opening the file or from reading the gzip header,
// if either fails; otherwise it returns whatever Parse returns for the
// decompressed stream. The file and the gzip reader are closed before the
// function returns.
func ReadGz(path string) ([]Fasta, error) {
    file, err := openFn(path)
    if err != nil {
        return nil, err
    }
    // Parse drains its reader completely before returning, so both handles
    // can be released on the way out.
    defer file.Close()

    reader, err := gzipReaderFn(file)
    if err != nil {
        return nil, err
    }
    defer reader.Close()

    return Parse(reader)
}

// Read reads the Fasta file at path into a slice of Fasta structs.
//
// It returns the error from opening the file if that fails; otherwise it
// returns whatever Parse returns for the file's contents. The file is closed
// before the function returns.
func Read(path string) ([]Fasta, error) {
    file, err := openFn(path)
    if err != nil {
        return nil, err
    }
    // Parse drains its reader completely before returning, so the handle can
    // be released on the way out.
    defer file.Close()

    return Parse(file)
}

/******************************************************************************

Start of FASTA Write functions

******************************************************************************/

// Build serializes fastas into Fasta-formatted bytes.
//
// Each record becomes two lines: a header consisting of '>' followed by Name,
// then the entire Sequence on a single line (no wrapping is applied). Records
// appear in slice order. The returned error is always nil.
func Build(fastas []Fasta) ([]byte, error) {
    var out bytes.Buffer
    for i := range fastas {
        record := &fastas[i]
        out.WriteString(">" + record.Name + "\n")
        out.WriteString(record.Sequence + "\n")
    }
    return out.Bytes(), nil
}

// Write serializes fastas with buildFn and stores the result in the file at
// path, which is written with permission bits 0644.
//
// It returns the error from buildFn if serialization fails, in which case no
// file is written; otherwise it returns the result of ioutil.WriteFile.
func Write(fastas []Fasta, path string) error {
    const filePerm = 0644

    serialized, err := buildFn(fastas)
    if err != nil {
        return err
    }
    return ioutil.WriteFile(path, serialized, filePerm)
}