decoder.go from elliotchance/gedcom

decoder.go
Summary

Maintainability

5 hrs
Test Coverage

Issues
// Decoding a Document
//
// Decoding a GEDCOM stream:
//
//   ged := "0 HEAD\n1 CHAR UTF-8"
//
//   decoder := gedcom.NewDecoder(strings.NewReader(ged))
//   document, err := decoder.Decode()
//   if err != nil {
//     panic(err)
//   }
//
// If you are reading from a file you can use NewDocumentFromGEDCOMFile:
//
//   document, err := gedcom.NewDocumentFromGEDCOMFile("family.ged")
//   if err != nil {
//       panic(err)
//   }
//
package gedcom

import (
    "bufio"
    "bytes"
    "fmt"
    "io"
    "regexp"
    "strconv"
    "strings"
)

// See Decoder.consumeOptionalBOM().
var byteOrderMark = []byte{0xef, 0xbb, 0xbf}

// Decoder represents a GEDCOM decoder.
type Decoder struct {
    r *bufio.Reader

    // It is not valid for GEDCOM values to contain new lines or carriage
    // returns. However, some application dump data without correctly using the
    // CONT tags.
    //
    // Strictly speaking we should bail out with an error but there are too many
    // cases that are difficult to clean up for consumers so we offer and option
    // to permit it.
    //
    // When enabled any line than cannot be parsed will be considered an
    // extension of the previous line (including the new line character).
    AllowMultiLine bool

    // AllowInvalidIndents allows a child node to have an indent greater than +1
    // of the parent. AllowInvalidIndents is disabled by default because if this
    // happens the GEDCOM file is broken in some possibly serious way and
    // certainly not a valid GEDCOM file.
    //
    // The biggest problem with having the indents wrongly aligned is that nodes
    // that are expected to be a certain depth (such as NPFX inside a NAME) will
    // probably break or interfere with a traversal algorithm that is not
    // expecting the node to be there/at that level.
    //
    // Another important thing to note is that the incorrect indent level will
    // not be retained when writing the Document back to a GEDCOM.
    AllowInvalidIndents bool
}

// Create a new decoder to parse a reader that contain GEDCOM data.
func NewDecoder(r io.Reader) *Decoder {
    return &Decoder{
        r: bufio.NewReader(r),
    }
}

// Decode will parse the entire GEDCOM stream (until EOF is reached) and return
// a Document. If the GEDCOM stream is not valid then the document node will
// be nil and the error is returned.
//
// A blank GEDCOM or a GEDCOM that only contains empty lines is valid and a
// Document will be returned with zero nodes.
func (dec *Decoder) Decode() (*Document, error) {
    document := NewDocument()
    indents := Nodes{}
    var family *FamilyNode

    document.HasBOM = dec.consumeOptionalBOM()

    finished := false
    lineNumber := 0

    // Only used when AllowMultiLine is enabled.
    var previousNode Node

    for !finished {
        lineNumber++

        line, err := dec.readLine()
        if err != nil {
            if err != io.EOF {
                return nil, err
            }

            finished = true
        }

        // Skip blank lines.
        if line == "" {
            if dec.AllowMultiLine && previousNode != nil {
                previousNode.RawSimpleNode().value += "\n"
            }

            continue
        }

        node, indent, err := parseLine(line, document, family)
        if err != nil {
            if dec.AllowMultiLine && previousNode != nil {
                previousNode.RawSimpleNode().value += "\n" + line
                continue
            }

            return nil, fmt.Errorf("line %d: %s", lineNumber, err)
        }

        // Families cannot be nested so any children that appear after this node
        // will be attached to the most recently seen family. We do not need to
        // set this back to nil after we exit the family node.
        if f, ok := node.(*FamilyNode); ok {
            family = f
        }

        // Add a root node to the document.
        if indent == 0 {
            dec.trimNodeValue(previousNode)
            document.AddNode(node)
            previousNode = node

            // There can be multiple root nodes so make sure we always reset all
            // indent pointers.
            indents = Nodes{node}

            continue
        }

        if indent-1 >= len(indents) {
            // This means the file is not valid. I have seen it in very rare
            // cases. See full explanation in AllowInvalidIndents.
            if dec.AllowInvalidIndents {
                indent = len(indents)
            } else {
                panic(fmt.Sprintf(
                    "indent is too large - missing parent? at line %d: %s",
                    lineNumber, line))
            }
        }

        i := indents[indent-1]

        switch {
        case indent >= len(indents):
            // Descending one level. It is not valid for a child to have an
            // indent that is more than one greater than the parent. This would
            // be a corrupt GEDCOM and lead to a panic.
            indents = append(indents, node)

        case indent < len(indents)-1:
            // Moving back to a parent. It is possible for this leap to be
            // greater than one so trim the indent levels back as many times as
            // needed to represent the new indent level.
            indents = indents[:indent+1]
            indents[indent] = node

        default:
            // This case would be "indent == len(indents)-1" (the indent does
            // not change from the previous line). However, since it is the only
            // other logical possibility there is no need to evaluate it for the
            // case condition.
            //
            // Make sure we update the current indent with the new node so that
            // children get place on this node and not the previous one.
            indents[indent] = node
        }

        dec.trimNodeValue(previousNode)
        i.AddNode(node)

        previousNode = node
    }

    dec.trimNodeValue(previousNode)

    // Build the cache once.
    document.buildPointerCache()

    return document, nil
}

func (dec *Decoder) trimNodeValue(previousNode Node) {
    // When AllowMultiLine is enabled we have to be careful to trim the
    // surrounding spaces off the value so it can be interpreted correct.
    //
    // Another solution would be to ignore blank lines entirely, but then we
    // would miss the paragraph separators in multiline text.
    if !IsNil(previousNode) {
        newValue := strings.TrimSpace(previousNode.RawSimpleNode().value)
        previousNode.RawSimpleNode().value = newValue
    }
}

func (dec *Decoder) readLine() (string, error) {
    buf := new(bytes.Buffer)

    for {
        b, err := dec.r.ReadByte()
        if err != nil {
            return string(buf.Bytes()), err
        }

        // The line endings in the GEDCOM files can be different. A newline and
        // carriage return are both considered to be the end of the line and
        // empty lines are ignored so we can treat both of these characters as
        // independent line terminators.
        if b == '\n' || b == '\r' {
            break
        }

        buf.WriteByte(b)
    }

    return string(buf.Bytes()), nil
}

var lineRegexp = regexp.MustCompile(`^(\d) +(@[^@]+@ )?(\w+) ?(.*)?$`)

func parseLine(line string, document *Document, family *FamilyNode) (Node, int, error) {
    parts := lineRegexp.FindStringSubmatch(line)

    if len(parts) == 0 {
        return nil, 0, fmt.Errorf("could not parse: %s", line)
    }

    // Indent (required).
    indent, _ := strconv.Atoi(parts[1])

    // Pointer (optional).
    pointer := ""
    if parts[2] != "" {
        // Trim off the surrounding '@'.
        pointer = parts[2][1 : len(parts[2])-2]
    }

    // Tag (required).
    tag := TagFromString(parts[3])

    // Value (optional).
    value := parts[4]

    return newNode(document, family, tag, value, pointer), indent, nil
}

// NewNode creates a node with no children. It is also the correct way to
// create a shallow copy of a node.
//
// If the node tag is recognised as a more specific type, such as *DateNode then
// that will be returned. Otherwise a *SimpleNode will be used.
func NewNode(tag Tag, value, pointer string, children ...Node) Node {
    return newNodeWithChildren(nil, nil, tag, value, pointer, children)
}

func newNode(document *Document, family *FamilyNode, tag Tag, value, pointer string) Node {
    return newNodeWithChildren(document, family, tag, value, pointer, nil)
}

func newNodeWithChildren(document *Document, family *FamilyNode, tag Tag, value, pointer string, children Nodes) Node {
    var node Node

    switch tag {
    case TagBaptism:
        node = NewBaptismNode(value, children...)

    case TagBirth:
        node = NewBirthNode(value, children...)

    case TagBurial:
        node = NewBurialNode(value, children...)

    case TagChild:
        needsFamily(family, tag)

        node = newChildNode(family, value, children...)

    case TagDate:
        node = NewDateNode(value, children...)

    case TagDeath:
        node = NewDeathNode(value, children...)

    case TagEvent:
        node = NewEventNode(value, children...)

    case TagFamily:
        needsDocument(document, tag)

        node = newFamilyNode(document, pointer, children...)

    case UnofficialTagFamilySearchID1, UnofficialTagFamilySearchID2:
        node = NewFamilySearchIDNode(tag, value, children...)

    case TagFormat:
        node = NewFormatNode(value, children...)

    case TagHusband:
        needsFamily(family, tag)

        node = newHusbandNode(family, value, children...)

    case TagIndividual:
        needsDocument(document, tag)

        node = newIndividualNode(document, pointer, children...)

    case TagLatitude:
        node = NewLatitudeNode(value, children...)

    case TagLongitude:
        node = NewLongitudeNode(value, children...)

    case TagMap:
        node = NewMapNode(value, children...)

    case TagName:
        node = NewNameNode(value, children...)

    case TagNickname:
        node = NewNicknameNode(value, children...)

    case TagNote:
        node = NewNoteNode(value, children...)

    case TagPhonetic:
        node = NewPhoneticVariationNode(value, children...)

    case TagPlace:
        node = NewPlaceNode(value, children...)

    case TagResidence:
        node = NewResidenceNode(value, children...)

    case TagRomanized:
        node = NewRomanizedVariationNode(value, children...)

    case TagSex:
        node = NewSexNode(value)

    case TagSource:
        node = NewSourceNode(value, pointer, children...)

    case TagType:
        node = NewTypeNode(value, children...)

    case UnofficialTagUniqueID:
        node = NewUniqueIDNode(value, children...)

    case TagWife:
        needsFamily(family, tag)

        node = newWifeNode(family, value, children...)
    }

    if IsNil(node) {
        node = newSimpleNode(tag, value, pointer, children...)
    } else {
        simpleNode := node.RawSimpleNode()
        simpleNode.pointer = pointer
    }

    return node
}

func needsDocument(document *Document, tag Tag) {
    if document == nil {
        panic(fmt.Sprintf("cannot create %s without a document", tag))
    }
}

func needsFamily(family *FamilyNode, tag Tag) {
    if family == nil {
        panic(fmt.Sprintf("cannot create %s without a family", tag))
    }
}

// consumeOptionalBOM will test and discard the Byte Order Mark at the start of
// the stream.
//
// In order to keep the original stream as intact as possible when encoding the
// BOM will be written back if it existed originally.
//
// Use of a BOM is neither required nor recommended for UTF-8, but may be
// encountered in contexts where UTF-8 data is converted from other encoding
// forms that use a BOM or where the BOM is used as a UTF-8 signature. See the
// “Byte Order Mark” subsection in Section 16.8, Specials, for more information.
// - 2.6 Encoding Schemes, http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf
func (dec *Decoder) consumeOptionalBOM() bool {
    possibleBOM, _ := dec.r.Peek(3)
    hasBOM := bytes.Compare(possibleBOM, byteOrderMark) == 0

    if hasBOM {
        dec.r.Discard(3)
    }

    return hasBOM
}