ReanGD/go-web-search

View on GitHub
import_dict.go

Summary

Maintainability
A
2 hrs
Test Coverage
package main

import (
    "bufio"
    "bytes"
    "errors"
    "fmt"
    "os"
    "strconv"
    "unicode"
    "unicode/utf8"
)

// tokenType identifies the kind of token the dictionary tokenizer produced
// for a single input line.
type tokenType uint8

// Token kinds, numbered consecutively starting at zero.
const (
    tokenText   tokenType = iota // line carrying text
    tokenNumber                  // line carrying a number
    tokenEmpty                   // empty line
    tokenEnd                     // end of input
)

// token is the result of tokenizing one input line. Which payload field is
// meaningful depends on ttype; the other payload field keeps its zero value.
type token struct {
    text   string    // filled in for tokenText tokens
    number int32     // filled in for tokenNumber tokens
    ttype  tokenType // kind of the token
}

// dictParser reads a line-oriented dictionary through a bufio.Scanner and
// accumulates the words it finds.
type dictParser struct {
    number  int32          // number of the last accepted entry; each new entry must have a larger one
    finish  bool           // set once the end of the input has been reached
    scanner *bufio.Scanner // source of input lines
    words   []string       // all words collected so far, in input order
}

// nextLine advances the scanner by one line.
//
// It returns done == false together with the line bytes when a line was read.
// The returned slice is the scanner's own buffer and is only valid until the
// next call. When no further line is available it returns done == true, a nil
// slice, and the scanner's error: nil at a regular end of input, non-nil when
// scanning stopped because of a failure (for example a read error or an
// over-long line).
func (p *dictParser) nextLine() (bool, []byte, error) {
    if p.scanner.Scan() {
        return false, p.scanner.Bytes(), nil
    }

    // Scan reports false both at EOF and on failure; only Err tells the two
    // apart, so it must be consulted here rather than after a successful Scan.
    return true, nil, p.scanner.Err()
}

// nextToken reads the next input line and turns it into a token:
//
//   - no more lines: tokenEnd
//   - empty line: tokenEmpty
//   - line containing a tab: tokenText whose text is the lower-cased part of
//     the line before the first tab
//   - any other line: tokenNumber, the lower-cased line parsed as a base-10
//     integer that fits in 32 bits
//
// Errors from reading the line or from parsing the number are returned as-is
// together with a zero token.
func (p *dictParser) nextToken() (token, error) {
    eof, line, err := p.nextLine()
    switch {
    case err != nil:
        return token{}, err
    case eof:
        return token{ttype: tokenEnd}, nil
    case len(line) == 0:
        return token{ttype: tokenEmpty}, nil
    }

    // Lower-case the line rune by rune, stopping at the first tab.
    var lowered bytes.Buffer
    for rest := line; len(rest) > 0; {
        raw, width := utf8.DecodeRune(rest)
        lower := unicode.ToLower(raw)
        if lower == '\t' {
            return token{text: lowered.String(), ttype: tokenText}, nil
        }
        _, _ = lowered.WriteRune(lower)
        rest = rest[width:]
    }

    // No tab anywhere on the line: it has to be an entry number.
    value, err := strconv.ParseInt(lowered.String(), 10, 32)
    if err != nil {
        return token{}, err
    }
    return token{number: int32(value), ttype: tokenNumber}, nil
}

// nextWords reads one dictionary entry: a number line followed by one or more
// text lines, terminated by an empty line or by the end of the input.
//
// It returns the texts of the entry in input order. Once the input is
// exhausted it returns (nil, nil); this also covers input that ends exactly
// where the next entry would start, such as an empty input or a blank line
// after the last entry.
//
// An error is returned when a token cannot be read, when the entry does not
// start with a number, when the number is not greater than the previous
// entry's number, when a number appears inside an entry, or when the entry
// contains no text lines.
func (p *dictParser) nextWords() ([]string, error) {
    if p.finish {
        return nil, nil
    }

    header, err := p.nextToken()
    if err != nil {
        return nil, err
    }
    switch header.ttype {
    case tokenNumber:
        // Expected start of an entry; validated below.
    case tokenEnd:
        // The input ended on an entry boundary: a clean finish, not a
        // malformed entry.
        p.finish = true
        return nil, nil
    default:
        return nil, errors.New("unknown token")
    }

    // Entry numbers must be strictly increasing.
    if p.number >= header.number {
        return nil, fmt.Errorf("unknown number %d", header.number)
    }
    p.number = header.number

    var result []string
    for more := true; more; {
        t, err := p.nextToken()
        if err != nil {
            return nil, err
        }
        switch t.ttype {
        case tokenText:
            result = append(result, t.text)
        case tokenEnd:
            // Last entry of the input; remember it so the next call stops.
            p.finish = true
            more = false
        case tokenEmpty:
            more = false
        default:
            return nil, errors.New("unknown token")
        }
    }

    if len(result) == 0 {
        return nil, errors.New("empty array")
    }

    return result, nil
}

// parse consumes entries until nextWords signals the end of the input with a
// nil slice, appending every word of every entry to p.words. On a clean end
// it prints "success end" to standard output and returns nil; the first
// error from nextWords aborts parsing and is returned unchanged.
func (p *dictParser) parse() error {
    for {
        entry, err := p.nextWords()
        if err != nil {
            return err
        }
        if entry == nil {
            fmt.Println("success end")
            return nil
        }

        p.words = append(p.words, entry...)
    }
}

// Run parses the dictionary file at path.
//
// The entry counter and the end-of-input flag are reset before the file is
// opened; p.words is not cleared, so words from earlier runs stay in place.
// The file is closed after parsing. A parse error takes precedence over an
// error from closing the file.
func (p *dictParser) Run(path string) error {
    p.number, p.finish = 0, false

    src, openErr := os.Open(path)
    if openErr != nil {
        return openErr
    }
    p.scanner = bufio.NewScanner(src)

    parseErr := p.parse()
    if closeErr := src.Close(); parseErr == nil {
        return closeErr
    }
    return parseErr
}

// runImportDict parses the file "dict.opcorpora.txt" with a fresh dictParser
// and returns whatever error the parser reports.
func runImportDict() error {
    parser := new(dictParser)
    return parser.Run("dict.opcorpora.txt")
}