guimc233/lgz-bot

View on GitHub
src/main/java/huzpsb/ll4j/nlp/token/Tokenizer.java

Summary

Maintainability
A
0 mins
Test Coverage
package huzpsb.ll4j.nlp.token;

import huzpsb.ll4j.utils.data.DataEntry;

import java.io.*;

/**
 * Converts text into a fixed-length numeric feature vector driven by a vocabulary of
 * substrings.
 *
 * <p>A vocabulary can be written with {@link #saveToFile(String)} and read back with one of the
 * {@code load} overloads. The persisted format is line based: the first line is the number of
 * entries, and every following line is one entry.
 */
public class Tokenizer {
    /** Vocabulary entries; entry {@code i} controls feature {@code i + 1} of the output vector. */
    private final String[] vocab;

    /**
     * Creates a tokenizer backed by the given vocabulary.
     *
     * @param vocab the vocabulary entries; the array is stored as-is and is not copied
     */
    public Tokenizer(String[] vocab) {
        this.vocab = vocab;
    }

    /**
     * Creates a tokenizer from a slice of a larger vocabulary array. The slice is copied, so
     * later changes to {@code vocab} do not affect this tokenizer.
     *
     * @param vocab  the array to copy entries from
     * @param start  index of the first entry to copy
     * @param length number of entries to copy
     * @throws IndexOutOfBoundsException  if the requested range does not fit inside {@code vocab}
     * @throws NegativeArraySizeException if {@code length} is negative
     */
    public Tokenizer(String[] vocab, int start, int length) {
        this.vocab = new String[length];
        System.arraycopy(vocab, start, this.vocab, 0, length);
    }

    /**
     * Loads a tokenizer from a byte stream, decoding it as UTF-8 so the result does not depend
     * on the platform default charset. The stream is closed before this method returns.
     *
     * @param stream the stream holding the persisted vocabulary
     * @return the loaded tokenizer, or {@code null} if the stream contains no lines
     * @see #load(BufferedReader)
     */
    public static Tokenizer load(InputStream stream) {
        return load(new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Loads a tokenizer from a character stream. The reader is closed before this method
     * returns.
     *
     * @param reader the reader holding the persisted vocabulary
     * @return the loaded tokenizer, or {@code null} if the reader yields no lines
     * @see #load(BufferedReader)
     */
    public static Tokenizer load(Reader reader) {
        return load(new BufferedReader(reader));
    }

    /**
     * Loads a tokenizer from a buffered reader. The first line must be the entry count; each of
     * the following lines is taken verbatim as one vocabulary entry. The reader is closed before
     * this method returns.
     *
     * @param reader the reader holding the persisted vocabulary
     * @return the loaded tokenizer, or {@code null} if the reader yields no lines at all
     * @throws IllegalArgumentException if the entry count is negative or not a number
     *                                  ({@link NumberFormatException}), or if the number of
     *                                  entries present does not match the declared count
     * @throws UncheckedIOException     if reading fails
     */
    public static Tokenizer load(BufferedReader reader) {
        String[] vocab = null;
        try (reader) {
            String line;
            int index = 0;
            while ((line = reader.readLine()) != null) {
                if (vocab == null) {
                    // Header line: declared number of vocabulary entries.
                    int size = Integer.parseInt(line.trim());
                    if (size < 0) {
                        throw new IllegalArgumentException(
                                "Vocabulary size must not be negative: " + size);
                    }
                    vocab = new String[size];
                    continue;
                }
                if (index >= vocab.length) {
                    // Tolerate trailing blank lines (e.g. from hand editing), but reject real
                    // surplus entries instead of overflowing the array.
                    if (line.isEmpty()) {
                        continue;
                    }
                    throw new IllegalArgumentException(
                            "Vocabulary declares " + vocab.length
                                    + " entries but more were found, starting with: " + line);
                }
                vocab[index++] = line;
            }
            // A short file would leave null entries that only fail later, inside tokenize().
            if (vocab != null && index < vocab.length) {
                throw new IllegalArgumentException(
                        "Vocabulary declares " + vocab.length + " entries but only "
                                + index + " were found");
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read tokenizer vocabulary", e);
        }
        if (vocab == null) return null;
        return new Tokenizer(vocab);
    }

    /**
     * Builds the feature vector for a piece of text.
     *
     * <p>Element {@code 0} is the length of the original {@code text}. Element {@code i + 1} is
     * {@code 1} when the text, after being passed through {@code CharUtils.regularize}, contains
     * vocabulary entry {@code i} as a substring, and {@code 0} otherwise.
     *
     * @param type passed through unchanged to the resulting {@link DataEntry}
     * @param text the text to convert
     * @return a data entry holding {@code type} and a vector of {@code vocab.length + 1} values
     */
    public DataEntry tokenize(int type, String text) {
        String regularized = CharUtils.regularize(text);
        double[] values = new double[vocab.length + 1];
        // The length feature uses the raw input, not the regularized form.
        values[0] = text.length();
        for (int i = 0; i < vocab.length; i++) {
            values[i + 1] = regularized.contains(vocab[i]) ? 1 : 0;
        }
        return new DataEntry(type, values);
    }

    /**
     * Writes the vocabulary to a file as UTF-8 in the format understood by the {@code load}
     * overloads: the entry count on the first line, then one entry per line.
     *
     * <p>Saving is best effort: this method never throws. A failure is reported on
     * {@code System.err} instead of being dropped silently.
     *
     * @param filename path of the file to create or overwrite
     */
    public void saveToFile(String filename) {
        try (PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8)))) {
            writer.println(vocab.length);
            for (String word : vocab) {
                writer.println(word);
            }
            // PrintWriter swallows IOExceptions from writes; checkError() flushes and reveals them.
            if (writer.checkError()) {
                System.err.println("Failed to write tokenizer vocabulary to " + filename);
            }
        } catch (Exception e) {
            // Deliberately not rethrown: callers rely on this method never throwing.
            System.err.println("Failed to save tokenizer vocabulary to " + filename + ": " + e);
        }
    }
}