de.bund.bfr.knime.pmm.nodes/src/de/bund/bfr/knime/pmm/combaseio/lib/CombaseReader.java
/*******************************************************************************
* Copyright (c) 2015 Federal Institute for Risk Assessment (BfR), Germany
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contributors:
* Department Biological Safety - BfR
*******************************************************************************/
package de.bund.bfr.knime.pmm.combaseio.lib;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import org.hsh.bfr.db.DBKernel;
import de.bund.bfr.knime.pmm.common.KnimeUtils;
import de.bund.bfr.knime.pmm.common.MiscXml;
import de.bund.bfr.knime.pmm.common.PmmException;
import de.bund.bfr.knime.pmm.common.PmmTimeSeries;
import de.bund.bfr.knime.pmm.common.PmmXmlDoc;
import de.bund.bfr.knime.pmm.common.math.MathUtilities;
import de.bund.bfr.knime.pmm.common.pmmtablemodel.AttributeUtilities;
import de.bund.bfr.knime.pmm.common.units.Categories;
public class CombaseReader {
private Map<String, Integer> newAgentIDs = new LinkedHashMap<>();
private Map<String, Integer> newMatrixIDs = new LinkedHashMap<>();
private Map<String, MiscXml> newMiscs = new LinkedHashMap<>();
private MiscConversion conversion;
private List<PmmTimeSeries> result;
public CombaseReader(final String filename) throws FileNotFoundException, IOException {
conversion = new MiscConversion();
result = new ArrayList<>();
File file = KnimeUtils.getFile(filename);
if (file.exists()) {
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(new FileInputStream(file), "UTF-16LE"))) {
PmmTimeSeries data;
while ((data = step(reader)) != null) {
result.add(data);
}
}
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
PmmTimeSeries data;
while ((data = stepNew(reader)) != null) {
result.add(data);
}
}
}
}
public List<PmmTimeSeries> getResult() {
return result;
}
private PmmTimeSeries stepNew(BufferedReader reader) throws IOException {
// initialize next time series
PmmTimeSeries next = new PmmTimeSeries();
while (true) {
String line = reader.readLine();
if (line == null) {
return null;
}
if (line.trim().isEmpty() && next.getCombaseId() != null) {
return next;
}
if (!line.contains(",")) {
continue;
}
String key = line.substring(0, line.indexOf(",")).trim();
String data = line.substring(line.indexOf(",") + 1).trim();
// fetch record id
if (key.equals("ComBase ID")) {
next.setCombaseId(data);
continue;
}
// fetch organism
if (key.equals("Organism")) {
// next.setAgentDetail( token[ 1 ] );
setAgent(next, data);
continue;
}
// fetch environment
if (key.equals("Matrix")) {
// next.setMatrixDetail(token[1]);
setMatrix(next, data);
continue;
}
// fetch temperature
if (key.startsWith("Temperature(")) {
Double value = parse(data);
// next.setTemperature(value);
next.addMisc(AttributeUtilities.ATT_TEMPERATURE_ID, AttributeUtilities.ATT_TEMPERATURE,
AttributeUtilities.ATT_TEMPERATURE, value,
Arrays.asList(Categories.getTempCategory().getName()),
Categories.getTempCategory().getStandardUnit());
continue;
}
// fetch pH
if (key.equals("pH")) {
Double value = parse(data);
// next.setPh(value);
next.addMisc(AttributeUtilities.ATT_PH_ID, AttributeUtilities.ATT_PH, AttributeUtilities.ATT_PH, value,
Arrays.asList(Categories.getPhCategory().getName()), Categories.getPhUnit());
continue;
}
// fetch water activity
if (key.equals("Aw")) {
Double value = parse(data);
// next.setWaterActivity(value);
next.addMisc(AttributeUtilities.ATT_AW_ID, AttributeUtilities.ATT_AW, AttributeUtilities.ATT_AW, value,
Arrays.asList(Categories.getAwCategory().getName()), Categories.getAwUnit());
continue;
}
// fetch conditions
if (key.equals("Conditions")) {
PmmXmlDoc xml = combase2XMLNew(data);
next.addMiscs(xml);
continue;
}
if (key.equals("Max.rate(logc.conc/h)")) {
next.setMaximumRate(parse(data));
continue;
}
if (key.equals("Logcs:")) {
while (true) {
line = reader.readLine();
if (line == null)
return next;
if (!line.contains(",")) {
break;
}
String dataPoint = line.substring(line.indexOf(",") + 1).trim();
double t = parse(dataPoint.substring(0, dataPoint.indexOf(",")).trim());
double logc = parse(dataPoint.substring(dataPoint.indexOf(",") + 1).trim());
if (Double.isNaN(t) || Double.isNaN(logc)) {
continue;
}
next.add(t, Categories.getTimeCategory().getStandardUnit(), logc,
Categories.getConcentrationCategories().get(0).getStandardUnit());
}
break;
}
}
return next;
}
private PmmTimeSeries step(BufferedReader reader) throws IOException {
// initialize next time series
PmmTimeSeries next = new PmmTimeSeries();
while (true) {
String line = reader.readLine();
if (line == null) {
return null;
}
if (line.trim().isEmpty() && next.getCombaseId() != null) {
return next;
}
// split up token
String[] token = line.split("\t");
if (token.length < 2)
continue;
if (token[0].isEmpty())
continue;
for (int i = 0; i < token.length; i++) {
// token[i] =
// token[i].replaceAll("[^a-zA-Z0-9° \\.\\(\\)_/\\+\\-\\*,:]",
// "");
token[i] = token[i].replaceAll("\"", "");
}
String key = token[0].toLowerCase().trim();
// utf16lemessage[0] = (byte)0xFF; utf16lemessage[1] = (byte)0xFE;
if (key.length() > 1 && key.charAt(0) == 65279)
key = key.substring(1);
// fetch record id
if (key.equals("recordid")) {
next.setCombaseId(token[1]);
continue;
}
// fetch organism
if (key.equals("organism")) {
// next.setAgentDetail( token[ 1 ] );
setAgent(next, token[1]);
continue;
}
// fetch environment
if (key.equals("environment")) {
// next.setMatrixDetail(token[1]);
setMatrix(next, token[1]);
continue;
}
// fetch temperature
if (key.equals("temperature")) {
int pos = token[1].indexOf(" ");
if (!token[1].endsWith(" °C"))
throw new PmmException("Temperature unit must be [°C]");
Double value = parse(token[1].substring(0, pos));
// next.setTemperature(value);
next.addMisc(AttributeUtilities.ATT_TEMPERATURE_ID, AttributeUtilities.ATT_TEMPERATURE,
AttributeUtilities.ATT_TEMPERATURE, value,
Arrays.asList(Categories.getTempCategory().getName()),
Categories.getTempCategory().getStandardUnit());
continue;
}
// fetch pH
if (key.equals("ph")) {
Double value = parse(token[1]);
// next.setPh(value);
next.addMisc(AttributeUtilities.ATT_PH_ID, AttributeUtilities.ATT_PH, AttributeUtilities.ATT_PH, value,
Arrays.asList(Categories.getPhCategory().getName()), Categories.getPhUnit());
continue;
}
// fetch water activity
if (key.equals("water activity")) {
Double value = parse(token[1]);
// next.setWaterActivity(value);
next.addMisc(AttributeUtilities.ATT_AW_ID, AttributeUtilities.ATT_AW, AttributeUtilities.ATT_AW, value,
Arrays.asList(Categories.getAwCategory().getName()), Categories.getAwUnit());
continue;
}
// fetch conditions
if (key.equals("conditions")) {
PmmXmlDoc xml = combase2XML(token[1]);
next.addMiscs(xml);
continue;
}
if (key.equals("maximum rate")) {
next.setMaximumRate(parse(token[1]));
continue;
}
if (key.startsWith("time") && token[1].equals("logc")) {
if (!key.endsWith("(h)"))
throw new IOException("Time unit must be [h].");
while (true) {
line = reader.readLine();
if (line == null)
return next;
if (line.replaceAll("\\t\"", "").isEmpty())
break;
token = line.split("\t");
for (int i = 0; i < token.length; i++) {
token[i] = token[i].replaceAll("[^a-zA-Z0-9° \\.\\(\\)/,]", "");
}
if (token.length < 2) {
break;
}
double t = parse(token[0]);
double logc = parse(token[1]);
if (Double.isNaN(t) || Double.isNaN(logc)) {
continue;
}
next.add(t, Categories.getTimeCategory().getStandardUnit(), logc,
Categories.getConcentrationCategories().get(0).getStandardUnit());
}
break;
}
}
return next;
}
private static double parse(String num) {
double n = Double.NaN;
num = num.toLowerCase();
num = num.trim();
if (num.equals("no growth"))
return 0;
try {
num = num.replaceAll("[a-zA-Z\\(\\)\\s]", "");
num = num.replaceAll(",", ".");
n = Double.valueOf(num);
} catch (Exception e) {
}
return n;
}
private PmmXmlDoc combase2XMLNew(String misc) {
PmmXmlDoc result = null;
if (misc != null) {
result = new PmmXmlDoc();
for (String s : misc.split(";")) {
int valueSep = s.indexOf(':');
String name = null;
Double value = null;
if (valueSep != -1) {
String valueString = s.substring(valueSep + 1).trim();
name = s.substring(0, valueSep).trim();
if (valueString.charAt(valueString.length() - 1) == ')') {
int unitSep = valueString.lastIndexOf('(');
String unitString = valueString.substring(unitSep + 1, valueString.length() - 1).trim();
valueString = valueString.replace(unitString, "").trim();
}
value = parse(valueString);
} else {
name = s;
value = 1.0;
}
// ersetzen mehrerer Spaces im Text durch lediglich eines, Bsp.:
// "was ist los?" -> "was ist los?"
String description = name.trim().replaceAll(" +", " ");
MiscXml mx = getMiscXml(description, value);
// new MiscXml(newIDs.get(description),
// getCombaseName(description), description, dbl, unit);
result.add(mx);
}
}
return result;
}
private PmmXmlDoc combase2XML(String misc) {
PmmXmlDoc result = null;
if (misc != null) {
result = new PmmXmlDoc();
List<String> conds = condSplit(misc);
for (int i = 0; i < conds.size(); i++) {
String val = conds.get(i).trim();
int index = val.indexOf(':');
int index2 = 0;
// String unit = null;
Double dbl = null;
if (index >= 0) {
try {
dbl = Double.parseDouble(val.substring(index + 1));
if (val.charAt(index - 1) == ')') {
for (index2 = index - 1; index2 >= 0 && val.charAt(index2) != '('; index2--) {
;
}
// unit = val.substring(index2 + 1, index - 1);
val = val.substring(0, index2);
}
} catch (Exception e) {
e.printStackTrace();
}
} else {
dbl = 1.0;
}
// ersetzen mehrerer Spaces im Text durch lediglich eines, Bsp.:
// "was ist los?" -> "was ist los?"
String description = val.trim().replaceAll(" +", " ");
MiscXml mx = getMiscXml(description, dbl);
// new MiscXml(newIDs.get(description),
// getCombaseName(description), description, dbl, unit);
result.add(mx);
}
}
return result;
}
private void setMatrix(PmmTimeSeries next, String matrixname) {
Integer id = null;
String matrixdetail = null;
int index = matrixname.indexOf("(");
if (index > 0) {
matrixdetail = matrixname.substring(index).trim();
matrixname = matrixname.substring(0, index).trim();
}
if (!newMatrixIDs.containsKey(matrixname)) {
id = DBKernel.getID("Matrices", "Matrixname", matrixname);
if (id == null) {
System.err.println(matrixname + "... unknown Matrix ID...");
id = MathUtilities.getRandomNegativeInt();
}
newMatrixIDs.put(matrixname, id);
} else
id = newMatrixIDs.get(matrixname);
matrixdetail = id < 0 ? matrixname + " (" + matrixdetail + ")" : matrixdetail;
next.setMatrix(id, id < 0 ? null : matrixname, matrixdetail, null);
}
private void setAgent(PmmTimeSeries next, String agentsname) {
Integer id = null;
if (!newAgentIDs.containsKey(agentsname)) {
id = DBKernel.getID("Agenzien", "Agensname", agentsname);
if (id == null) {
System.err.println(agentsname + "... unknown Agens ID...");
id = MathUtilities.getRandomNegativeInt();
}
newAgentIDs.put(agentsname, id);
} else
id = newAgentIDs.get(agentsname);
next.setAgent(id, id < 0 ? null : agentsname, id < 0 ? agentsname : null, null);
}
private MiscXml getMiscXml(String description, Double dbl) {
if (!newMiscs.containsKey(description)) {
MiscXml m = conversion.combaseToPmm(description);
Integer id = (Integer) DBKernel.getValue(null, "SonstigeParameter", "Parameter", m.name, "ID");
m.id = id;
newMiscs.put(description, m);
}
MiscXml misc = new MiscXml(newMiscs.get(description));
misc.value = dbl;
return misc;
}
private List<String> condSplit(final String misc) {
if (misc == null) {
return null;
}
List<String> result = new ArrayList<>();
StringTokenizer tok = new StringTokenizer(misc, ",");
int openParenthesis = 0;
while (tok.hasMoreTokens()) {
String nextToken = tok.nextToken();
if (openParenthesis > 0) {
nextToken = result.get(result.size() - 1) + "," + nextToken;
result.remove(result.size() - 1);
}
result.add(nextToken);
openParenthesis = 0;
int index = -1;
while ((index = nextToken.indexOf("(", index + 1)) >= 0) {
openParenthesis++;
}
while ((index = nextToken.indexOf(")", index + 1)) >= 0) {
openParenthesis--;
}
}
return result;
}
}