lp-ontology-recommender/src/main/resources/gate/plugins/Gazetteer_Ontology_Based/src/gate/clone/ql/OntoRootGaz.java
/*
* OntoRootGaz.java
*
* Copyright (c) 1998-2008, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free software,
* licenced under the GNU Library General Public License, Version 2, June 1991
* (in the distribution as file licence.html, and also available at
* http://gate.ac.uk/gate/licence.html).
*/
package gate.clone.ql;
import gate.Annotation;
import gate.Corpus;
import gate.CorpusController;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.Utils;
import gate.creole.ANNIEConstants;
import gate.creole.ExecutionException;
import gate.creole.POSTagger;
import gate.creole.ResourceInstantiationException;
import gate.creole.SerialAnalyserController;
import gate.creole.gazetteer.DefaultGazetteer;
import gate.creole.gazetteer.FSMState;
import gate.creole.gazetteer.LinearDefinition;
import gate.creole.gazetteer.Lookup;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.creole.morph.Morph;
import gate.creole.ontology.InvalidURIException;
import gate.creole.ontology.Ontology;
import gate.creole.tokeniser.DefaultTokeniser;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* @author Danica Damljanovic
*/
@CreoleResource(name = "Onto Root Gazetteer",
comment = "An ontology lookup component",
helpURL = "http://gate.ac.uk/userguide/sec:gazetteers:ontoRootGaz",
icon = "gazetteer")
public class OntoRootGaz extends DefaultGazetteer {
/** Serialization version identifier. */
private static final long serialVersionUID = 0L;
/**
* Old-style POS tagger parameter. Only read by init(), which combines it with
* the tokeniser and morpher into a root finder application when no
* rootFinderApplication has been set.
*/
@Deprecated
protected POSTagger posTagger;
/**
* Old-style tokeniser parameter; see posTagger for how it is used.
*/
@Deprecated
protected DefaultTokeniser tokeniser;
/**
* Old-style morphological analyser parameter; see posTagger for how it is
* used.
*/
@Deprecated
protected Morph morpher;
/**
* The application executed over every candidate gazetteer entry in order to
* obtain the Token annotations (with category and root features) from which
* the root form of the entry is assembled.
*/
protected CorpusController rootFinderApplication;
/**
* Comparator used to sort the Token and SpaceToken annotations of an entry by
* offset before the root text is assembled; created in init().
*/
protected OffsetComparator offsetComparator;
/**
* The ontology the gazetteer entries are derived from; init() fails if it is
* not set.
*/
protected Ontology ontology;
/**
* should camelCased words be separated so that projectName becomes project
* Name
*/
protected Boolean separateCamelCasedWords;
/**
* should resource URI (usually called a fragment identifier - a set of
* characters after / or #) be considered; for example, if there is a resource
* with URI http://gate.ac.uk/ns/gate-ontology#POSTagger, should POSTagger be
* considered or not
*/
protected Boolean useResourceUri;
/**
* should properties be considered or not; NOTE: if this parameter is set to
* false, then propertiesToInclude and propertiesToExclude will be ignored
*/
protected Boolean considerProperties;
/**
* a list of lookups that will be created after processing of all relevant
* data; filled by init(), handed to addLookups() and then replaced by a fresh
* empty list
*/
protected List<Lookup> allLookups = new ArrayList<Lookup>();
/**
* Hidden corpus that temporarily holds the one-entry documents the root
* finder application is run over; created in init() and deleted again (and
* set to null) at the end of init().
*/
protected Corpus applicationCorpus;
/**
* a map of roots: a key is a lookup.list value, e.g. 'projects', and the
* value is a root of that key, in this case that would be 'project'
*/
Map<String, String> listRoots = new HashMap<String, String>();
/**
* Should the rules be followed or not: if true, a few heuristic rules will
* apply: the words containing spaces will be split; for example, if 'pos
* tagger for spanish' would be analysed, 'for' would be considered a stop
* word and heuristically derived would be 'pos tagger' and this would be
* further used to add 'pos tagger' with heuristical level 0, and 'tagger'
* with hl 1 to the gazetteer list; at runtime lower heuristical level should
* be preferred
*/
protected Boolean considerHeuristicRules;
/**
* comma separated values of property names that will be considered when
* initializing the gazetteer
*/
protected String propertiesToInclude;
/**
* comma separated values of property names that will be excluded when
* initializing the gazetteer NOTE: setting propertiesToInclude to be
* different from "" automatically means that all properties not in the list
* will be excluded (in other words, if propertiesToInclude is set, it is not
* necessary to set propertiesToExclude as all properties not listed in
* propertiesToInclude will be excluded);
*/
protected String propertiesToExclude;
/**
* The kinds of ontology resources (class, instance, property) that are turned
* into gazetteer entries; init() falls back to all three when this is null.
*/
protected Set<String> typesToConsider;
/**
 * @return the kinds of ontology resources that are turned into gazetteer
 *         entries, exactly as last set (may be null before init() has run)
 */
public Set<String> getTypesToConsider() {
  return this.typesToConsider;
}
/**
* Sets the kinds of ontology resources that are turned into gazetteer
* entries. The given set is stored as is (no copy is made).
*
* @param typesToConsider
* the resource types to include
*/
@RunTime
@CreoleParameter(defaultValue = "class;instance;property",
comment = "The list of types to be included. Possible values: "
+ "instance, class, property")
public void setTypesToConsider(Set<String> typesToConsider) {
this.typesToConsider = typesToConsider;
}
/*****************************************************************************
* Logger used for the initialisation progress messages and for reporting
* every entry that is added to the gazetteer list
****************************************************************************/
private static final Log logger = LogFactory.getLog(OntoRootGaz.class);
/**
 * Re-initialises the gazetteer by running {@link #init()} again.
 *
 * @throws ResourceInstantiationException
 *           if the repeated initialisation fails
 */
public void reInit() throws ResourceInstantiationException {
  init();
}
/**
 * Builds the gazetteer from the configured ontology.
 * <p>
 * The ontology is indexed through the Ontology2MapManager; then, depending on
 * considerProperties, useResourceUri and typesToConsider, one Lookup is
 * collected for every property value and/or every resource short name of the
 * ontology's classes, instances and properties. Resources from the OWL, RDFS
 * and RDF namespaces are skipped. Finally all collected lookups are passed to
 * {@link #addLookups(List)}, which adds their root forms to the gazetteer.
 * <p>
 * If no rootFinderApplication is set but the deprecated tokeniser, POS tagger
 * and morpher parameters all are, an application is assembled from them.
 * <p>
 * The hidden working corpus is released and the list of collected lookups is
 * reset even when initialisation fails.
 *
 * @return this resource
 * @throws ResourceInstantiationException
 *           if no ontology is set, if a property is listed as both included
 *           and excluded, or if creating the helper resources or finding the
 *           roots fails
 */
@Override
@SuppressWarnings("deprecation")
public Resource init() throws ResourceInstantiationException {
  // list of namespaces to be ignored when creating gazetteer list
  List<String> nsToIgnore = new ArrayList<String>();
  nsToIgnore.add("http://www.w3.org/2002/07/owl#");
  nsToIgnore.add("http://www.w3.org/2000/01/rdf-schema#");
  nsToIgnore.add("http://www.w3.org/1999/02/22-rdf-syntax-ns#");
  logger.info("--------------------------------------\n");
  logger.info(" Initializing gazetteer...\n");
  logger.info("--------------------------------------\n");
  long startedInit = System.currentTimeMillis();
  if(rootFinderApplication == null && tokeniser != null && posTagger != null
    && morpher != null) {
    logger.warn("No rootFinderApplication provided, but found old-style "
      + "separate parameters for tokeniser, POS tagger and morpher. "
      + "Constructing an application from these.");
    SerialAnalyserController legacyPipeline =
      (SerialAnalyserController)Factory.createResource(
        "gate.creole.SerialAnalyserController", Factory.newFeatureMap(),
        Factory.newFeatureMap(), "Root finder application for " + getName());
    legacyPipeline.add(tokeniser);
    legacyPipeline.add(posTagger);
    legacyPipeline.add(morpher);
    rootFinderApplication = legacyPipeline;
  }
  if(ontology == null) {
    throw new ResourceInstantiationException("No ontology provided!");
  }
  Ontology2MapManager.getInstance().addOntologyToIndex(ontology);
  /* set default values if they are not set already */
  if(useResourceUri == null) useResourceUri = true;
  if(considerProperties == null) considerProperties = true;
  if(separateCamelCasedWords == null) separateCamelCasedWords = true;
  if(considerHeuristicRules == null) considerHeuristicRules = false;
  if(typesToConsider == null) {
    typesToConsider = new HashSet<String>();
    typesToConsider.add(CATConstants.TYPE_CLASS);
    typesToConsider.add(CATConstants.TYPE_INSTANCE);
    typesToConsider.add(CATConstants.TYPE_PROPERTY);
  }
  fsmStates = new HashSet<FSMState>();
  initialState = new FSMState(this);
  /*
   * move properties to include and exclude from the CSV strings to actual
   * lists; each parameter is parsed on its own, so setting only one of them
   * is honoured
   */
  List<String> propertiesToIncludeList = new ArrayList<String>();
  List<String> propertiesToExcludeList = new ArrayList<String>();
  if(considerProperties) {
    propertiesToIncludeList = splitCsv(propertiesToInclude);
    propertiesToExcludeList = splitCsv(propertiesToExclude);
  }
  /*
   * check validity: a property must not be in both the 'to be excluded' and
   * the 'to be included' list
   */
  for(String propertyUri : propertiesToExcludeList) {
    if(propertiesToIncludeList.contains(propertyUri)) {
      throw new ResourceInstantiationException(
        "You specified that the same property should be both included and "
          + "excluded!");
    }
  }
  /* create a corpus and hide it inside the GATE GUI */
  FeatureMap corpusParams = Factory.newFeatureMap();
  corpusParams.put("name", this.getClass().getCanonicalName());
  FeatureMap corpusFeatures = Factory.newFeatureMap();
  Gate.setHiddenAttribute(corpusFeatures, true);
  applicationCorpus =
    (Corpus)Factory.createResource("gate.corpora.CorpusImpl", corpusParams,
      corpusFeatures);
  offsetComparator = new OffsetComparator();
  try {
    if(considerProperties) {
      /*
       * each table has one row per set property with 3 columns separated by
       * '|': resourceUri, propertyUri, propertyValue
       */
      if(typesToConsider.contains(CATConstants.TYPE_INSTANCE)) {
        collectPropertyLookups(Ontology2MapManager.getInstance()
          .getOntology2Map().getListOfInstances(),
          CATConstants.TYPE_INSTANCE, propertiesToIncludeList,
          propertiesToExcludeList, nsToIgnore);
      }
      if(typesToConsider.contains(CATConstants.TYPE_CLASS)) {
        collectPropertyLookups(Ontology2MapManager.getInstance()
          .getOntology2Map().getListOfClasses(), CATConstants.TYPE_CLASS,
          propertiesToIncludeList, propertiesToExcludeList, nsToIgnore);
      }
      if(typesToConsider.contains(CATConstants.TYPE_PROPERTY)) {
        collectPropertyLookups(Ontology2MapManager.getInstance()
          .getOntology2Map().getListOfProperties(),
          CATConstants.TYPE_PROPERTY, propertiesToIncludeList,
          propertiesToExcludeList, nsToIgnore);
      }
    }
    /* uri retrieval */
    if(useResourceUri) {
      if(typesToConsider.contains(CATConstants.TYPE_CLASS)) {
        String[] rows =
          Ontology2MapManager.getInstance().getOntology2Map().getClassURIs()
            .split(CATConstants.NEW_LINE);
        for(String eachRow : rows) {
          collectUriLookup(eachRow.trim(), CATConstants.TYPE_CLASS,
            nsToIgnore);
        }
      }
      if(typesToConsider.contains(CATConstants.TYPE_INSTANCE)) {
        Set<String> instanceUris =
          Ontology2MapManager.getInstance().getOntology2Map()
            .getInstanceTypes().keySet();
        for(String uri : instanceUris) {
          collectUriLookup(uri, CATConstants.TYPE_INSTANCE, nsToIgnore);
        }
      }
      if(typesToConsider.contains(CATConstants.TYPE_PROPERTY)) {
        String[] rows =
          Ontology2MapManager.getInstance().getOntology2Map()
            .getPropertyURIs().split(CATConstants.NEW_LINE);
        for(String eachRow : rows) {
          collectUriLookup(eachRow.trim(), CATConstants.TYPE_PROPERTY,
            nsToIgnore);
        }
      }
    }
    addLookups(allLookups);
  } finally {
    /* release GATE resources, also when initialisation failed half way */
    allLookups = new ArrayList<Lookup>();
    Factory.deleteResource(applicationCorpus);
    applicationCorpus = null;
  }
  long currentTime = System.currentTimeMillis();
  logger.info("OntoRootGaz initialized for:" + (currentTime - startedInit)
    + " ms");
  return this;
}

/** Splits a comma separated string into its trimmed, non-empty items; null yields an empty list. */
private static List<String> splitCsv(String csv) {
  List<String> items = new ArrayList<String>();
  if(csv == null) return items;
  for(String item : csv.split(",")) {
    String trimmed = item.trim();
    if(!"".equals(trimmed)) items.add(trimmed);
  }
  return items;
}

/** Tells whether a property passes the include list (empty = all) and is not on the exclude list. */
private static boolean isPropertyAllowed(String propUri,
  List<String> included, List<String> excluded) {
  boolean isIncluded = included.isEmpty() || included.contains(propUri);
  return isIncluded && !excluded.contains(propUri);
}

/** Creates the ontology URI object for a URI string; strings starting with "_" are treated as anonymous resources. */
private static gate.creole.ontology.URI toOntologyUri(String uri)
  throws InvalidURIException {
  return new gate.creole.ontology.URI(uri, uri.startsWith("_"));
}

/** Returns the class URIs recorded for an instance in the ontology index, or null if there are none. */
private static Set<String> instanceTypesOf(String uri) {
  return Ontology2MapManager.getInstance().getOntology2Map()
    .getInstanceTypes().get(uri);
}

/** Returns the first element in iteration order, or "" for a null or empty set. */
private static String firstOrEmpty(Set<String> values) {
  return (values == null || values.isEmpty()) ? "" : values.iterator()
    .next();
}

/** Creates a lookup for the given entry text that carries the given features. */
private static Lookup newLookup(String text, Map<String, Object> features) {
  Lookup aLookup = new Lookup(text, "", null, null);
  aLookup.features = features;
  return aLookup;
}

/**
 * Collects one lookup per row of a 'resourceUri|propertyUri|propertyValue'
 * table, using the property value as the entry text. Rows that do not have
 * exactly three columns, whose property is filtered out or whose resource
 * lives in an ignored namespace are skipped.
 */
private void collectPropertyLookups(String table, String resourceType,
  List<String> included, List<String> excluded, List<String> nsToIgnore) {
  boolean isInstance = CATConstants.TYPE_INSTANCE.equals(resourceType);
  boolean isClass = CATConstants.TYPE_CLASS.equals(resourceType);
  for(String eachRow : table.split(CATConstants.NEW_LINE)) {
    String[] columns = eachRow.split("\\|");
    if(columns.length != 3) continue;
    String uri = columns[0].trim();
    try {
      /* create the URI object for validation purposes */
      gate.creole.ontology.URI uriUri = toOntologyUri(uri);
      String propUri = columns[1].trim();
      if(!isPropertyAllowed(propUri, included, excluded)) continue;
      if(nsToIgnore.contains(uriUri.getNameSpace())) continue;
      String propValue = columns[2].trim();
      Map<String, Object> lookupFeatures = new HashMap<String, Object>();
      lookupFeatures.put(CATConstants.ONTORES_TYPE, resourceType);
      lookupFeatures.put(CATConstants.FEATURE_URI, uri);
      lookupFeatures.put(CATConstants.FEATURE_PROPERTY_URI, propUri);
      // class rows do not get a property-value feature, unlike instance
      // and property rows
      if(!isClass) {
        lookupFeatures.put(CATConstants.FEATURE_PROPERTY_VALUE, propValue);
      }
      if(isInstance) {
        Set<String> types = instanceTypesOf(uri);
        // an instance without recorded types gets an empty class URI
        // instead of aborting the whole initialisation
        String classType =
          uri.startsWith("_") ? "bNode" : firstOrEmpty(types);
        if(types != null) {
          lookupFeatures.put(CATConstants.CLASS_URI_LIST, types);
        } else {
          lookupFeatures.put(CATConstants.CLASS_URI_LIST, classType);
        }
        lookupFeatures.put(CATConstants.CLASS_URI, classType);
      }
      allLookups.add(newLookup(propValue, lookupFeatures));
    } catch(InvalidURIException e) {
      if(isInstance) {
        logger.info("URI:'" + uri + "' is not valid. Skipping...\n");
      } else {
        logger.info("URI:'" + uri + "' is not valid.\n");
      }
    }
  }
}

/**
 * Collects a lookup whose entry text is the short (resource) name of the
 * given URI, unless the URI is invalid or lives in an ignored namespace.
 */
private void collectUriLookup(String uri, String resourceType,
  List<String> nsToIgnore) {
  try {
    gate.creole.ontology.URI uriUri = toOntologyUri(uri);
    String shortName = uriUri.getResourceName();
    if(nsToIgnore.contains(uriUri.getNameSpace())) return;
    Map<String, Object> lookupFeatures = new HashMap<String, Object>();
    lookupFeatures.put(CATConstants.ONTORES_TYPE, resourceType);
    lookupFeatures.put(CATConstants.FEATURE_URI, uri);
    if(CATConstants.TYPE_INSTANCE.equals(resourceType)) {
      Set<String> types = instanceTypesOf(uri);
      if(types == null) types = new HashSet<String>();
      lookupFeatures.put(CATConstants.CLASS_URI_LIST, types);
      // an empty type set yields an empty class URI rather than an
      // IndexOutOfBoundsException
      lookupFeatures.put(CATConstants.CLASS_URI, firstOrEmpty(types));
    }
    allLookups.add(newLookup(shortName, lookupFeatures));
  } catch(InvalidURIException e) {
    logger.info("URI:" + uri + " is not valid.\n");
  }
}
/**
 * Finds the root form of every given lookup and adds the lookups to the
 * gazetteer under those roots.
 * <p>
 * The lookups are first run through the root finder application. While doing
 * so, variants of their entries (underscores replaced by spaces, camelCased
 * words separated) are queued in additionalList; those queued variants are
 * then run through the root finder application as well, and both groups are
 * added to the gazetteer.
 * <p>
 * For every lookup the root recorded in listRoots for lookup.list is used as
 * the gazetteer entry. If considerHeuristicRules is true and the root
 * contains spaces, the full root is added with heuristic level 0 and each
 * suffix obtained by dropping one more leading word is added with the next
 * higher level, each on its own copy of the lookup. Otherwise the lookup
 * itself is added with heuristic level 0. Lookups without a recorded root are
 * only logged.
 *
 * @param lookups
 *          the lookups to process
 * @throws ResourceInstantiationException
 *           if finding the roots fails
 */
protected void addLookups(List<Lookup> lookups)
  throws ResourceInstantiationException {
  List<Lookup> lookupsToBeAdded = runRootFinderApplication(lookups);
  // take over the variants queued by the first pass and start a fresh queue
  List<Lookup> pendingVariants = new ArrayList<Lookup>(additionalList);
  additionalList = new ArrayList<Lookup>();
  List<Lookup> addition = runRootFinderApplication(pendingVariants);
  // NOTE(review): variants queued while processing pendingVariants stay in
  // additionalList and are only picked up by the next call to this method
  // - confirm whether they should be processed here as well.
  List<Lookup> all = new ArrayList<Lookup>(lookupsToBeAdded);
  all.addAll(addition);
  // a null flag is treated as "off" instead of failing on unboxing
  boolean splitWords = Boolean.TRUE.equals(considerHeuristicRules);
  for(Lookup aLookup : all) {
    String root = listRoots.get(aLookup.list);
    if(root == null) {
      logger.info("root is null for lookup:" + aLookup);
      continue;
    }
    if(splitWords && root.contains(" ")) {
      /* split the words and attach a heuristic level to each suffix */
      long hLevel = 0;
      addLookup(root.trim(), copyWithHeuristics(aLookup, hLevel, root));
      logger.info("NEW ENTRY: " + root + "\n");
      String newRoot = root.trim();
      int firstIndex = newRoot.indexOf(" ");
      while(firstIndex >= 0) {
        // drop the leading word; trimming keeps the index computed below
        // in step with the string it is applied to, even when the root
        // contains consecutive spaces
        newRoot = newRoot.substring(firstIndex + 1).trim();
        hLevel++;
        addLookup(newRoot, copyWithHeuristics(aLookup, hLevel, newRoot));
        logger.info("NEW ENTRY: " + newRoot + "\n");
        firstIndex = newRoot.indexOf(" ");
      }
    } else {
      // no spaces in the root, or heuristic rules switched off
      if(aLookup.features == null) {
        aLookup.features = new HashMap<String, Object>();
      }
      aLookup.features.put(CATConstants.FEATURE_HEURISTIC_LEVEL, 0L);
      addLookup(root.trim(), aLookup);
      logger.info("NEW ENTRY: " + root + "\n");
    }
  }
}

/** Creates a copy of the lookup (same entry, copied features) carrying the given heuristic level and value. */
private Lookup copyWithHeuristics(Lookup source, long level, String value) {
  Lookup copy = new Lookup(source.list, "", null, null);
  Map<String, Object> features = new HashMap<String, Object>();
  if(source.features != null) {
    for(Object key : source.features.keySet()) {
      features.put((String)key, source.features.get(key));
    }
  }
  features.put(CATConstants.FEATURE_HEURISTIC_LEVEL, level);
  features.put(CATConstants.FEATURE_HEURISTIC_VALUE, value);
  copy.features = features;
  return copy;
}
/**
* this list is populated during the processing of all lookups, when some
* entries have multiple interpretations; for example, when processing
* Project_Name, 'Project_Name' would be processed in the first iteration,
* while 'Project Name' would be added to the additionalList for later
* processing. Variants are queued by runRootFinderApplication() (underscores
* replaced by spaces, camelCased words separated) and taken over by
* addLookups().
*/
List<Lookup> additionalList = new ArrayList<Lookup>();
/**
 * Determines the root form of the entry (lookup.list) of every given lookup
 * and records it in listRoots, keyed by the entry text.
 * <p>
 * For each lookup with a non-blank entry, a hidden one-entry document is
 * created, the root finder application is executed over it and the roots of
 * the resulting tokens are joined into the root of the entry. Variants of an
 * entry (underscores replaced by spaces and, if separateCamelCasedWords is
 * set, camelCased words separated) are queued in additionalList so that the
 * caller can process them later with this same method.
 *
 * @param lookups
 *          the lookups to process; null is treated as empty
 * @return the lookups that have a non-blank entry, in their original order
 * @throws ResourceInstantiationException
 *           if no root finder application is available, if it fails to
 *           execute, or if a token is left without a root feature
 */
private List<Lookup> runRootFinderApplication(List<Lookup> lookups)
  throws ResourceInstantiationException {
  List<Lookup> lookupsToBeReturned = new ArrayList<Lookup>();
  if(lookups == null) return lookupsToBeReturned;
  Pattern camelCase = Pattern.compile(CATConstants.REGEX_CAMEL_CASE);
  for(Lookup lookup : lookups) {
    String list = lookup.list;
    if(list == null || list.trim().length() == 0) continue;
    if(list.contains("_")) {
      queueVariant(list.replace('_', ' '), lookup);
    }
    // if text is camel cased add space between words
    if(Boolean.TRUE.equals(separateCamelCasedWords)) {
      String separatedCamelCase = camelCase.matcher(list).replaceAll("$1 $2");
      if(!list.equals(separatedCamelCase)) {
        queueVariant(separatedCamelCase, lookup);
      }
    }
    lookupsToBeReturned.add(lookup);
    recordRoot(list);
  }
  return lookupsToBeReturned;
}

/** Queues a variant spelling of a lookup's entry, sharing the lookup's features, for later processing. */
private void queueVariant(String text, Lookup source) {
  Lookup variant = new Lookup(text, "", null, null);
  variant.features = source.features;
  additionalList.add(variant);
}

/**
 * Runs the root finder application over a hidden document containing the
 * given text and stores the resulting root in listRoots. The temporary
 * document is removed from the working corpus and deleted again even when
 * processing fails.
 */
private void recordRoot(String text) throws ResourceInstantiationException {
  if(rootFinderApplication == null) {
    throw new ResourceInstantiationException(
      "No rootFinderApplication provided, and no tokeniser, POS tagger and "
        + "morpher to construct one from!");
  }
  if(applicationCorpus == null) {
    // the working corpus only exists while init() is running
    throw new ResourceInstantiationException(
      "The working corpus is not available; roots can only be found while "
        + "the gazetteer is being initialised.");
  }
  /* set new documents to be hidden inside the GATE GUI */
  FeatureMap docParams = Factory.newFeatureMap();
  docParams.put("stringContent", text);
  FeatureMap docFeatures = Factory.newFeatureMap();
  Gate.setHiddenAttribute(docFeatures, true);
  Document aDocument = null;
  try {
    aDocument =
      (Document)Factory.createResource("gate.corpora.DocumentImpl",
        docParams, docFeatures);
    // add spanning Sentence annotation
    aDocument.getAnnotations().add(Utils.start(aDocument),
      Utils.end(aDocument), SENTENCE_ANNOTATION_TYPE,
      Factory.newFeatureMap());
    applicationCorpus.add(aDocument);
    // be nice - save the old corpus parameter from the RFA and restore
    // it after running
    Corpus rootFinderCorpus = rootFinderApplication.getCorpus();
    try {
      rootFinderApplication.setCorpus(applicationCorpus);
      rootFinderApplication.execute();
    } finally {
      rootFinderApplication.setCorpus(rootFinderCorpus);
    }
    listRoots.put(aDocument.getContent().toString(),
      assembleRoot(aDocument));
  } catch(ExecutionException | InvalidOffsetException ee) {
    throw new ResourceInstantiationException(ee);
  } finally {
    applicationCorpus.clear();
    if(aDocument != null) Factory.deleteResource(aDocument);
  }
}

/**
 * Joins the "root" features of the document's tokens, in offset order, into
 * the root of the whole text; runs of space tokens become a single space.
 */
private String assembleRoot(Document document)
  throws ResourceInstantiationException {
  Set<String> tokenTypes = new HashSet<String>();
  tokenTypes.add(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
  tokenTypes.add(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE);
  List<Annotation> tokenList =
    new ArrayList<Annotation>(document.getAnnotations().get(tokenTypes));
  Collections.sort(tokenList, offsetComparator);
  boolean stopAtPreposition = Boolean.TRUE.equals(considerHeuristicRules);
  StringBuilder rootForText = new StringBuilder();
  boolean lastAnnWasSpace = false;
  for(Annotation ann : tokenList) {
    String type = ann.getType();
    if(ANNIEConstants.TOKEN_ANNOTATION_TYPE.equals(type)) {
      lastAnnWasSpace = false;
      String category =
        (String)ann.getFeatures().get(
          ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
      /*
       * category "IN" means it is a preposition, and these are used as stop
       * words, so crop everything afterwards, but ONLY if parameter
       * considerHeuristicRules is set to true; a token without a category
       * is never a stop word
       */
      if(stopAtPreposition && "IN".equals(category)) break;
      String root = (String)ann.getFeatures().get("root");
      if(root == null) {
        throw new ResourceInstantiationException(
          "No root found for annotation " + ann.toString());
      }
      rootForText.append(root);
    } else if(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE.equals(type)) {
      if(!lastAnnWasSpace) {
        rootForText.append(' ');
      }
      lastAnnWasSpace = true;
    } else {
      // malfunction
      throw new ResourceInstantiationException("Invalid annotation type: "
        + ann);
    }
  }
  return rootForText.toString();
}
/**
* Sets the old-style morphological analyser.
*
* @param morpher
* the morphological analyser to store
* @deprecated init() only uses this, together with the tokeniser and POS
* tagger, when no root finder application is set; use
* {@link #setRootFinderApplication(CorpusController)} instead.
*/
@Deprecated
public void setMorpher(Morph morpher) {
this.morpher = morpher;
}
/**
* Sets the old-style POS tagger.
*
* @param posTagger
* the POS tagger to store
* @deprecated init() only uses this, together with the tokeniser and morpher,
* when no root finder application is set; use
* {@link #setRootFinderApplication(CorpusController)} instead.
*/
@Deprecated
public void setPosTagger(POSTagger posTagger) {
this.posTagger = posTagger;
}
/**
* Sets the old-style tokeniser.
*
* @param tokeniser
* the tokeniser to store
* @deprecated init() only uses this, together with the POS tagger and
* morpher, when no root finder application is set; use
* {@link #setRootFinderApplication(CorpusController)} instead.
*/
@Deprecated
public void setTokeniser(DefaultTokeniser tokeniser) {
this.tokeniser = tokeniser;
}
/**
 * @return the application used to find the roots of the gazetteer entries,
 *         or null if none has been set or constructed
 */
public CorpusController getRootFinderApplication() {
  return this.rootFinderApplication;
}
/**
* Sets the application that is executed over every candidate entry to find
* its root form.
*
* @param rootFinderApplication
* the application to use
*/
@CreoleParameter(comment = "Application to find the roots that go into the "
+ "gazetteer. Executed over a document containing a single "
+ SENTENCE_ANNOTATION_TYPE + " annotation spanning the whole document, "
+ "this application should produce Token annotations with category (POS "
+ "tag) and root (morphological root) features. Typically that means a "
+ "tokeniser, POS tagger and morphological analyser, but any "
+ "application producing the correct features will work")
public void setRootFinderApplication(CorpusController rootFinderApplication) {
this.rootFinderApplication = rootFinderApplication;
}
/**
 * @return the ontology the gazetteer entries are derived from
 */
public Ontology getOntology() {
  return this.ontology;
}
/**
* Sets the ontology the gazetteer entries are derived from; init() fails when
* no ontology has been set.
*
* @param ontology
* the ontology to use
*/
@CreoleParameter(comment = "The ontology to be used")
public void setOntology(Ontology ontology) {
this.ontology = ontology;
}
/**
 * @return whether property values are turned into gazetteer entries (may be
 *         null before init() has applied the default)
 */
public Boolean getConsiderProperties() {
  return this.considerProperties;
}
/**
* Sets whether property values are turned into gazetteer entries.
*
* @param considerProperties
* the flag to store; init() replaces null by true
*/
@CreoleParameter(comment = "Should this gazetteer consider properties or not",
defaultValue = "true")
public void setConsiderProperties(Boolean considerProperties) {
this.considerProperties = considerProperties;
}
/**
 * @return whether the short names of resource URIs are turned into gazetteer
 *         entries (may be null before init() has applied the default)
 */
public Boolean getUseResourceUri() {
  return this.useResourceUri;
}
/**
* Sets whether the short names of resource URIs are turned into gazetteer
* entries.
*
* @param useResourceUri
* the flag to store; init() replaces null by true
*/
@CreoleParameter(comment = "Should this gazetteer use resource URIs or not",
defaultValue = "true")
public void setUseResourceUri(Boolean useResourceUri) {
this.useResourceUri = useResourceUri;
}
/**
 * @return whether camelCased entries are additionally added with their words
 *         separated by spaces (may be null before init() has applied the
 *         default)
 */
public Boolean getSeparateCamelCasedWords() {
  return this.separateCamelCasedWords;
}
/**
* Sets whether camelCased entries are additionally added with their words
* separated by spaces.
*
* @param separateCamelCasedWords
* the flag to store; init() replaces null by true
*/
@CreoleParameter(comment = "Should this gazetteer separate camelCased "
+ "words, e.g. ProjectName into Project Name", defaultValue = "true")
public void setSeparateCamelCasedWords(Boolean separateCamelCasedWords) {
this.separateCamelCasedWords = separateCamelCasedWords;
}
/**
 * @return the comma separated names of the properties that are excluded when
 *         the gazetteer is initialised
 */
public String getPropertiesToExclude() {
  return this.propertiesToExclude;
}
/**
* Sets the properties that are excluded when the gazetteer is initialised.
*
* @param propertiesToExclude
* the property names, comma separated
*/
@Optional
@CreoleParameter(comment = "The list of property names to be excluded, "
+ "comma separated", defaultValue = "")
public void setPropertiesToExclude(String propertiesToExclude) {
this.propertiesToExclude = propertiesToExclude;
}
/**
 * @return the comma separated names of the properties that are considered
 *         when the gazetteer is initialised
 */
public String getPropertiesToInclude() {
  return this.propertiesToInclude;
}
/**
* Sets the properties that are considered when the gazetteer is initialised.
*
* @param propertiesToInclude
* the property names, comma separated
*/
@Optional
@CreoleParameter(comment = "The list of property names to be included, "
+ "comma separated", defaultValue = "")
public void setPropertiesToInclude(String propertiesToInclude) {
this.propertiesToInclude = propertiesToInclude;
}
/**
 * @return whether the heuristic rules (cropping an entry at a preposition and
 *         adding its word suffixes with increasing heuristic level) are
 *         applied (may be null before init() has applied the default)
 */
public Boolean getConsiderHeuristicRules() {
  return this.considerHeuristicRules;
}
/**
* Sets whether the heuristic rules are applied when entries are added to the
* gazetteer.
*
* @param considerHeuristicRules
* the flag to store; init() replaces null by false
*/
@CreoleParameter(comment = "Should this gazetteer consider several "
+ "heuristic rules or not: the words containing spaces will be split; "
+ "for example, if 'pos tagger for spanish' would be analysed, 'for' "
+ "would be consider as a stop word; heuristically derived would be "
+ "'pos tagger' and this would be further used to add 'pos tagger' with "
+ "heuristical level 0, and 'tagger' with hl 1 to the gazetteer list; "
+ "at runtime lower heuristical level should be prefered",
defaultValue = "false")
public void setConsiderHeuristicRules(Boolean considerHeuristicRules) {
this.considerHeuristicRules = considerHeuristicRules;
}
/**
 * Supplies a linear definition for this gazetteer. A fresh, empty definition
 * is created on every call; the method exists so that Gaze does not complain
 * when rendering views and showing initialisation parameters.
 *
 * @return a new, empty linear definition
 */
public LinearDefinition getLinearDefinition() {
  LinearDefinition emptyDefinition = new LinearDefinition();
  return emptyDefinition;
}
// hidden parameters: inherited gazetteer parameters that are re-declared here
// only to mark them as hidden; each setter simply delegates to the superclass
/**
* Passes the lists URL on to the superclass.
*
* @param u
* the lists URL
*/
@HiddenCreoleParameter
public void setListsURL(URL u) {
super.setListsURL(u);
}
/**
* Passes the gazetteer feature separator on to the superclass; the parameter
* is marked as hidden for this resource.
*
* @param sep
* the feature separator
*/
@HiddenCreoleParameter
public void setGazetteerFeatureSeparator(String sep) {
super.setGazetteerFeatureSeparator(sep);
}
/**
* Passes the encoding on to the superclass; the parameter is marked as hidden
* for this resource.
*
* @param encoding
* the encoding name
*/
@HiddenCreoleParameter
public void setEncoding(String encoding) {
super.setEncoding(encoding);
}
}