components/triplestowikipageconv.go
package components
import (
"regexp"
str "strings"
"github.com/knakk/rdf"
)
// Constants etc ---------------------------------------------------------------
var titleProperties = []string{
"http://semantic-mediawiki.org/swivt/1.0#page",
"http://www.w3.org/2000/01/rdf-schema#label",
"http://purl.org/dc/elements/1.1/title",
"http://purl.org/dc/terms/title",
"http://www.w3.org/2004/02/skos/core#preferredLabel",
"http://xmlns.com/foaf/0.1/name",
}
var namespaceAbbreviations = map[string]string{
"http://www.opentox.org/api/1.1#": "opentox",
}
var propertyTypes = []string{
"http://www.w3.org/2002/07/owl#AnnotationProperty",
"http://www.w3.org/2002/07/owl#DatatypeProperty",
"http://www.w3.org/2002/07/owl#ObjectProperty",
}
var categoryTypes = []string{
"http://www.w3.org/2002/07/owl#Class",
}
const (
typePropertyURI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
subClassPropertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
)
const (
dataTypeURIString = "http://www.w3.org/2001/XMLSchema#string"
dataTypeURILangString = "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"
dataTypeURIInteger = "http://www.w3.org/2001/XMLSchema#integer"
dataTypeURIFloat = "http://www.w3.org/2001/XMLSchema#float"
)
const (
_ = iota
URITypeUndefined
URITypePredicate
URITypeClass
URITypeTemplate
)
// Code -----------------------------------------------------------------------
// TripleAggregateToWikiPageConverter takes *TripleAggregate's and converts
// them into a *WikiPage which can be used to generate wiki text content.
type TripleAggregateToWikiPageConverter struct {
InAggregate chan *TripleAggregate
InIndex chan *map[string]*TripleAggregate
OutPage chan *WikiPage
cleanUpRegexes []*regexp.Regexp
}
func NewTripleAggregateToWikiPageConverter() *TripleAggregateToWikiPageConverter {
return &TripleAggregateToWikiPageConverter{
InAggregate: make(chan *TripleAggregate, BUFSIZE),
InIndex: make(chan *map[string]*TripleAggregate, BUFSIZE),
OutPage: make(chan *WikiPage, BUFSIZE),
cleanUpRegexes: []*regexp.Regexp{
regexp.MustCompile(" [(][^)]*:[^)]*[)]"),
regexp.MustCompile(" [[][^]]*:[^]]*[]]"),
},
}
}
func (p *TripleAggregateToWikiPageConverter) Run() {
defer close(p.OutPage)
predPageIndex := make(map[string]*WikiPage)
resourceIndex := <-p.InIndex
for aggr := range p.InAggregate {
pageType := p.determineType(aggr)
pageTitle, _ := p.convertUriToWikiTitle(aggr.SubjectStr, pageType, resourceIndex)
page := NewWikiPage(pageTitle, []*Fact{}, []*Category{}, nil, pageType)
topSuperCatsCnt := 0
for _, tr := range aggr.Triples {
predTitle, propertyStr := p.convertUriToWikiTitle(tr.Pred.String(), URITypePredicate, resourceIndex) // Here we know it is a predicate, simply because its location in a triple
// Make sure property page exists
if predPageIndex[predTitle] == nil {
predPageIndex[predTitle] = NewWikiPage(predTitle, []*Fact{}, []*Category{}, nil, URITypePredicate)
}
var valueStr string
if tr.Obj.Type() == rdf.TermIRI {
valueAggr := (*resourceIndex)[tr.Obj.String()]
valueUriType := p.determineType(valueAggr)
_, valueStr = p.convertUriToWikiTitle(tr.Obj.String(), valueUriType, resourceIndex)
predPageIndex[predTitle].AddFactUnique(NewFact("Has type", "Page"))
} else if tr.Obj.Type() == rdf.TermLiteral {
valueStr = tr.Obj.String()
for _, r := range p.cleanUpRegexes {
valueStr = r.ReplaceAllString(valueStr, "")
}
dataTypeStr := tr.Obj.(rdf.Literal).DataType.String()
// Add type info on the current property's page
switch dataTypeStr {
case dataTypeURIString:
predPageIndex[predTitle].AddFactUnique(NewFact("Has type", "Text"))
case dataTypeURILangString:
predPageIndex[predTitle].AddFactUnique(NewFact("Has type", "Text"))
case dataTypeURIInteger:
predPageIndex[predTitle].AddFactUnique(NewFact("Has type", "Number"))
case dataTypeURIFloat:
predPageIndex[predTitle].AddFactUnique(NewFact("Has type", "Number"))
}
}
if tr.Pred.String() == typePropertyURI || tr.Pred.String() == subClassPropertyURI {
page.AddCategoryUnique(NewCategory(valueStr))
superCatsCnt := p.countSuperCategories(tr, resourceIndex)
if superCatsCnt > topSuperCatsCnt {
topSuperCatsCnt = superCatsCnt
page.SpecificCategory = NewCategory(valueStr)
//println("Page:", page.Title, " | Adding cat", valueStr, "since has", superCatsCnt, "super categories.")
}
} else {
page.AddFactUnique(NewFact(propertyStr, valueStr))
}
}
// Add Equivalent URI fact
equivURIFact := NewFact("Equivalent URI", aggr.Subject.String())
page.AddFactUnique(equivURIFact)
// Don't send predicates just yet (we want to gather facts about them,
// and send at the end) ...
if pageType == URITypePredicate {
if predPageIndex[page.Title] != nil {
// Add facts and categories to existing page
for _, fact := range page.Facts {
predPageIndex[page.Title].AddFactUnique(fact)
}
for _, cat := range page.Categories {
predPageIndex[page.Title].AddCategoryUnique(cat)
}
} else {
// If page does not exist, use the newly created one
predPageIndex[page.Title] = page
}
} else {
p.OutPage <- page
}
}
for _, predPage := range predPageIndex {
p.OutPage <- predPage
}
}
func (p *TripleAggregateToWikiPageConverter) determineType(uriAggr *TripleAggregate) int {
if uriAggr != nil {
if uriAggr.Triples != nil {
for _, tr := range uriAggr.Triples {
for _, propType := range propertyTypes {
if tr.Pred.String() == typePropertyURI && tr.Obj.String() == propType {
return URITypePredicate
}
}
for _, catType := range categoryTypes {
if tr.Pred.String() == typePropertyURI && tr.Obj.String() == catType {
return URITypeClass
}
}
}
}
}
return URITypeUndefined
}
// For properties, the factTitle and pageTitle will be different (The page
// title including the "Property:" prefix), while for normal pages, they will
// be the same.
func (p *TripleAggregateToWikiPageConverter) convertUriToWikiTitle(uri string, uriType int, resourceIndex *map[string]*TripleAggregate) (pageTitle string, factTitle string) {
aggr := (*resourceIndex)[uri]
// Conversion strategies:
// 1. Existing wiki title (in wiki, or cache)
// 2. Use configured title-deciding properties
if aggr != nil {
factTitle = p.findTitleInTriples(aggr.Triples)
}
// 3. Shorten URI namespace to alias (e.g. http://purl.org/dc -> dc:)
// (Does this apply for properties only?)
// 4. Remove namespace, keep only local part of URL (Split on '/' or '#')
if factTitle == "" {
bits := str.Split(uri, "#")
lastBit := bits[len(bits)-1]
bits = str.Split(lastBit, "/")
lastBit = bits[len(bits)-1]
factTitle = lastBit
}
// Clean up strange characters
factTitle = str.Replace(factTitle, "[", "(", -1)
factTitle = str.Replace(factTitle, "]", ")", -1)
factTitle = str.Replace(factTitle, "{", "(", -1)
factTitle = str.Replace(factTitle, "}", ")", -1)
factTitle = str.Replace(factTitle, "|", " ", -1)
factTitle = str.Replace(factTitle, "#", " ", -1)
factTitle = str.Replace(factTitle, "<", "less than", -1)
factTitle = str.Replace(factTitle, ">", "greater than", -1)
factTitle = str.Replace(factTitle, "?", " ", -1)
factTitle = str.Replace(factTitle, "&", " ", -1)
factTitle = str.Replace(factTitle, ",", " ", -1) // Can't allow comma's as we use it as a separator in template variables
factTitle = str.Replace(factTitle, ".", " ", -1)
factTitle = str.Replace(factTitle, "=", "-", -1)
// Clean up according to regexes
for _, r := range p.cleanUpRegexes {
factTitle = r.ReplaceAllString(factTitle, "")
}
// Limit to max 255 chars (due to MediaWiki limitation)
titleIsShortened := false
for len(factTitle) >= 250 {
factTitle = removeLastWord(factTitle)
titleIsShortened = true
}
if titleIsShortened {
factTitle += " ..."
}
factTitle = p.upperCaseFirst(factTitle)
if uriType == URITypePredicate {
pageTitle = "Property:" + factTitle
} else if uriType == URITypeClass {
pageTitle = "Category:" + factTitle
} else {
pageTitle = factTitle
}
return pageTitle, factTitle
}
func (p *TripleAggregateToWikiPageConverter) findTitleInTriples(triples []rdf.Triple) string {
for _, titleProp := range titleProperties {
for _, tr := range triples {
if tr.Pred.String() == titleProp {
return tr.Obj.String()
}
}
}
return ""
}
func (p *TripleAggregateToWikiPageConverter) countSuperCategories(tr rdf.Triple, ri *map[string]*TripleAggregate) int {
catPage := (*ri)[tr.Obj.String()]
topSuperCatsCnt := 0
if catPage != nil {
for _, subTr := range catPage.Triples {
if subTr.Pred.String() == typePropertyURI || subTr.Pred.String() == subClassPropertyURI {
superCatsCnt := p.countSuperCategories(subTr, ri) + 1
if superCatsCnt > topSuperCatsCnt {
topSuperCatsCnt = superCatsCnt
}
}
}
}
return topSuperCatsCnt
}
func (p *TripleAggregateToWikiPageConverter) upperCaseFirst(inStr string) string {
var outStr string
if inStr != "" {
outStr = str.ToUpper(inStr[0:1]) + inStr[1:]
}
return outStr
}
func removeLastWord(inStr string) string {
bits := str.Split(inStr, " ")
outStr := str.Join(append(bits[:len(bits)-1]), " ")
return outStr
}