PDF-Archiver/PDF-Archiver

View on GitHub
ArchiveCore/Sources/ArchiveBackend/Helpers/TagParser.swift

Summary

Maintainability
A
55 mins
Test Coverage
//
//  TagParser.swift
//  ArchiveLib
//
//  Created by Julian Kahnert on 28.12.18.
//
// Example from: https://developer.apple.com/documentation/naturallanguage/identifying_people_places_and_organizations

import Foundation
import NaturalLanguage

/// Extracts tag candidates from free-form text.
public enum TagParser {

    /// Separator handed to the slugifier; also used to spot multi-part slugs.
    private static let separator = "-"

    /// Get tag names from a string.
    ///
    /// Personal, organization and place names are detected with `NLTagger`,
    /// lowercased and slugified. A candidate is kept only when its slug has
    /// more than two characters and does not contain the separator.
    ///
    /// - Parameter text: Raw string which might contain some tags.
    /// - Returns: Found tag names. The set is empty when the platform is older
    ///   than iOS 12 / macOS 10.14, because no tagging is attempted there.
    public static func parse(_ text: String) -> Set<String> {
        guard #available(iOS 12.0, OSX 10.14, *) else { return [] }

        let acceptedTypes: [NLTag] = [.personalName, .organizationName, .placeName]
        let taggerOptions: NLTagger.Options = [.omitPunctuation, .omitWhitespace, .omitOther, .joinContractions]

        let tagger = NLTagger(tagSchemes: [.nameType])
        tagger.string = text

        var foundNames = Set<String>()
        let wholeText = text.startIndex..<text.endIndex
        tagger.enumerateTags(in: wholeText, unit: .word, scheme: .nameType, options: taggerOptions) { tag, tokenRange in
            // Skip tokens that are not one of the accepted name types; every
            // path returns `true` so the enumeration always runs to the end.
            guard let tag = tag, acceptedTypes.contains(tag) else { return true }

            // Normalize the token into its slug form.
            let slug = String(text[tokenRange]).lowercased().slugified(withSeparator: separator)

            // Validate the candidate:
            // * it must not contain the separator, since that hints at a
            //   duplicate, e.g. "zalando" vs. "zalando se"
            // * it must have more than 2 characters
            let isSinglePart = !slug.contains(separator)
            let isLongEnough = slug.count > 2
            if isSinglePart && isLongEnough {
                foundNames.insert(slug)
            }
            return true
        }

        return foundNames
    }
}