phinze/homebrew-cask

View on GitHub
developer/bin/import_google_fonts

Summary

Maintainability
Test Coverage
#!/usr/bin/env python3
#
# import_google_fonts
#
# Using Python rather than Ruby for the Protocol Buffer parser.
# https://github.com/protocolbuffers/protobuf/issues/6508#issuecomment-522165498
#
# To install dependencies:
#
#   pip3 install gftools html2text jinja2 protobuf

from functools import reduce
from glob import glob
import os
import re
import sys
from google.protobuf import text_format
import gftools.fonts_public_pb2 as fonts_pb2
import jinja2
import html2text
import urllib.request

def parse_metadata(filename):
    """Parse a text-format ``METADATA.pb`` file into a ``FamilyProto``.

    Based off of
    https://github.com/googlefonts/gftools/blob/2bfd4acd402b353aaeb46b991e6cad855001e4c8/Lib/gftools/util/google_fonts.py

    Args:
        filename: Path of the text-format protobuf file to read.

    Returns:
        A ``fonts_pb2.FamilyProto`` with the file's contents merged into it.
    """
    # Decode explicitly as UTF-8 instead of the locale default so that
    # non-ASCII metadata is read the same way on every machine.
    with open(filename, encoding="utf-8") as f:
        contents = f.read()

    meta = fonts_pb2.FamilyProto()
    text_format.Merge(contents, meta)
    return meta


class FontCask:
    """Renders the cask file for a single font family.

    Attributes are plain values set by the constructor:

    * ``folder`` - family folder, as interpolated into the cask's ``url``.
    * ``meta`` - metadata object providing ``name`` and ``fonts``.
    * ``desc`` - optional description text.
    * ``early_access`` - whether to use the early-access homepage.
    * ``homepage_override`` - when not ``None``, returned by ``homepage()``
      instead of the computed URL.
    """

    ENVIRONMENT = jinja2.Environment(
        keep_trailing_newline=True, trim_blocks=True, undefined=jinja2.StrictUndefined,
    )
    TEMPLATE = ENVIRONMENT.from_string(
        """cask "{{token}}" do
  version :latest
  sha256 :no_check

{% if files|length == 1 %}
  url "https://github.com/google/fonts/raw/main/{{folder}}/{{files[0] | urlencode}}"
      {%- if not 'github.com/' in homepage %},
      verified: "github.com/google/fonts/"
      {%- endif +%}
{% else %}
  url "https://github.com/google/fonts.git",
      {%- if not 'github.com/' in homepage +%}
      verified:  "github.com/google/fonts",
      {%- endif +%}
      branch:    "main",
      only_path: "{{folder}}"
{% endif %}
  name "{{font_name}}"
  homepage "{{homepage}}"

{% for file in files %}
  font "{{file}}"
{% endfor %}

  # No zap stanza required
end
"""
    )

    # Prefix carried by every token produced by ``token()``.
    TOKEN_PREFIX = "font-"

    def __init__(self, folder, meta, description=None, early_access=False):
        self.folder = folder
        self.meta = meta
        self.desc = description
        self.early_access = early_access
        self.homepage_override = None

    def font_name(self):
        """Return the family name stored in the metadata."""
        return self.meta.name

    def description(self):
        """Return the description, or ``None`` when it is missing or empty."""
        return self.desc or None

    def token(self):
        """Return the cask token: ``font-`` plus the lower-cased, dashed name."""
        # https://github.com/Homebrew/homebrew-cask-fonts/blob/master/CONTRIBUTING.md#converting-the-canonical-name-to-a-token
        token = self.font_name().lower().replace(" ", "-")
        return f"{self.TOKEN_PREFIX}{token}"

    def token_sharding_dir(self):
        """Return the shard directory: ``font-`` plus the name's first character.

        The prefix is removed by slicing rather than by splitting on
        ``"font-"``, so a family whose own name begins with "font " (token
        ``font-font-...``) is still sharded by its first character.
        """
        name_part = self.token()[len(self.TOKEN_PREFIX):]
        return self.TOKEN_PREFIX + name_part[0]

    def dest_path(self):
        """Return the relative path of the cask file under ``Casks/font``."""
        return os.path.join(
            "Casks", "font", self.token_sharding_dir(), f"{self.token()}.rb"
        )

    def name_path(self):
        """Return the family name with spaces replaced by ``+`` for URLs."""
        return self.font_name().replace(" ", "+")

    def homepage(self):
        """Return the homepage URL, honouring ``homepage_override`` first."""
        if self.homepage_override is not None:
            return self.homepage_override

        if self.early_access:
            return "https://fonts.google.com/earlyaccess"

        return f"https://fonts.google.com/specimen/{self.name_path()}"

    def files(self):
        """Return the font file names from the metadata, sorted."""
        return sorted(font.filename for font in self.meta.fonts)

    def cask_content(self):
        """Render ``TEMPLATE`` for this family and return the cask text."""
        # ``desc`` is passed to the renderer although the template above does
        # not currently reference it.
        return self.TEMPLATE.render(
            token=self.token(),
            folder=self.folder,
            font_name=self.font_name(),
            desc=self.description(),
            homepage=self.homepage(),
            files=self.files(),
        )


def is_other_foundry(cask_path):
    """Return True when the cask file has no ``url`` pointing at google/fonts on GitHub."""
    with open(cask_path) as cask_file:
        google_url = re.search(
            r"url ['\"]https://github.com/google/fonts", cask_file.read()
        )

    return google_url is None


def should_skip(cask_path):
    """Return True when an existing cask at ``cask_path`` must not be overwritten.

    That is the case only when the file exists and ``is_other_foundry``
    reports it as coming from another foundry; a notice is printed then.
    """
    if not os.path.exists(cask_path):
        return False

    if not is_other_foundry(cask_path):
        return False

    # Cask already exists and comes from another foundry: don't overwrite it, per
    # https://github.com/Homebrew/homebrew-cask-fonts/blob/master/CONTRIBUTING.md#google-web-font-directory
    print("Other foundry:", cask_path)
    return True


def metadata_to_cask(meta_file, repo_dir):
    """Build a ``FontCask`` from a family's metadata file.

    If a ``DESCRIPTION.en_us.html`` file sits next to the metadata file, a
    short description is extracted from it: the text following
    "<family name> ... is [a|an|the]" up to the first full stop, capitalized.

    Args:
        meta_file: Path of the family's metadata file.
        repo_dir: Root of the fonts checkout; the cask folder is made
            relative to it.

    Returns:
        A ``FontCask`` for the family; its description is ``None`` when no
        description file exists or no matching sentence is found.
    """
    folder = os.path.dirname(os.path.relpath(meta_file, start=repo_dir))
    meta = parse_metadata(meta_file)

    description_path = os.path.join(os.path.dirname(meta_file), "DESCRIPTION.en_us.html")

    description = None

    if os.path.exists(description_path):
        h2t = html2text.HTML2Text()
        h2t.ignore_links = True
        h2t.ignore_images = True
        h2t.ignore_tables = True
        h2t.ignore_emphasis = True

        # Decode explicitly as UTF-8 rather than relying on the locale default.
        with open(description_path, encoding="utf-8") as f:
            text = h2t.handle(f.read())

        # Drop double quotes and collapse all whitespace runs to single spaces.
        contents = " ".join(text.replace('"', '').split())

        # The optional article must be grouped as (?:an?|the) so that each
        # alternative is preceded by whitespace; written as (?:\s+an?|the)
        # the "the" branch could only match when glued directly to "is".
        regex = r".*" + re.escape(meta.name) + r"\s+(?:.*\s+)?is(?:\s+(?:an?|the))?\s+"
        parts = re.split(regex, contents, maxsplit=1)

        if len(parts) > 1:
            description = parts[1].split(".")[0].capitalize()

    return FontCask(folder, meta, description=description)


def write_cask(cask):
    """Write the rendered cask to its destination path when something changed.

    Returns ``False`` without writing when ``should_skip`` rejects the path
    or the file already holds exactly the rendered content; otherwise the
    file (and its directory, if missing) is created and ``True`` is returned.
    """
    target = cask.dest_path()
    if should_skip(target):
        return False

    rendered = cask.cask_content()

    # Leave an up-to-date file untouched.
    if os.path.exists(target):
        with open(target, "r") as existing:
            unchanged = existing.read() == rendered
        if unchanged:
            return False

    parent = os.path.dirname(target)
    if not os.path.exists(parent):
        os.makedirs(parent)

    with open(target, "w") as out:
        out.write(rendered)

    return True


def find_google_casks():
    """Collect the existing casks under ``Casks/font`` that reference Google Fonts.

    Returns a dict keyed by cask token (the file name without extension);
    each value holds the cask's ``path`` plus the ``description`` and
    ``homepage`` strings found in the file, or ``None`` where absent.
    """
    found = {}

    for cask_path in glob('Casks/font/**/*.rb'):
        file_name = os.path.basename(cask_path)
        token = os.path.splitext(file_name)[0]

        with open(cask_path, "r") as handle:
            contents = handle.read()

        # Skip "font-material-symbols" as it matches the url regex, but is not included in the Google Fonts repo
        if file_name == "font-material-symbols.rb":
            continue

        mentions_google = re.search(
            r"(github\.com\/google\/fonts|fonts\.google\.com|google\.com/fonts)", contents
        )
        if mentions_google is None:
            continue

        homepage_match = re.search(r"homepage\s+([\"'])(.*)(\1)\s*", contents)
        desc_match = re.search(r"desc\s+([\"'])(.*)(\1)\s*", contents)

        found[token] = {
            'path': cask_path,
            'description': desc_match[2] if desc_match else None,
            'homepage': homepage_match[2] if homepage_match else None,
        }

    return found

def find_family_folders(repo_dir):
    """Return every entry directly inside the ``apache``, ``ofl`` and ``ufl`` folders of ``repo_dir``."""
    license_dirs = ("apache", "ofl", "ufl")
    # One flat list, license folder by license folder.
    return [
        entry
        for license_dir in license_dirs
        for entry in glob(os.path.join(repo_dir, license_dir, "*"))
    ]


# https://www.geeksforgeeks.org/python-split-camelcase-string-to-individual-strings/
def camel_case_split(str):
    """Return the capitalized words of a CamelCase string, in order.

    A run of capitals is kept together up to the capital that starts the
    next word, e.g. ``"PTSans"`` gives ``["PT", "Sans"]``. Text before the
    first capital letter is not returned.
    """
    word = re.compile(r"[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))")
    return [found.group(0) for found in word.finditer(str)]


def derive_name(font_file):
    """Derive a space-separated family name from the path of a font file.

    The file's stem is cut to the length of its parent directory's name, any
    trailing ``-suffix`` is removed, and the remaining CamelCase text is
    split into words joined by single spaces.
    """
    directory, file_name = os.path.split(font_file)
    family_dir = os.path.basename(directory)
    stem, _extension = os.path.splitext(file_name)

    candidate = stem[: len(family_dir)]
    candidate = re.sub(r"-\w+$", "", candidate)

    return " ".join(camel_case_split(candidate))


def derive_metadata(family_folder):
    """Create a metadata object based on the contents of the folder.

    The family name is derived from one of the folder's ``.ttf`` files and
    every ``.ttf`` file is recorded as a font of the family.

    Args:
        family_folder: Path of the folder holding the family's font files.

    Returns:
        A ``fonts_pb2.FamilyProto`` with ``name`` and ``fonts`` filled in.

    Raises:
        ValueError: If the folder contains no ``.ttf`` files.
    """
    # Sort so that the font used for the name, and the order of the recorded
    # fonts, do not depend on the filesystem's listing order.
    font_files = sorted(glob(os.path.join(family_folder, "*.ttf")))
    if not font_files:
        raise ValueError(f"No .ttf files found in {family_folder}")

    meta = fonts_pb2.FamilyProto()
    # Any font of the family will do for the name; take the first.
    meta.name = derive_name(font_files[0])

    fonts = [
        fonts_pb2.FontProto(filename=os.path.basename(filename))
        for filename in font_files
    ]
    meta.fonts.extend(fonts)

    return meta


def derive_cask(family_folder, repo_dir):
    """Build a ``FontCask`` for a folder, deriving its metadata from the folder contents.

    The cask is flagged as early access when the folder contains an
    ``EARLY_ACCESS.category`` file.
    """
    family_meta = derive_metadata(family_folder)
    relative_folder = os.path.relpath(family_folder, start=repo_dir)
    is_early_access = os.path.exists(
        os.path.join(family_folder, "EARLY_ACCESS.category")
    )

    return FontCask(relative_folder, family_meta, early_access=is_early_access)


def _homepage_not_found(url):
    """Return True only when requesting ``url`` fails with an HTTP 404."""
    try:
        # Only the status matters; close the response straight away.
        with urllib.request.urlopen(url):
            return False
    except urllib.request.URLError as e:
        # Only HTTPError carries ``code``; a plain URLError (e.g. a failed
        # connection) has none and must not be treated as a 404.
        return getattr(e, "code", None) == 404


def run():
    """Entry point: import font families from a fonts checkout as casks.

    Expects two command-line arguments, the path of the checkout and a mode.
    Mode ``add`` writes casks for families without an existing cask,
    ``update`` rewrites existing ones, and ``delete`` removes existing casks
    that were not matched to any family; an empty mode does all three.
    Prints usage and exits with status 1 on a wrong argument count.
    """
    if len(sys.argv) != 3:
        print(
            """Usage: ./import_google_fonts <path-to-repo> <mode>

Download the or clone the repository from https://github.com/google/fonts, then provide the path to the script.
    """
        )
        sys.exit(1)

    repo_dir = sys.argv[1]
    mode = sys.argv[2]
    family_folders = find_family_folders(repo_dir)

    existing_casks = find_google_casks()
    added_casks = {}
    updated_casks = {}

    for family_folder in family_folders:
        meta_file = os.path.join(family_folder, "METADATA.pb")
        # check if the metadata file is present
        # https://github.com/google/fonts/issues/2512
        if os.path.exists(meta_file):
            try:
                cask = metadata_to_cask(meta_file, repo_dir)
            except text_format.ParseError:
                continue
        else:
            cask = derive_cask(family_folder, repo_dir)

        token = cask.token()

        # Ek Mukta has been renamed to just Mukta but still exists.
        if token == 'font-ek-mukta':
            continue

        # Skip cask if already handled (i.e. if it exists in multiple license sub-directories).
        if token in added_casks or token in updated_casks:
            continue

        existing_cask = existing_casks.pop(token, None)
        if existing_cask:
            cask.desc = existing_cask['description']

            # If font is unreleased, re-use previous homepage URL.
            if cask.homepage() != existing_cask['homepage']:
                if _homepage_not_found(cask.homepage()):
                    cask.homepage_override = existing_cask['homepage']

            updated_casks[token] = cask
        else:
            # No specimen page yet: fall back to the source repository URL.
            if _homepage_not_found(cask.homepage()):
                cask.homepage_override = cask.meta.source.repository_url

            added_casks[token] = cask

    changed_casks = {}

    print("Mode " + mode)

    if not mode or mode == 'add':
        print("Adding added casks")

        changed_casks |= added_casks

    if not mode or mode == 'update':
        print("Adding updated casks")
        changed_casks |= updated_casks

    written_casks = 0
    for cask in changed_casks.values():
        if write_cask(cask):
            written_casks += 1

        # Limit cask changes per PR.
        if mode is not None and written_casks >= 50:
            break

    if not mode or mode == 'delete':
        # Delete casks which don't exist anymore.
        for deleted_cask in existing_casks.values():
            os.remove(deleted_cask["path"])

# Only run when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    run()