bin/build_geolocation
#!/usr/bin/perl -w
#-------------------------------------------------------------------------------
# File: build_geolocation
#
# Description: Parse geonames files to create ExifTool geolocation database
#
# Syntax: build_geolocation [OPTIONS] [DBFILE] ...
#
# Options: (see -h output)
#
# Created: 2024-03-03 - P. Harvey
# 2024-04-15 - PH Clean up and add options for public release
# 2024-04-22 - PH Increased number of possible feature codes from
# 32 to 64. Convert backslashes in directory names
# 2024-04-24 - PH Fixed problem with population exponent when run
# under ActivePerl
# 2024-04-29 - PH Added feature types and default to db v1.03
#
# Notes: Requires these files from https://download.geonames.org/export/
#
# allCountries.txt (or other input database if specified)
# countryInfo.txt
# admin1CodesASCII.txt
# admin2Codes.txt
# featureCodes_XX.txt (optional)
# alternateNamesV2.txt (optional)
#
# Output database format (Geolocation.dat):
#
# Header:
# "GeolocationV.VV\tNNNN\n" (V.VV=version, NNNN=num city entries)
# "# <comment>\n"
# NNNN City entries:
# Offset Format Description
# 0 int16u - latitude high 16 bits (converted to 0-0x100000 range)
# 2 int8u - latitude low 4 bits, longitude low 4 bits
# 3 int16u - longitude high 16 bits
# 5 int8u - index of country in country list
# 6 int8u - 0xf0 = population E exponent (in format "N.Fe+0E"), 0x0f = population N digit
# 7 int16u - 0xf000 = population F digit, 0x0fff = index in region list (admin1)
# 9 int16u - v1.02: 0x7fff = index in subregion (admin2), 0x8000 = high bit of time zone
# 9 int16u - v1.03: index in subregion (admin2)
# 11 int8u - low byte of time zone index
# 12 int8u - 0x3f = feature code index (see below), v1.03: 0x80 = high bit of time zone
# 13 string - UTF8 City name, terminated by newline
# "\0\0\0\0\x01"
# Country entries:
# 1. 2-character country code
# 2. Country name, terminated by newline
# "\0\0\0\0\x02"
# Region entries:
# 1. Region name, terminated by newline
# "\0\0\0\0\x03"
# Subregion entries:
# 1. Subregion name, terminated by newline
# "\0\0\0\0\x04"
# Time zone entries:
# 1. Time zone name, terminated by newline
# "\0\0\0\0\x05" (feature codes added in v1.03)
# Feature codes:
# 1. Feature code, optional space-followed-by-feature-name, then newline
# "\0\0\0\0\0"
#
# Feature Codes v1.02: (see http://www.geonames.org/export/codes.html#P for descriptions)
#
# 0. Other 3. PPLA2 6. PPLA5 9. PPLF 12. PPLR 15. PPLX
# 1. PPL 4. PPLA3 7. PPLC 10. PPLG 13. PPLS
# 2. PPLA 5. PPLA4 8. PPLCH 11. PPLL 14. STLMT
#
# Feature Codes v1.03 and later are listed at the end of the database
#-------------------------------------------------------------------------------
use strict;
# --- output version and input/output file names ---
my $dbVer       = '1.03';                   # default output database version
my $dbFile      = 'allCountries.txt';       # default database file
my $countryFile = 'countryInfo.txt';        # mandatory country names file
my $regionFile  = 'admin1CodesASCII.txt';   # mandatory region names file
my $admin2File  = 'admin2Codes.txt';        # mandatory subregion names file
my $featureFile = 'featureCodes_en.txt';    # optional feature names file
my $altNamesFile = 'alternateNamesV2.txt';  # optional alternate names file
my $outFile     = 'Geolocation.dat';        # output ExifTool database file
my $outAltNames = 'AltNames.dat';           # output alternate names file
my $outDirName  = 'Geolocation_out';        # output directory for database files
my $geoLang     = 'GeoLang';                # output directory for language files

# per-database settings used when not overridden on the command line
my %defaults = (
    file       => $dbFile,
    minpop     => 2000,
    def_codes  => 'PPLA,PPLA2',
    def_codesp => 'PPL,PPLA,PPLA2,PPLA3,PPLA4,PPLA5,PPLC,PPLCH,PPLF,PPLG,PPLH,PPLL,PPLQ,PPLR,PPLS,PPLW,PPLX,STLMT',
);

# languages to read from geonames database
my @languages = qw(cs de en en-ca en-gb es fi fr it ja ko nl pl ru sk sv tr zh zh-cn zh-tw);

# indices of feature codes (v1.02 is hard-coded in ExifTool)
my @fc102 = qw(
    Other PPL PPLA PPLA2 PPLA3 PPLA4 PPLA5 PPLC PPLCH
    PPLF PPLG PPLL PPLR PPLS STLMT PPLX
);
# base features for v1.03+
my @fc103 = qw(
    Other PPL PPLA PPLA2 PPLA3 PPLA4 PPLA5 PPLC PPLCH
    PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLW PPLX STLMT
);

# active feature code list, plus a reverse lookup (code => index in list)
my @featureCodes = ($dbVer eq '1.02') ? @fc102 : @fc103;
my %featureCodes;
@featureCodes{@featureCodes} = (0 .. $#featureCodes);
my $i = scalar @featureCodes;   # general-purpose counter (reset to 0 before each later use)
my %featureNames;

my ($dbfile, @dbfiles, $outDir, $verbose, $noLang, %needRgn);

# options that take an argument
my %optArgs = map { $_ => 1 } qw(p c cp l o ver);
# process command-line arguments
# - options seen before a DBFILE are collected in $opts and attached to that
#   DBFILE's entry in @dbfiles (on top of %defaults), then $opts is reset
# - options remaining in $opts after the loop are handled by the code below
my $opts = { };
while (@ARGV) {
    my $opt = shift;
    if (not $opt =~ s/^-//) {
        # not an option, so this is an input database file or directory
        $opt = '.' unless length $opt;
        $opt =~ tr(\\)(/);      # use forward slashes
        $opt =~ s(/$)();        # remove trailing slash
        $opt = "$opt/$defaults{file}" if -d $opt;
        -e $opt or die "Error opening database $opt\n";
        push @dbfiles, { %defaults, %$opts, file => $opt };
        $opts = { };
        next;
    }
    my $arg;
    if ($optArgs{$opt}) {
        $arg = shift;
        defined $arg or die "Expecting argument for -$opt option\n";
    }
    if ($opt eq 'p') {
        # minimum population: "###" or "CC[.Region][,C2...]=###"
        $arg = uc $arg;
        if ($arg =~ /=/) {
            my ($ccList, $mp) = split /=/, $arg;
            # ($mp is undefined for an argument like "CA=")
            defined $mp and $mp =~ /^\d+$/ or die "Expecting number on rhs of '=' for -p option\n";
            foreach my $cc (split /,/, $ccList) {
                $cc =~ /^([A-Z]{2})(\..+)?$/ or die "Invalid country/region '$cc' for -p option\n";
                # remember that a region lookup is needed for this country
                $needRgn{$1} = $needRgn{$cc} = 1 if length $cc > 2;
                $$opts{cc_minpop}{$cc} = $mp;
            }
        } else {
            $arg =~ /^\d+$/ or die "Expecting number for -p option\n";
            $$opts{minpop} = $arg;
        }
    } elsif ($opt =~ /^c(p?)$/) {
        # feature codes to keep: -c (always) or -cp (if above minimum population)
        my $p = $1;     # '' for -c, 'p' for -cp
        $arg = uc $arg;
        my ($ccList, $co);
        if ($arg =~ /=/) {
            ($ccList, $co) = split /=/, $arg;
            $co = '' unless defined $co;    # ("CC=" gives an empty feature list)
        } else {
            ($ccList, $co) = ('??', $arg);
        }
        my $sign = $co =~ s/^([-+])// ? $1 : '';
        my @co = split /,/, $co;
        # store lookup for features to keep for each country ('??' = any country)
        foreach my $cc (split /,/, $ccList) {
            $cc =~ /^([A-Z]{2}|\?\?)(\..+)?$/ or die "Invalid country/region '$cc' for -$opt option\n";
            $needRgn{$1} = $needRgn{$cc} = 1 if length $cc > 2;
            if (not $sign) {
                # no sign: replace any existing list
                $$opts{"keep$p"}{$cc} = { };
            } elsif (not $$opts{"keep$p"}{$cc}) {
                # start from defaults
                my %codes = map { $_ => 1 } split /,/, $defaults{"def_codes$p"};
                $$opts{"keep$p"}{$cc} = \%codes;
            }
            foreach my $code (@co) {
                if ($sign eq '-') {
                    delete $$opts{"keep$p"}{$cc}{$code};
                } else {
                    $$opts{"keep$p"}{$cc}{$code} = 1;
                }
            }
        }
    } elsif ($opt eq 'l') {
        # languages: empty to disable, leading "-" to remove from the defaults
        my @langs = split ',', lc $arg;
        if (not @langs) {
            undef @languages;
            $noLang = 1;
        } elsif ($langs[0] =~ s/^-//) {
            # remove every listed language (not just the first one)
            my %remove = map { $_ => 1 } @langs;
            @languages = grep { not $remove{$_} } @languages;
        } else {
            @languages = @langs;
        }
    } elsif ($opt eq 'o') {
        $outDir = $arg;
    } elsif ($opt eq 'ver') {
        $dbVer = $arg;
        $dbVer =~ /^1\.0[23]$/ or die "Unsupported version number $dbVer\n";
        # the feature code table depends on the version, so it must be rebuilt
        # here (it was initialized for the default version before parsing)
        @featureCodes = $dbVer eq '1.02' ? @fc102 : @fc103;
        my $n = 0;
        %featureCodes = map { $_ => $n++ } @featureCodes;
    } elsif ($opt eq 'v') {
        $verbose = 1;
    } elsif ($opt eq 'h') {
        # wrap the long default lists for display
        my $defcp = $defaults{def_codesp};
        $defcp =~ s/(PPLG,)/\n $1/;
        my $defLang = join ',', @languages;
        $defLang =~ s/(ja,)/\n $1/;
        print <<"END";
Description: Build ExifTool Geolocation database.
Syntax: build_geolocation [OPTIONS] [DBFILE] ...
Options:
DBFILE - Input database file name or directory. Multiple input database
files may be specified. The -p, -c and -cp options apply to
the database that comes after them on the command line.
Default is "$dbFile".
-p POP - Minimum population for cities to include. POP may be a number
or be of the form "CC[,C2...]=###" to set different limits for
specific countries/regions, where CC and C2 are country codes
with optional region name or code appended after a period (eg.
"CA.Ontario,US=500" sets the minimum population to 500 for
cities on Ontario Canada or the U.S.). If a region is
specified, either the full name or the geonames admin1 code may
be used, and case and spaces are not significant. May be
multiple -p options for each input DBFILE. Default is "$defaults{minpop}".
-c CODE - Feature codes to always include, regardless of population. CODE
is a comma-separated list of feature codes, with an optional
leading comma-separated list of country/region codes followed
by an equals sign to apply only to specific countries. The
feature-code list may begin with a dash to remove entries from
the default list, or a plus sign to add entries. May be
multiple -c options for each intput DBFILE. Country/region and
feature names are case insensitive. Default is "$defaults{def_codes}".
-cp CODE - Additional features to include if above minimum population.
Default is "$defcp".
-l LANG - Alternate languages to read from $altNamesFile if
available. These are used to generate $outAltNames an the
$geoLang files. LANG is a comma-separated list of language
codes, starting with a dash to remove items from the default
list. May be set to an empty string to disable generation
of alternate language files even if $altNamesFile
exists. The same set of languages applies to all input
database files. Default is "$defLang".
-o OUTDIR - Output directory name. Default is the same directory as the
first input database file. A directory named $outDirName
containing the output files will be created in this directory.
-ver VER - Version for output geolocation database (default is $dbVer).
-v - Verbose messages.
-h - Show this help.
Input files (download from https://download.geonames.org/export/dump/):
$dbFile - default database file (smaller files with names
like "cities###.txt" may be specified instead)
$countryFile - mandatory country names file
$regionFile - mandatory region names file
$admin2File - mandatory subregion names file
$featureFile - optional feature codes file
$altNamesFile - optional alternate names file (must exist to
to generate $outAltNames and $geoLang files)
Output files:
$outDirName - default output directory name
$outFile - ExifTool database file
$outAltNames - alternate names file
$geoLang - directory for alternate language files
Author:
Copyright 2024, Phil Harvey
This is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.
END
        exit 0;
    } else {
        die "Unknown option '-$opt'\n";
    }
}
if (not @dbfiles) {
    # no database given, so fall back to the default file name
    my %entry = ( %defaults, %$opts );
    push @dbfiles, \%entry;
    my $name = $entry{file};
    if (not -e $name) {
        # not in the current directory, so also try the script directory
        my ($scriptDir) = $0 =~ m{(.*)/};
        if (defined $scriptDir and -e "$scriptDir/$name") {
            $entry{file} = "$scriptDir/$name";
        } else {
            die qq(Database "$name" not found. Use -h option for help.\n);
        }
    }
} else {
    # options given after the last database file apply to that file
    my $last = $dbfiles[-1];
    foreach my $name (keys %$opts) {
        $$last{$name} = $$opts{$name};
    }
}
# the working directory is the directory of the first database file
# ('.' if the file name contains no directory)
my $dbdir = $dbfiles[0]{file};
$dbdir =~ s{/[^/]*$}{} or $dbdir = '.';

# give each database the default any-country ('??') feature lists
# unless the user provided their own
for my $db (@dbfiles) {
    for my $suffix ('', 'p') {
        my $lookup = $$db{"keep$suffix"} ||= { };
        next if $$lookup{'??'};
        my %codes;
        $codes{$_} = 1 foreach split /,/, $defaults{"def_codes$suffix"};
        $$lookup{'??'} = \%codes;
    }
}
# resolve the regions named in the -p/-c/-cp options
# (each matching %needRgn entry is replaced by [ region code, "CC.Region Name" ])
if (%needRgn) {
    open REGION, '<', "$dbdir/$regionFile" or die "Error opening $dbdir/$regionFile\n";
    while (my $line = <REGION>) {
        my ($code, $name) = split /\t/, $line;
        my ($cc) = split /\./, $code;
        next unless $needRgn{$cc};
        # try the region code first, then the full name, then the name without spaces
        my $key = $code;
        if (not $needRgn{$key}) {
            $key = $cc . '.' . uc $name;
            if (not $needRgn{$key}) {
                $key =~ tr/ //d;
                $needRgn{$key} or next;
            }
        }
        $needRgn{$key} = [ $code, "$cc.$name" ];
    }
    close REGION;
    # every region that was asked for must have been found
    foreach my $wanted (sort keys %needRgn) {
        next if length($wanted) == 2;   # (plain country code)
        ref $needRgn{$wanted} or die "No matching region for $wanted\n";
    }
}
# in verbose mode, show the settings that will be used for each database
# (region keys are shown as "CC.Region Name" when the region was resolved)
if ($verbose) {
    my $langs = join ',', @languages;
    $langs or $langs = '<none>';
    print "Languages to read from input database(s):\n $langs\n";
    foreach $dbfile (@dbfiles) {
        print "Parameters for reading $$dbfile{file}:\n";
        print " Minimum populations (??=any country):\n";
        print " ??=$$dbfile{minpop}\n";
        foreach (reverse sort keys %{$$dbfile{cc_minpop}}) {
            my $cc = ref $needRgn{$_} ? $needRgn{$_}[1] : $_;
            print " $cc=$$dbfile{cc_minpop}{$_}\n";
        }
        # the two feature lists are printed the same way
        my @sections = (
            [ keep  => " Features to keep regardless of population:\n" ],
            [ keepp => " Features to keep for population >= minimum:\n" ],
        );
        foreach my $section (@sections) {
            my ($type, $title) = @$section;
            print $title;
            foreach (reverse sort keys %{$$dbfile{$type}}) {
                # (use the resolved region name here for both lists)
                my $cc = ref $needRgn{$_} ? $needRgn{$_}[1] : $_;
                print " $cc=",join(',', sort keys %{$$dbfile{$type}{$_}}), "\n";
            }
        }
    }
}
# translate option region arguments to region codes
# (the database pre-scan looks these up with a key of "$cc$rgn", ie. country
#  code followed immediately by the admin1 code, so the "." separator of the
#  code read from the region file must be removed or the region-specific
#  settings would never be found)
foreach $dbfile (@dbfiles) {
    foreach my $type (qw(cc_minpop keep keepp)) {
        my @names = keys %{$$dbfile{$type}};
        foreach my $name (@names) {
            next unless ref $needRgn{$name};    # (not a region, or not resolved)
            (my $code = $needRgn{$name}[0]) =~ tr/.//d;
            my $tmp = delete $$dbfile{$type}{$name};
            $$dbfile{$type}{$code} = $tmp;
        }
    }
}
# default output directory is inside the working directory
defined $outDir or $outDir = "$dbdir/$outDirName";
unless (-d $outDir) {
    mkdir($outDir, 0777) or die "Error creating output directory '$outDir'\n";
}
# all of the mandatory name files must exist in the working directory
foreach my $required ($countryFile, $regionFile, $admin2File) {
    die "Missing input file $dbdir/$required\n" unless -e "$dbdir/$required";
}

# order of country codes, region names and subregions in database
my (%orderCC, %orderRgn, %orderSub);
# languages to read from geonames database (converted to lower case)
my %languages;
$languages{$_} = 1 foreach @languages;
# language codes supported by ExifTool
my @supportedLangs = qw(cs de en-ca en-gb es fi fr it ja ko nl pl ru sk sv tr zh-cn zh-tw);
# supported country-specific languages
my %ccLang = ( TW => 'zh', CN => 'zh', CA => 'en', GB => 'en' );
# lookups filled in while reading the input files
my (%lang, %featureLang, %haveCountry, %cityFlags, %rgnFlags, %subFlags, %ccFlags, %flags);
my (%haveRegion, %haveSubRgn, $filesize, $percent);
#------------------------------------------------------------------------------
# Get the size of an open file and rewind it to the start
# Inputs: 0) file handle (or glob reference)
# Returns: file position at the end of the file; dies if a seek fails
sub GetFileSize($)
{
    my ($fh) = @_;
    seek($fh, 0, 2) or die "Seek error\n";  # (whence 2 = relative to end of file)
    my $size = tell($fh);
    seek($fh, 0, 0) or die "Seek error\n";  # back to the start for the caller
    return $size;
}
# first pass through each database: decide which entries to keep, and record
# the countries, regions, subregions and feature codes that they use
foreach $dbfile (@dbfiles) {
    my $database = $$dbfile{file};
    my $upgraded;       # set if the output version had to be raised to 1.03
    print "Reading $database... 0%";
    flush STDOUT;
    # pre-read the files to initialize necessary variables
    open INFILE, '<', $database or die "Error opening $database\n";
    $filesize = GetFileSize(\*INFILE);
    open OUTFILE, '>', "$outDir/$outFile" or die "Error creating $outFile in $outDir\n";
    binmode(OUTFILE);
    my $kept = $$dbfile{kept} = [ ];
    $percent = -1;
    while (my $line = <INFILE>) {
        # update the progress display when the percentage changes
        my $pct = int(100 * tell(INFILE) / $filesize + 0.5);
        if ($pct != $percent) {
            $percent = $pct;
            printf("\b\b\b\b%3d%%", $percent);
            flush STDOUT;
        }
        my @items = split /\t/, $line;
        next unless @items > 17;
        my ($dbnum, $code, $cc, $rgn, $sub, $pop) = @items[0,7,8,10,11,14];
        next unless $cc =~ /^[A-Z]{2}$/;

        # region-specific settings apply only to countries named with a region
        my $rgnKey = $needRgn{$cc} ? "$cc$rgn" : undef;

        # minimum population: region setting, then country, then the default
        my $minpop = (defined($rgnKey) and defined($$dbfile{cc_minpop}{$rgnKey}))
                   ? $$dbfile{cc_minpop}{$rgnKey}
                   : defined($$dbfile{cc_minpop}{$cc})
                   ? $$dbfile{cc_minpop}{$cc}
                   : $$dbfile{minpop};

        # features to keep regardless of population (most specific list wins)
        my $always = $$dbfile{keep};
        my $alwaysCodes = (defined($rgnKey) && $$always{$rgnKey}) || $$always{$cc} || $$always{'??'};
        if ($pop < $minpop) {
            next unless $$alwaysCodes{$code};
        } else {
            # features to keep when the population is large enough
            my $ifBig = $$dbfile{keepp};
            my $bigCodes = (defined($rgnKey) && $$ifBig{$rgnKey}) || $$ifBig{$cc} || $$ifBig{'??'};
            next unless $$bigCodes{$code};
        }

        # this entry will be in the output
        push @$kept, $line;
        $lang{$dbnum} = { alt => [ ] };
        $haveCountry{$cc} = 1;
        $haveRegion{"$cc$rgn"} = 1;
        $haveSubRgn{"$cc$rgn.$sub"} = 1;

        # add new feature codes (up to maximum index of 0x3f)
        next if $featureCodes{$code} or @featureCodes > 0x3f;
        if ($dbVer eq '1.02') {
            next if $code =~ /^(PPLH|PPLQ|PPLW)$/; # (stored as "Other" in v1.02)
            # v1.02 has a fixed feature list, so switch to v1.03
            $dbVer = '1.03';
            $upgraded = 1;  # print upgrade warning
            @featureCodes = @fc103;
            %featureCodes = ();
            @featureCodes{@featureCodes} = (0 .. $#featureCodes);
            next if $featureCodes{$code};
        }
        push @featureCodes, $code;
        $featureCodes{$code} = $#featureCodes;
    }
    close INFILE;
    print "\b\b\b\bDone.\n";
    if ($upgraded) {
        warn "Some feature codes not supported by version 1.02, writing as 1.03 instead.\n";
    }
}
# read the names of the feature codes that are in use (optional input file)
if (not open INFILE, '<', "$dbdir/$featureFile") {
    print "Not found: $dbdir/$featureFile\n";
    print "--> Not storing feature type names\n";
} else {
    print "Reading $dbdir/$featureFile\n";
    while (my $line = <INFILE>) {
        my ($code, $desc) = split /\t/, $line;
        $code =~ s/^.\.//;      # remove feature group and "." separator
        next unless $featureCodes{$code};
        # capitalize the first letter of each word of the name
        (my $name = ucfirst $desc) =~ s/ ([a-z])/ \U$1/g;
        $featureNames{$code} = $name;
    }
    close INFILE;
}
# determine the order of the countries in the output database
# (countries keep the order of the country file, numbered from 0)
$i = 0;
open INFILE, '<', "$dbdir/$countryFile" or die "Error opening $dbdir/$countryFile\n";
print "Reading $dbdir/$countryFile\n";
while (my $line = <INFILE>) {
    next if $line =~ /^#/;
    my ($cc, $geoID) = (split /\t/, $line)[0,16];
    next unless $haveCountry{$cc};
    $lang{$geoID} = { alt => [ ] };     # reference lookup by db number
    $orderCC{$cc} = $i++;               # (entry 0 is the first country)
}
close INFILE;
if ($verbose) {
    printf " %.6d countries (0x%.4x)\n",$i,$i;
}
# the country index must fit in one byte (no default 0 entry)
$i > 0x100 and die "Too many countries!\n";
# determine the order of the regions (admin1) in the output database
# (numbered from 1 because entry 0 is the default "" region)
$i = 0;
open REGION, '<', "$dbdir/$regionFile" or die "Error opening $dbdir/$regionFile\n";
print "Reading $dbdir/$regionFile\n";
while (my $line = <REGION>) {
    chomp $line;
    my ($code, $geoID) = (split /\t/, $line)[0,3];
    $code =~ tr/.//d;                   # (remove "." separator)
    next unless $haveRegion{$code};
    $lang{$geoID} = { alt => [ ] };     # reference lookup by db number
    $orderRgn{$code} = ++$i;
}
close REGION;
if ($verbose) {
    printf " %.6d regions (0x%.4x)\n",$i,$i;
}
# the region index must fit in 12 bits (account for default 0 entry)
$i > 0x0fff and die "Too many regions!\n";
# determine the order of the subregions (admin2) in the output database
# (numbered from 1 because entry 0 is the default "" subregion)
$i = 0;
open ADMIN2, '<', "$dbdir/$admin2File" or die "Error opening $dbdir/$admin2File\n";
print "Reading $dbdir/$admin2File\n";
while (my $line = <ADMIN2>) {
    chomp $line;
    my ($code, $geoID) = (split /\t/, $line)[0,3];
    $code =~ s/\.//;                    # (remove first "." separator)
    next unless $haveSubRgn{$code};
    $lang{$geoID} = { alt => [ ] };     # reference lookup by db number
    $orderSub{$code} = ++$i;
}
close ADMIN2;
if ($verbose) {
    printf " %.6d subregions (0x%.4x)\n",$i,$i;
}
# 16 bits are the absolute limit for the subregion index
$i > 0xffff and die "Too many subregions!\n";
# v1.02 has only 15 bits available, so switch to v1.03 if more are needed
if ($dbVer eq '1.02' and $i > 0x7fff) {
    $dbVer = '1.03';
    warn "Too many subregions for version 1.02, writing as 1.03 instead.\n";
}
# read alternate names file if available
# - only geoID's which already have an entry in %lang are considered
# - every accepted name is added to the "alt" list of the %lang entry, and the
#   best name for each language is stored in the %lang entry under the
#   lower-case language code
# - if the file can't be opened (or languages are disabled), $noLang is set
#   and no alternate language output is generated
if (not $noLang and open INFILE, '<', "$dbdir/$altNamesFile") {
$filesize = GetFileSize(\*INFILE);
print "Reading $dbdir/$altNamesFile... 0%";
# priority of the name currently stored for each language and geoID
my %bestPri;
while (<INFILE>) {
# update the progress display when the percentage changes
my $p = int(100 * tell(INFILE) / $filesize + 0.5);
if ($percent != $p) {
printf("\b\b\b\b%3d%%", $percent = $p);
flush STDOUT;
}
# items: 0=altID,1=geoID,2=lang,3=alt name,4=preferred,5=short,6=colloquial,7=historic
my @items = split /\t/;
# ignore names for features that we aren't keeping
my $lkup = $lang{$items[1]} or next;
my $altList = $lang{$items[1]}{alt};
my $lng = lc $items[2];
# ignore names in languages that weren't requested
# (names without a language code are still accepted)
next if $lng and not $languages{$lng};
# add to the list of alternate names unless already there (ignoring case)
push @$altList, $items[3] unless grep /^\Q$items[3]\E$/i, @$altList;
# names without a language code are used only as alternate names
next unless $lng;
my $flags = 0;
# keep only the best translation for this name for each language
# (flag bits: 0x01=preferred, 0x02=short, 0x04=colloquial, 0x08=historic;
#  the flags of all names for a geoID are combined in %flags)
$items[$_] and $flags |= (1<<($_-4)) foreach 4,5,6,7;
$flags{$items[1]} = ( $flags{$items[1]} || 0 ) | $flags;
next if $items[6] or $items[7]; # ignore colloquial and historic names
my $pri = $items[5] ? 0 : ($items[4] ? 1 : 2); # priority for best type of name
my $langPri = $bestPri{$lng};
$langPri or $langPri = $bestPri{$lng} = { };
# skip this name if the stored priority is numerically greater than this one
# (a stored priority of 0 is false, so it never prevents replacement)
# NOTE(review): as written, a name that is neither short nor preferred (2)
# wins over a preferred (1) or short (0) name -- confirm this is intended
next if $$langPri{$items[1]} and $$langPri{$items[1]} > $pri;
$$langPri{$items[1]} = $pri;
# save language-specific name for this feature, removing commas
($$lkup{$lng} = $items[3]) =~ tr/,//d;
}
print "\b\b\b\bDone.\n";
close INFILE;
# read alternate feature names
# (from files named like the English feature file, but with "_en." replaced
#  by the language code; languages without such a file are skipped)
if (%featureNames) {
my $lng;
foreach $lng (@languages) {
next if $lng eq 'en' or not $languages{$lng};
my $file = "$dbdir/$featureFile";
$file =~ s/_en\./_$lng./ or next;
next unless open INFILE, '<', $file;
print "Reading $file\n";
while (<INFILE>) {
my @items = split /\t/;
$items[0] =~ s/^.\.//; # remove feature group and "." separator
# only translate features which have an English name
next unless $featureNames{$items[0]};
# decode as UTF-8 so the first character of each word is capitalized properly
utf8::decode($items[1]);
my $name = ucfirst $items[1];
$name =~ s/ (.)/ \U$1/g;
# change $name back to byte string
# (uses Encode if it can be loaded, otherwise falls back to pack/unpack)
if ($] >= 5.006 and (eval { require Encode; Encode::is_utf8($name) } or $@)) {
$name = $@ ? pack('C*',unpack($] < 5.010000 ? 'U0C*' : 'C0C*',$name)) : Encode::encode('utf8',$name);
}
# don't store a translation that is the same as the English name
next if $name eq $featureNames{$items[0]};
$featureLang{$lng}{$items[0]} = $name;
}
close INFILE;
}
}
} else {
# (no message about the missing file if languages were disabled by the user)
print "Not found: $dbdir/$altNamesFile\n--> " unless $noLang;
print "Not writing alternate languages\n";
$noLang = 1;
}
# second pass: reduce the kept entries to one city per packed coordinate, and
# collect the translated city names for the language files
# %coords key = 5-byte packed lat/lon, value = [ city information ]
# %langLookups = translations by language, then by city name
my (%coords, %langLookups);
foreach $dbfile (@dbfiles) {
    print "Processing database entries... 0%";
    my $kept = $$dbfile{kept};
    my $total = scalar @$kept;
    my $done = 0;
    foreach my $line (@$kept) {
        my @items = split /\t/, $line;
        # convert latitude and longitude to 20-bit integers and pack them
        my $lat = int(($items[4] + 90) / 180 * 0x100000 + 0.5) & 0xfffff;
        my $lon = int(($items[5] + 180) / 360 * 0x100000 + 0.5) & 0xfffff;
        my $coord = pack('nCn', $lat >> 4, (($lat & 0x0f) << 4) | ($lon & 0x0f), $lon >> 4);
        # take the city with the highest population if there are
        # multiple cities with the same reduced coordinates
        next if $coords{$coord} and $coords{$coord}[6] >= $items[14];

        my $lkup = $lang{$items[0]};    # 0=geoID
        my $altList;
        die "Internal error\n" unless $lkup and $altList = $$lkup{alt};
        # alternate names become a sorted comma-separated list (commas removed from names)
        my $alt = '';
        if (@$altList) {
            tr/,//d foreach @$altList;
            $alt = join ',', sort @$altList;
        }
        # coords=(0.lat,1.lon,2.city,3.cc,4.rgn,5.admin2,6.population,7.timezone,8.feature code,9.alt names)
        $coords{$coord} = [ @items[4,5,1,8,10,11,14,17,7], $alt ];

        my $key = $items[1];            # 1=city
        $lkup or die "Missing language for geoID $items[0]\n";
        # count the cities having each combination of name flags
        my $cityFlag = $flags{$items[0]};
        $cityFlags{$cityFlag}++ if defined $cityFlag;

        # add country suffix for a language in this country (eg. zh-cn)
        # if there isn't already a country-specific name
        my $baseLang = $ccLang{$items[8]};
        if ($baseLang and $$lkup{$baseLang}) {
            my $ccName = $baseLang . '-' . lc($items[8]);
            $$lkup{$ccName} or $$lkup{$ccName} = $$lkup{$baseLang};
        }
        foreach my $lng (@supportedLangs) {
            my $xlat = $$lkup{$lng};
            next unless $xlat and $xlat ne $key;    # (ignore if same)
            push @{$langLookups{$lng}{$key}}, "$items[8]$items[10].$items[11],$xlat";
        }
        # update the progress display (counts only entries that were stored)
        my $pct = int(100 * ++$done / $total + 0.5);
        if ($percent != $pct) {
            $percent = $pct;
            printf("\b\b\b\b%3d%%", $percent);
            flush STDOUT;
        }
    }
    print "\b\b\b\bDone.\n";
}
# write city database
# (writes the header and city entries of Geolocation.dat, and the alternate
#  names of each city to AltNames.dat unless languages are disabled)
my $str = $noLang ? '' : " and $outAltNames";
my @t = localtime;
my $date = sprintf('%.4d-%.2d-%.2d', $t[5]+1900, $t[4]+1, $t[3]);
print "Writing $outDir/$outFile (version $dbVer)$str...\n";
print OUTFILE "Geolocation$dbVer\t",scalar(keys %coords),"\n";
print OUTFILE "# $date Cities with population $dbfiles[0]{minpop} or greater from geonames.org with a Creative Commons license\n";
if ($noLang) {
    unlink "$outDir/$outAltNames";
} else {
    # (3-argument open so the file name can't be taken as a mode, and die on
    #  failure instead of silently printing to an unopened handle)
    open ALTOUT, '>', "$outDir/$outAltNames" or die "Error creating $outAltNames in $outDir\n";
    binmode ALTOUT;
}
my (%tz, @tz, %fcodes);     # time zone index lookup, time zone list, feature code counts
my $tzNum = 0;
# (entries are written in order of their packed coordinates)
foreach (sort { $a cmp $b } keys %coords) {
    my $items = $coords{$_};
    # @$items=(0.lat,1.lon,2.city,3.cc,4.rgn,5.admin2,6.population,7.timezone,8.feature code,9.alt names)
    my $iCC = $orderCC{$$items[3]};
    defined $iCC or warn("Unknown country code $$items[3]\n"), next;
    my $iRgn = $orderRgn{"$$items[3]$$items[4]"} || 0;
    my $iSub = $orderSub{"$$items[3]$$items[4].$$items[5]"} || 0;
    my $tn = $tz{$$items[7]};
    # (must test for defined because the first time zone has an index of 0,
    #  otherwise it would be added to the list a second time)
    unless (defined $tn) {
        push @tz, $$items[7];
        $tn = $tz{$$items[7]} = $tzNum++;
    }
    # convert population to our binary format
    # Note: format in ActivePerl is "3.1e+004", but "3.1e+04" in other Perls,
    # but other Perls will round 34500 to "3.4e+04", so add 1 to get "3.5e+04"
    $$items[6] += 1 if $$items[6] > 100 and not $$items[6] % 10;
    my $pop = sprintf('%.1e',$$items[6]);
    # pack CC, population and region index into a 32-bit integer
    # (uses the last digit of the exponent, and the 2 digits of the mantissa)
    my $code = ($iCC << 24) | (substr($pop,-1,1)<<20) | (substr($pop,0,1)<<16) | (substr($pop,2,1)<<12) | $iRgn;
    $fcodes{$$items[8]} = ($fcodes{$$items[8]} || 0) + 1;
    my $fc = $featureCodes{$$items[8]} || 0;
    # store high bit of timezone index
    if ($tn > 255) {
        if ($dbVer eq '1.02') {
            $iSub |= 0x8000;    # (v1.02 uses the high bit of the subregion index)
        } else {
            $fc |= 0x80;        # (v1.03 uses the high bit of the feature code byte)
        }
        $tn -= 256;
    }
    my $pt = pack('NnCC', $code, $iSub, $tn, $fc);
    $$items[2] =~ tr/,//d;  # remove any commas
    print OUTFILE "$_$pt$$items[2]\n";
    next if $noLang;
    # alternate names: one per line, with a null terminator for each city
    $$items[9] =~ tr/,/\n/;
    print ALTOUT $$items[9],"\0";
}
my $altSize = 0;
unless ($noLang) {
    $altSize = tell ALTOUT;
    close ALTOUT;
}
print OUTFILE "\0\0\0\0\x01\n"; # section terminator
die "Too many time zones!\n" if $tzNum > 0x01ff;
if ($verbose) {
    $i = 0;
    print "Features kept:\n";
    foreach (sort keys %fcodes) {
        my $fc = $featureCodes{$_} || 0;
        printf "%6d (%2d) %s\n", $fcodes{$_}, $fc, $_;
    }
}
# write country codes
# (also collects the translated country names for the language files)
open COUNTRY, '<', "$dbdir/$countryFile" or die "Error opening $dbdir/$countryFile\n";
my %cc;     # lookup for country code based on country name as written to the database
while (<COUNTRY>) {
    next if /^#/;
    my @items = split /\t/;
    #items: 0=country code, 4=name, 16=geoID
    next unless $haveCountry{$items[0]};
    die "country code error\n" if length $items[0] != 2;
    $items[4] =~ tr/,//d;   # remove any commas
    # (must be keyed by the name without commas because this is the name
    #  used as the key for the translations below)
    $cc{$items[4]} = $items[0];
    print OUTFILE "$items[0]$items[4]\n";
    if ($lang{$items[16]}) {    # (16=geoID)
        my $lkup = $lang{$items[16]};
        my $key = $items[4];    # country name
        $ccFlags{$flags{$items[16]}} = ($ccFlags{$flags{$items[16]}} || 0) + 1 if defined $flags{$items[16]};
        foreach (@supportedLangs) {
            next unless $$lkup{$_} and $$lkup{$_} ne $key;  # (ignore if same)
            $langLookups{$_}{$key} or $langLookups{$_}{$key} = [ ];
            # (empty code before the comma indicates a country name)
            push @{$langLookups{$_}{$key}}, ",$$lkup{$_}";
        }
    }
}
close COUNTRY;
print OUTFILE "\0\0\0\0\x02\n"; # section terminator
# write regions
# (also collects the translated region names for the language files)
print OUTFILE "\n"; # (null region)
open REGION, '<', "$dbdir/$regionFile" or die "Error opening $dbdir/$regionFile\n";
my %region;     # lookup for region name (as written to the database) based on region code
while (<REGION>) {
    chomp;
    my @items = split /\t/;
    #items: 0=region code, 1=name, 2=ascii, 3=geoID
    $items[0] =~ tr/.//d;   # (remove "." separator)
    next unless $haveRegion{$items[0]};
    $items[1] =~ tr/,//d;   # remove any commas
    # (save the name without commas because it is used to build the
    #  comma-separated keys of the language files, and must be the same
    #  as the name written to the database)
    $region{$items[0]} = $items[1];
    print OUTFILE "$items[1]\n";
    if ($lang{$items[3]}) { # (3=geoID)
        my $lkup = $lang{$items[3]};
        my $key = $items[1];    # region name
        my $cc = substr($items[0], 0, 2);
        $rgnFlags{$flags{$items[3]}} = ($rgnFlags{$flags{$items[3]}} || 0) + 1 if defined $flags{$items[3]};
        foreach (@supportedLangs) {
            next unless $$lkup{$_} and $$lkup{$_} ne $key;  # (ignore if same)
            $langLookups{$_}{$key} or $langLookups{$_}{$key} = [ ];
            push @{$langLookups{$_}{$key}}, "$cc,$$lkup{$_}";
        }
    }
}
close REGION;
print OUTFILE "\0\0\0\0\x03\n"; # section terminator
# write subregions
# (also collects the translated subregion names for the language files)
print OUTFILE "\n"; # (null admin2)
open ADMIN2, '<', "$dbdir/$admin2File" or die "Error opening $dbdir/$admin2File\n";
my %subregion;  # lookup for subregion name (as written to the database) based on admin2 code
while (<ADMIN2>) {
    chomp;
    my @items = split /\t/;
    #items: 0=admin2 code, 1=name, 2=ascii, 3=geoID
    $items[0] =~ s/\.//;    # (remove first "." separator)
    next unless $haveSubRgn{$items[0]};
    $items[1] =~ tr/,//d;   # remove any commas
    # (save the name without commas because it is used to build the
    #  comma-separated keys of the language files, and must be the same
    #  as the name written to the database)
    $subregion{$items[0]} = $items[1];
    print OUTFILE "$items[1]\n";
    if ($lang{$items[3]}) { # (3=geoID)
        my $lkup = $lang{$items[3]};
        my $key = $items[1];    # subregion name
        $subFlags{$flags{$items[3]}} = ($subFlags{$flags{$items[3]}} || 0) + 1 if defined $flags{$items[3]};
        my $rc = $items[0];
        $rc =~ s/\..*//;    # (remove subregion code)
        foreach (@supportedLangs) {
            next unless $$lkup{$_} and $$lkup{$_} ne $key;  # (ignore if same)
            $langLookups{$_}{$key} or $langLookups{$_}{$key} = [ ];
            push @{$langLookups{$_}{$key}}, "$rc,$$lkup{$_}";
        }
    }
}
close ADMIN2;
print OUTFILE "\0\0\0\0\x04\n"; # section terminator
# write the time zone names (one per line, in index order)
print OUTFILE map { "$_\n" } @tz;
print OUTFILE "\0\0\0\0\x05\n"; # section terminator
# write the feature codes, each followed by a space and its name if known
foreach my $fcode (@featureCodes) {
    my $entry = $fcode;
    $entry .= " $featureNames{$fcode}" if $featureNames{$fcode};
    print OUTFILE "$entry\n";
}
# write terminator and close Geolocation.dat
print OUTFILE "\0\0\0\0\0\n";   # file terminator
my $outSize = tell OUTFILE;     # (total size for the summary)
close OUTFILE;
# write language lookups
# (one GeoLang/<lang>.pm file for each language that has translations, each
#  containing a hash of translations for city/region/country names and
#  feature types; $langSize accumulates the total size of these files)
my $langSize = 0;
my $langDir = "$outDir/$geoLang";
# delete existing languages
unlink <"$langDir/*.pm">;
if ($noLang) {
    rmdir $langDir;
} else {
    my $n = scalar(keys %langLookups);
    print "Writing $n language files to $outDir/$geoLang...\n";
    mkdir $langDir, 0777;   # (fails harmlessly if the directory already exists)
    my ($lng, $key, $nm, $alt);
    foreach $lng (sort keys %langLookups) { # ($lng = language code)
        my $myLng = $lng;
        $myLng =~ tr/-/_/;
        my $lkup = $langLookups{$lng};
        my $file = "$myLng.pm";
        open OUT, '>', "$langDir/$file" or die "Error creating $file\n";
        binmode OUT;
        print OUT "# Geolocation language translations for $myLng\n";
        print OUT "#\n# Based on Creative Commons database from geonames.org\n\n";
        print OUT "%Image::ExifTool::GeoLang::${myLng}::Translate = (\n";
        foreach $key (sort keys %$lkup) {
            ($nm = $key) =~ s/'/\\'/g;
            # count entries and use the most common one, then add others with country+region ID's
            # (entries in @$li are of the form: City:"CCRc,Sc,Alt", Sub:"CCRc,Alt", Rgn:"CC,Alt", Country:",Alt")
            # (Rc = region code, Sc = subregion code)
            my $li = $$lkup{$key};
            my %count;
            # sort by popularity of alternate name
            foreach (@$li) {
                my $val = $_;
                $val =~ s/.*?,//;
                $count{$val} = ($count{$val} || 0) + 1;
            }
            my @order = sort { $count{$b} <=> $count{$a} or length($a) <=> length($b) or $a cmp $b } keys %count;
            my $first = 1;
            foreach $alt (@order) {
                foreach (sort @$li) {
                    my ($code,$val) = split ',', $_, 2;
                    # ($code will be empty for a country name, and 2 characters for a region name,
                    # and contain a "." for a city name)
                    next unless $val eq $alt; # don't add if alternate name is the same
                    die "Backslash in translated name" if $val =~ /\\/;
                    $val =~ s/'/\\'/g; # escape single quotes
                    if ($first and $val !~ /\(/) { # (don't add general translation if name is qualified with brackets)
                        print OUT "\t'$nm' => '$val',\n";
                        undef $first;
                        last;
                    }
                    # format for keys in language table
                    # City: "CCRgn,Subregion,City", "CCRgn,,City", "CC,City", ",City"
                    # Subregion: "CCRgn,Subregion,", "CCRgn,,"
                    # Region: "CCRgn,"
                    # Country: "CC,"
                    # Any: "Name"
                    if (not $code) {
                        # this is a country
                        $code = $cc{$key};
                        # (must use print, not printf, because the translated
                        #  name is data and may contain a "%" character)
                        print OUT "\t'$code,' => '$val',\n";
                    } elsif ($code !~ /\./) {
                        # this is a region or subregion
                        print OUT "\t'$code$nm,' => '$val',\n";
                    } else {
                        # this is a city
                        # use region/subregions name instead of code
                        my $sub = $subregion{$code} || '';
                        $sub =~ s/'/\\'/g;
                        $code =~ s/\..*//;
                        $code = substr($code,0,2) . $region{$code} if $region{$code};
                        $code =~ s/'/\\'/g;
                        print OUT "\t'$code,$sub,$nm' => '$val',\n";
                    }
                }
            }
        }
        if ($featureLang{$lng}) {
            print OUT "\n\t# feature types\n";
            foreach (sort keys %{$featureLang{$lng}}) {
                my $ftype = $featureLang{$lng}{$_};
                $ftype =~ s/'/\\'/g;
                print OUT "\t$_ => '$ftype',\n"
            }
        }
        print OUT ");\n\n1; #end\n";
        $langSize += tell OUT;
        close OUT;
    }
    if ($verbose) {
        # print the number of names of each type having each name flag
        # (keys of the flag hashes are the combined flag bits, and the values
        #  are the number of names with this combination)
        my @type = ( City => \%cityFlags, Region => \%rgnFlags, Subregion => \%subFlags, Country => \%ccFlags );
        my @label = qw(preferred short colloquial historic);    # (labels for flag bits 0-3)
        while (@type) {
            my ($type, $flags) = splice @type, 0, 2;
            print "$type flags:\n";
            printf(" 0x%.2x - %d\n", 0, $$flags{0} || 0);
            foreach my $bit (0..$#label) {
                # add up the counts of all flag combinations which include this bit
                my $n = 0;
                $_ & (1<<$bit) and $n += $$flags{$_} foreach keys %$flags;
                # (label is indexed by bit number so that it stays correct
                #  when the line for a lower bit isn't printed)
                printf(" 0x%.2x - %d (%s)\n", (1<<$bit), $n, $label[$bit]) if $n;
            }
        }
    }
}
# summarize the sizes of the files that were written
# (the alternate-name and language lines appear only if these files exist)
print "Output file size(s):\n";
printf "%8.2f MB %s (%d entries)\n", $outSize / 1e6, $outFile, scalar(keys %coords);
if ($altSize) {
    printf "%8.2f MB %s\n", $altSize / 1e6, $outAltNames;
}
if ($langSize) {
    printf "%8.2f MB %s/*.pm\n", $langSize / 1e6, $geoLang;
}
if ($altSize or $langSize) {
    printf "%8.2f MB Total\n", ($outSize + $altSize + $langSize) / 1e6;
}
# end