data/lib/us_geo_data/place.rb from bdurand/us_geo

data/lib/us_geo_data/place.rb
Summary

Maintainability

6 hrs
Test Coverage

Issues
# frozen_string_literal: true

module USGeoData
  class Place
    include Processor

    STATE_ABBREVIATIONS = {
      "Alabama" => "Ala.",
      "Arizona" => "Ariz.",
      "Arkansas" => "Ark.",
      "California" => "Calif.",
      "Colorado" => "Colo.",
      "Connecticut" => "Conn.",
      "Delaware" => "Del.",
      "Florida" => "Fla.",
      "Georgia" => "Ga.",
      "Illinois" => "Ill.",
      "Indiana" => "Ind.",
      "Kansas" => "Kan.",
      "Kentucky" => "Ky.",
      "Louisiana" => "La.",
      "Maryland" => "Md.",
      "Massachusetts" => "Mass.",
      "Michigan" => "Mich.",
      "Minnesota" => "Minn.",
      "Mississippi" => "Miss.",
      "Missouri" => "Mo.",
      "Montana" => "Mont.",
      "Nebraska" => "Neb.",
      "Nevada" => "Nev.",
      "New Hampshire" => "N.H.",
      "New Jersey" => "N.J.",
      "New Mexico" => "N.M.",
      "New York" => "N.Y.",
      "North Carolina" => "N.C.",
      "North Dakota" => "N.D.",
      "Oklahoma" => "Okla.",
      "Oregon" => "Ore.",
      "Pennsylvania" => "Pa.",
      "Rhode Island" => "R.I.",
      "South Carolina" => "S.C.",
      "South Dakota" => "S.D.",
      "Tennessee" => "Tenn.",
      "Vermont" => "Vt.",
      "Virginia" => "Va.",
      "Washington" => "Wash.",
      "West Virginia" => "W.Va.",
      "Wisconsin" => "Wis.",
      "Wyoming" => "Wyo."
    }.freeze

    STRIP_FROM_SHORT_NAME = [
      /\A(The )?City and Borough of /i,
      /\A(The )?Unified Government of /i,
      / unified government\b/i,
      /\A(The )?Consolidated Government of /i,
      /\A(The )?City and County of /i,
      /\A(The )?Metropolitan Government of /i,
      /\A(The )?City of /i,
      /\A(The )?Town of /i,
      /\A(The )?Township of /i,
      /\A(The )?Municipality of /i,
      /\A(The )?Village of /i,
      /\A(The )?Borough of /i,
      /\A(The )?County of /i,
      /\A(The )?Corporation of /i,
      /( Census)? Designated Place/i,
      / CDP/i,
      / \(historical\)/i,
      / \(balance\)/i,
      / Township/i,
      / Consolidated Government/i,
      / Metro Government/i,
      / Comunidad\z/i,
      / Zona Urbana\z/i
    ].freeze

    ABBREVIATIONS = {
      /\bCensus Designated Place\b/i => "CDP",
      /\b(The )?University\b/i => "Univ.",
      /\bInstitute\b/i => "Inst.",
      /\bCollege\b/i => "Coll."
    }.freeze

    def dump_csv(output)
      csv = CSV.new(output)
      csv << ["GEOID", "GNIS ID", "Name", "Short Name", "State", "County GEOID", "Urban Area GEOID", "FIPS Class", "Population", "Housing Units", "Land Area", "Water Area", "Latitude", "Longitude"]
      place_data.each_value do |data|
        unless data[:gnis_id] && data[:fips_class]
          puts "Missing data for place #{data[:geoid]} #{data[:name]}, #{data[:state]}: #{data.inspect}"
          next
        end
        csv << [
          data[:geoid],
          data[:gnis_id],
          abbr_name(data[:name], 60),
          short_name(data[:name]),
          data[:state],
          data[:county_geoid],
          data[:urban_area_geoid],
          data[:fips_class],
          data[:population],
          data[:housing_units],
          data[:land_area]&.round(3),
          data[:water_area]&.round(3),
          data[:lat],
          data[:lng]
        ]
      end
      output
    end

    def dump_counties_csv(output)
      csv = CSV.new(output)
      csv << ["Place GEOID", "County GEOID"]
      place_data.each_value do |place|
        place[:counties].each do |county_geoid|
          csv << [
            place[:geoid],
            county_geoid
          ]
        end
      end
      output
    end

    def place_data
      unless defined?(@place_data)
        places = {}

        gnis_places = gnis_place_mapping
        foreach(data_file(USGeoData::PLACE_GAZETTEER_FILE), col_sep: "\t") do |row|
          geoid = row["GEOID"]
          gnis_id ||= row["ANSICODE"].gsub(/\A0+/, "").to_i
          data = gnis_places[gnis_id].dup
          data ||= {
            name: row["NAME"],
            state: row["USPS"],
            gnis_id: gnis_id
          }
          data[:geoid] = geoid
          data[:land_area] = row["ALAND_SQMI"]&.to_f
          data[:water_area] = row["AWATER_SQMI"]&.to_f
          data[:lat] ||= row["INTPTLAT"]&.to_f
          data[:lng] ||= row["INTPTLONG"]&.to_f
          data[:counties] = [data[:county_geoid]].compact
          places[geoid] = data
        end

        add_demographics(places)
        add_counties(places)
        add_urban_areas(places)

        @place_data ||= places
      end
      @place_data
    end

    def short_name(name)
      short_name = name
      STRIP_FROM_SHORT_NAME.each do |pattern|
        short_name = short_name.sub(pattern, "")
      end

      short_name = abbr_name(short_name, 30)
      short_name = abbr_state(short_name) if short_name.size > 30
      short_name = short_name.split("-", 2).first if short_name.size > 30

      if short_name.size > 30
        raise "Short name for #{name} greather than 30 characters: #{short_name.inspect} (#{short_name.size} characters)"
      end

      short_name
    end

    private

    def add_demographics(places)
      demographics(data_file(USGeoData::PLACE_POPULATION_FILE)).each do |geoid, population|
        info = places[geoid]
        info[:population] = population if info
      end

      demographics(data_file(USGeoData::PLACE_HOUSING_UNITS_FILE)).each do |geoid, housing_units|
        info = places[geoid]
        info[:housing_units] = housing_units if info
      end
    end

    def add_counties(data)
      foreach(processed_file(Gnis::PLACE_COUNTIES_FILE), col_sep: ",") do |row|
        place_geoid = row["Place GEOID"]
        county_geoid = row["County GEOID"]

        place = data[place_geoid]
        next unless place

        place[:counties] << county_geoid unless place[:counties].include?(county_geoid)
      end
    end

    def add_urban_areas(places)
      overlaps = {}
      foreach(data_file(USGeoData::PLACE_URBAN_AREA_REL_FILE), col_sep: "|") do |row|
        urban_area_geoid = row["GEOID_UA_20"]
        place_geoid = row["GEOID_PLACE_20"]
        overlap_land_area = row["AREALAND_PART"].to_f * SQUARE_METERS_TO_MILES
        place_land_area = row["AREALAND_PLACE_20"].to_f * SQUARE_METERS_TO_MILES

        next unless urban_area_geoid && place_geoid && overlap_land_area > 0 && place_land_area > 0
        next unless places.include?(place_geoid)

        info = overlaps[place_geoid]
        unless info
          info = {}
          overlaps[place_geoid] = info
        end
        info[urban_area_geoid] = overlap_land_area / place_land_area
      end

      overlaps.each do |place_geoid, overlap|
        primary_urban_area = overlap.max_by { |_, percent| percent }.first
        places[place_geoid][:urban_area_geoid] = primary_urban_area
      end
    end

    def abbr_state(name)
      STATE_ABBREVIATIONS.each do |state, abbr|
        name = name.gsub(Regexp.new(state, Regexp::IGNORECASE), abbr)
      end
      name
    end

    def abbr_name(name, desired_length)
      ABBREVIATIONS.each do |pattern, replacement|
        break if name.size < desired_length
        name = name.gsub(pattern, replacement)
      end
      name
    end

    def gnis_place_mapping
      gnis_places = {}
      foreach(processed_file(Gnis::PLACES_FILE), col_sep: ",") do |row|
        gnis_id = row["GNIS ID"].to_i
        gnis_places[gnis_id] = {
          gnis_id: gnis_id,
          fips_class: row["FIPS Class"],
          name: row["Name"],
          state: row["State"],
          county_geoid: row["County GEOID"],
          lat: row["Latitude"].to_f,
          lng: row["Longitude"].to_f
        }
      end
      gnis_places
    end
  end
end