CartoDB/cartodb20

View on GitHub
services/twitter-search/lib/twitter-search/json_to_csv_converter.rb

Summary

Maintainability
C
1 day
Test Coverage
module CartoDB
  module TwitterSearch
    # Turns a list of tweet-like hashes (symbol keys) into CSV text.
    #
    # Every value is emitted as a double-quoted CSV field; missing values are
    # emitted as empty (unquoted) fields. Column order is:
    # INDIVIDUAL_FIELDS, GROUP_FIELDS (expanded through SUBFIELDS),
    # CARTODB_FIELDS and finally any caller supplied additional fields.
    class JSONToCSVConverter

      # Top-level item keys written as they are.
      INDIVIDUAL_FIELDS = [
        :link,
        :body,
        :objectType,
        :postedTime,
        :favoritesCount,
        :twitter_lang,
        :retweetCount
      ].freeze

      # Top-level item keys holding nested data. A group with an entry in
      # SUBFIELDS is expanded into one column per subfield; a group without
      # one is saved as a single JSON string column.
      GROUP_FIELDS = [
        :actor,
        :inReplyTo,
        :geo,
        :twitter_entities, # Save json string,
        :location
      ].freeze

      # Same as above but with fields inside a group field: these subfields
      # are dumped as JSON strings instead of being written as they are.
      SUBGROUP_FIELDS_TO_DUMP = {
        actor: [
          :links,     # links[0].href
          :location,  # May be a Twitter Place, with a displayName and objectType, or a simple String
          :languages  # languages[0]
        ].freeze,
        # if this gets renamed to the_geom, cartodb will import it as a bounding box
        location: [
          :geo
        ].freeze
      }.freeze

      # These fields will get dumped as field_subfield. If not present here will be saved as a stringified json
      SUBFIELDS = {
        actor: [
          :id,
          :displayName,
          :image,
          :summary,
          :postedTime,
          :location,
          :utcOffset,
          :preferredUsername,
          :friendsCount,
          :followersCount,
          :listedCount,
          :statusesCount,
          :verified
        ].freeze,
        inReplyTo: [
          :link
        ].freeze,
        location: [
          :geo,
          :name
        ].freeze
      }.freeze

      # Other fields with special behaviour we want to add
      CARTODB_FIELDS = [
        :the_geom
      ].freeze

      # Returns only the CSV header line (no data rows, no trailing newline).
      #
      # additional_fields - Hash whose keys become extra trailing columns.
      def generate_headers(additional_fields = {})
        process([], true, additional_fields)
      end

      # Builds the CSV text for input_data.
      #
      # input_data        - Enumerable of hashes with symbol keys.
      # add_headers       - When true the first line is the header row.
      # additional_fields - Hash; keys are appended to the header row and
      #                     values are appended (unchanged) to every data row.
      #
      # Returns a String with rows separated by "\n". input_data is not modified.
      #
      # Note: 'the_geom' will be added automatically, no need to add as additional field
      def process(input_data, add_headers = true, additional_fields = {})
        rows = []
        rows << header_row(additional_fields) if add_headers

        input_data.each do |item|
          rows << data_row(item, additional_fields)
        end

        rows.join("\n")
      end

      # Replaces the two-character sequences backslash+n and backslash+r
      # (escaped line breaks, not real control characters) with a space.
      #
      # INFO: This gets called before field-by-field parsing to speed up things
      def clean_string(contents)
        contents.gsub("\\n", ' ').gsub("\\r", ' ')
      end

      private

      # Builds the header line: one quoted column name per output column.
      def header_row(additional_fields)
        columns = INDIVIDUAL_FIELDS.map { |field| field_to_csv(field) }

        GROUP_FIELDS.each do |field|
          subfields = SUBFIELDS[field]
          if subfields.nil?
            columns << field_to_csv(field)
          else
            subfields.each do |subfield|
              columns << field_to_csv("#{field}_#{subfield}")
            end
          end
        end

        CARTODB_FIELDS.each do |field|
          columns << field_to_csv(field)
        end

        additional_fields.each do |key, _value|
          columns << field_to_csv(key)
        end

        columns.join(',')
      end

      # Builds one data line for item, in the same column order as header_row.
      def data_row(item, additional_fields)
        columns = INDIVIDUAL_FIELDS.map do |field|
          item[field].nil? ? nil : field_to_csv(item[field])
        end

        GROUP_FIELDS.each do |field|
          group = item[field]
          subfields = SUBFIELDS[field]

          # Group field has no subfields "defined"? then must be dumped
          if subfields.nil?
            columns << (group.nil? ? nil : field_to_csv(::JSON.dump(group)))
            next
          end

          # Go inside fields, repeat similar logic
          dumped_subfields = SUBGROUP_FIELDS_TO_DUMP[field] || []
          subfields.each do |subfield|
            value = group.nil? ? nil : group[subfield]
            if value.nil?
              columns << nil
            elsif dumped_subfields.include?(subfield)
              # Subitems will either get written as they are or dumped
              columns << field_to_csv(::JSON.dump(value))
            else
              columns << field_to_csv(value)
            end
          end
        end

        CARTODB_FIELDS.each do |field|
          # A missing geometry still produces a quoted empty field ("")
          columns << field_to_csv(calculate_the_geom(item)) if field == :the_geom
        end

        additional_fields.each do |_key, value|
          columns << field_to_csv(value)
        end

        columns.join(',')
      end

      # Quotes a single value as a CSV field: double quotes are doubled,
      # backslashes, CR and LF become spaces and NUL characters are removed.
      def field_to_csv(field)
        # RFC4180
        escaped = field.to_s
                       .gsub('"', '""')
                       .gsub("\\", ' ')
                       .gsub("\x0D", ' ')
                       .gsub("\x0A", ' ')
                       .gsub("\0", '')
        '"' + escaped + '"'
      end

      # Returns the JSON string to store as the_geom for row, or nil when no
      # point can be found. The tweet's own :geo wins; otherwise the first
      # 'point' found in row[:gnip][:profileLocations] is used.
      def calculate_the_geom(row)
        geo = row[:geo]

        # Point
        unless geo.nil? || geo.empty?
          # Twitter/Gnip bug: They give GeoJSON-like with (lat,lon) point, so transform to proper GeoJSON (lon,lat)
          # Only happens here, location geo is fine, bounding boxes are fine, geo-enrichment is fine too
          #
          # The swap is done on a copy: swapping in place would corrupt the
          # caller's data and flip the point back on a second conversion.
          # NOTE(review): assumes geo[:coordinates] is an Array - confirm.
          coordinates = geo[:coordinates].dup
          coordinates[0], coordinates[1] = coordinates[1], coordinates[0]
          return ::JSON.dump(geo.merge(coordinates: coordinates))
        end

        # Geo-enrichment
        gnip = row[:gnip]
        return nil if gnip.nil? || gnip.empty?

        profile_locations = gnip[:profileLocations]
        return nil if profile_locations.nil? || profile_locations.empty?

        # Store first point found (only)
        point_location = profile_locations.find do |location|
          location_geo = location[:geo]
          !location_geo.nil? && !location_geo.empty? && location_geo[:type] == 'point'
        end

        point_location.nil? ? nil : ::JSON.dump(point_location[:geo])
      end
    end
  end
end