ixxi-dante/nw2vec

View on GitHub
sosweet-scripts/pipe-user_timestamp_body-csv

Summary

Maintainability
Test Coverage
#!/bin/bash
# Extract user_id, tweet timestamp and body, from a pipe, and pipe it out as csv
#
# Usage: <producer> | pipe-user_timestamp_body-csv > out.csv
# Input  (stdin):  one candidate json record per line
# Output (stdout): one csv line (user_id,timestamp,body) per record jq could process

# Don't tolerate errors.
# -e alone only reacts to the exit status of the LAST pipeline stage (the final
# sed), so a failing or missing jq used to go unnoticed; pipefail makes the
# pipeline (and thus the script) fail if any stage fails.
# -u turns any use of an unset variable into an error.
set -euo pipefail

# Pipeline stages, in order:
# 1. sed removes leading text which sometimes appears before a valid json line
#    (everything up to the first '{' is dropped)
# 2. jq -R 'fromjson?' outputs only the valid json lines, skipping those that
#    are not valid json
# 3. jq '...' does our actual parsing to extract the data we want: the numeric
#    part of .actor.id (prefix "id:twitter.com:" stripped), .postedTime, .body
# 4. the final sed turns the quoted strings output by jq into csv lines: it
#    strips the enclosing quotes and unescapes \" into ", leaving other json
#    escapes untouched (otherwise, using 'jq -r' prints newlines in the json
#    as actual newlines in the csv, see
#    https://github.com/stedolan/jq/issues/48#issuecomment-55262164 )
sed 's/^[^{]*//' \
  | jq -R 'fromjson?' \
  | jq '[(.actor.id | ltrimstr("id:twitter.com:") | tonumber), .postedTime, .body] | @csv' \
  | sed 's/^"//;s/"$//;s/\\"/"/g'