# lib/video_info/providers/youtube_scraper.rb
require "oga"
require "net_http_timeout_errors"
class VideoInfo
module Providers
module YoutubeScraper
# Root URL of the YouTube site.
BASE_URL = "https://www.youtube.com"
# Prefix of channel page URLs; author_url joins a channel id onto it.
CHANNEL_URL = "#{BASE_URL}/channel/"
# Size argument handed to resize_thumb when building author_thumbnail.
THUMB_DEFAULT_SIZE = 88
# Reports whether the video is available, i.e. whether a title could be read.
# Always returns a strict boolean.
def available?
  title ? true : false
end
# Publication time of the video, parsed from the "datePublished" itemprop
# meta node. Returns nil when that node is not present.
def date
  published = itemprop_node_value("datePublished")
  return unless published

  Time.parse(published)
end
# Author name, read from the "og:title" meta node of the channel page.
def author
  nodes = channel_meta_nodes
  meta_node_value(nodes, "og:title")
end
# Author thumbnail URL: the channel page's "og:image" value passed through
# resize_thumb with THUMB_DEFAULT_SIZE.
def author_thumbnail
  resize_thumb(
    meta_node_value(channel_meta_nodes, "og:image"),
    THUMB_DEFAULT_SIZE
  )
end
# Channel id taken from the first `"channelId":"..."` occurrence in the
# stringified page data; nil when the page contains no such occurrence.
def channel_id
  # NOTE the previous implementation, based on itemprop_node_value("channelId"),
  # does not work well, probably due to some YouTube change
  data.to_s[/"channelId":"([a-zA-Z0-9_-]+)"/, 1]
end
# URL of the author's channel page: channel_id joined onto CHANNEL_URL.
def author_url
  channel_uri = URI.join(CHANNEL_URL, channel_id)
  channel_uri.to_s
end
# Video description, read from the "og:description" meta node of the video page.
def description
  nodes = video_meta_nodes
  meta_node_value(nodes, "og:description")
end
# Video length in whole seconds, converted from the ISO 8601 value of the
# "duration" itemprop meta node. Returns 0 when that node is not present.
def duration
  iso_duration = itemprop_node_value("duration")
  return 0 unless iso_duration

  ISO8601::Duration.new(iso_duration).to_seconds.to_i
end
# Keywords of the video as an array, split on ", " from the "keywords" meta
# node. Returns nil when the video is unavailable or the node is missing.
def keywords
  return nil unless available?

  raw_keywords = meta_node_value(video_meta_nodes, "keywords")
  raw_keywords.nil? ? nil : raw_keywords.split(", ")
end
# Video title, read from the "title" meta node of the video page.
#
# Returns nil when the node is missing or its content is empty. The lookup
# result is nil-checked before `empty?` is called, so a page without a title
# node yields nil instead of raising NoMethodError (available? relies on this).
def title
  value = meta_node_value(video_meta_nodes, "title")
  value unless value.nil? || value.empty?
end
# View count as an Integer, from the "interactionCount" itemprop meta node.
# A missing node gives nil, and nil.to_i is 0.
def view_count
  raw_count = itemprop_node_value("interactionCount")
  raw_count.to_i
end
# Statistics hash for the video; currently holds only "viewCount".
def stats
  { "viewCount" => view_count }
end
private
# All <meta> nodes of the video page, memoized in @video_meta_nodes.
def video_meta_nodes
  return @video_meta_nodes if @video_meta_nodes

  @video_meta_nodes = data.css("meta")
end
# All <meta> nodes of the channel page, memoized in @channel_meta_nodes.
def channel_meta_nodes
  return @channel_meta_nodes if @channel_meta_nodes

  @channel_meta_nodes = channel_data.css("meta")
end
# Parsed HTML document of the channel page at author_url, fetched on first
# use and memoized in @channel_data.
def channel_data
  return @channel_data if @channel_data

  channel_html = URI.parse(author_url).read
  @channel_data = Oga.parse_html(channel_html)
end
# Content of the first node in +meta_nodes+ whose "name" or "property"
# attribute equals +name+.
#
# Returns nil when no node matches or the matching node has no "content"
# attribute.
def meta_node_value(meta_nodes, name)
  matching = meta_nodes.find do |candidate|
    %w[name property].any? { |key| candidate.attr(key)&.value == name }
  end
  return unless matching

  content_attr = matching.attr("content")
  content_attr&.value
end
# Content of the first video-page meta node whose "itemprop" attribute
# equals +name+.
#
# Returns nil when no node matches. A matching node without a "content"
# attribute also yields nil rather than raising NoMethodError, mirroring the
# nil-safe behaviour of meta_node_value.
def itemprop_node_value(name)
  node = video_meta_nodes.find do |meta|
    itemprop_attr = meta.attr("itemprop")
    # Nodes without an itemprop attribute never match.
    itemprop_attr && itemprop_attr.value == name
  end
  return unless node

  node.attr("content")&.value
end
# Fetches and parses the HTML page for the video.
#
# When the original url is a fullscreen-style ".com/v/<id>" link, the watch
# page for that id is fetched instead of +api_url+.
def _set_data_from_api_impl(api_url)
  page_url =
    if url.include?(".com/v/")
      # handle fullscreen video URLs
      video_id = url.split("/v/")[1].split("?")[0]
      "https://www.youtube.com/watch?v=" + video_id
    else
      api_url
    end

  Oga.parse_html(URI.parse(page_url).read)
end
# The video URL normalised to the https scheme.
#
# A scheme-less @url parses as a bare path, so "//" is prepended to that path
# before the scheme is set; the result then serialises as "https://<path>".
def _api_url
  uri = URI.parse(@url)
  uri.path = "//#{uri.path}" if uri.scheme.nil?
  uri.scheme = "https"
  uri.to_s
end
end
end
end