catstodon/app/services/fetch_oembed_service.rb
Claire ec059317fa
Fix some link previews being incorrectly generated from other prior links (#16885)
* Add tests

* Fix some link previews being incorrectly generated from different prior links

PR #12403 added a cache to avoid redundant queries when the OEmbed endpoint can
be guessed from the URL. This caching mechanism is not perfectly correct as
there is no guarantee that all pages from a given domain share the same
OEmbed provider endpoint.

This PR prevents the FetchOEmbedService from caching OEmbed endpoint that
cannot be generalized by replacing a fully-qualified URL from the endpoint's
parameters, greatly reducing the number of incorrect cached generalizations.
2021-10-21 20:39:35 +02:00

113 lines
2.9 KiB
Ruby

# frozen_string_literal: true
class FetchOEmbedService
ENDPOINT_CACHE_EXPIRES_IN = 24.hours.freeze
URL_REGEX = /(=(http[s]?(%3A|:)(\/\/|%2F%2F)))([^&]*)/i.freeze
attr_reader :url, :options, :format, :endpoint_url
def call(url, options = {})
@url = url
@options = options
if @options[:cached_endpoint]
parse_cached_endpoint!
else
discover_endpoint!
end
fetch!
end
private
def discover_endpoint!
return if html.nil?
@format = @options[:format]
page = Nokogiri::HTML(html)
if @format.nil? || @format == :json
@endpoint_url ||= page.at_xpath('//link[@type="application/json+oembed"]')&.attribute('href')&.value
@format ||= :json if @endpoint_url
end
if @format.nil? || @format == :xml
@endpoint_url ||= page.at_xpath('//link[@type="text/xml+oembed"]')&.attribute('href')&.value
@format ||= :xml if @endpoint_url
end
return if @endpoint_url.blank?
@endpoint_url = begin
base_url = Addressable::URI.parse(@url)
# If the OEmbed endpoint is given as http but the URL we opened
# was served over https, we can assume OEmbed will be available
# through https as well
(base_url + @endpoint_url).tap do |absolute_url|
absolute_url.scheme = base_url.scheme if base_url.scheme == 'https'
end.to_s
end
cache_endpoint!
rescue Addressable::URI::InvalidURIError
@endpoint_url = nil
end
def parse_cached_endpoint!
cached = @options[:cached_endpoint]
return if cached[:endpoint].nil? || cached[:format].nil?
@endpoint_url = Addressable::Template.new(cached[:endpoint]).expand(url: @url).to_s
@format = cached[:format]
end
def cache_endpoint!
return unless URL_REGEX.match?(@endpoint_url)
url_domain = Addressable::URI.parse(@url).normalized_host
endpoint_hash = {
endpoint: @endpoint_url.gsub(URL_REGEX, '={url}'),
format: @format,
}
Rails.cache.write("oembed_endpoint:#{url_domain}", endpoint_hash, expires_in: ENDPOINT_CACHE_EXPIRES_IN)
end
def fetch!
return if @endpoint_url.blank?
body = Request.new(:get, @endpoint_url).perform do |res|
res.code != 200 ? nil : res.body_with_limit
end
validate(parse_for_format(body)) if body.present?
rescue Oj::ParseError, Ox::ParseError
nil
end
def parse_for_format(body)
case @format
when :json
Oj.load(body, mode: :strict)&.with_indifferent_access
when :xml
Ox.load(body, mode: :hash_no_attrs)&.with_indifferent_access&.dig(:oembed)
end
end
def validate(oembed)
oembed if oembed[:version] == '1.0' && oembed[:type].present?
end
def html
return @html if defined?(@html)
@html = @options[:html] || Request.new(:get, @url).add_headers('Accept' => 'text/html').perform do |res|
res.code != 200 || res.mime_type != 'text/html' ? nil : res.body_with_limit
end
end
end