catstodon/config/initializers/twitter_regex.rb
Claire 211d5c3c30
Fix inefficiencies in auto-linking code (#16506)
The auto-linking code basically rewrote the whole string escaping non-ascii
characters in an inefficient way, and building a full character offset map
between the unescaped and escaped texts before sending the contents to
TwitterText's extractor.

Instead of doing that, this commit changes the TwitterText regexps to include
valid IRI characters in addition to valid URI characters.
2021-07-15 15:56:58 +02:00

107 lines
5.4 KiB
Ruby

module Twitter::TwitterText
class Configuration
def emoji_parsing_enabled
false
end
end
class Regex
REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
REGEXEN[:valid_url_balanced_parens] = /
\(
(?:
#{REGEXEN[:valid_general_url_path_chars]}+
|
# allow one nested level of balanced parentheses
(?:
#{REGEXEN[:valid_general_url_path_chars]}*
\(
#{REGEXEN[:valid_general_url_path_chars]}+
\)
#{REGEXEN[:valid_general_url_path_chars]}*
)
)
\)
/iox
REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
REGEXEN[:valid_url_path] = /(?:
(?:
#{REGEXEN[:valid_general_url_path_chars]}*
(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
#{REGEXEN[:valid_url_path_ending_chars]}
)|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
)/iox
REGEXEN[:valid_url] = %r{
( # $1 total match
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
( # $3 URL
((?:https?|dat|dweb|ipfs|ipns|ssb|gopher|gemini):\/\/)? # $4 Protocol (optional)
(#{REGEXEN[:valid_domain]}) # $5 Domain(s)
(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
)
)
}iox
REGEXEN[:validate_nodeid] = /(?:
#{REGEXEN[:validate_url_unreserved]}|
#{REGEXEN[:validate_url_pct_encoded]}|
[!$()*+,;=]
)/iox
REGEXEN[:validate_resid] = /(?:
#{REGEXEN[:validate_url_unreserved]}|
#{REGEXEN[:validate_url_pct_encoded]}|
#{REGEXEN[:validate_url_sub_delims]}
)/iox
REGEXEN[:xmpp_uri] = %r{
(xmpp:) # Protocol
(//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # Authority (optional)
(#{REGEXEN[:validate_nodeid]}+@)? # Username in path (optional)
(#{REGEXEN[:valid_domain]}) # Domain in path
(/#{REGEXEN[:validate_resid]}+)? # Resource in path (optional)
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
}iox
REGEXEN[:magnet_uri] = %r{
(magnet:) # Protocol
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String
}iox
REGEXEN[:valid_extended_uri] = %r{
( # $1 total match
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
( # $3 URL
(#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
)
)
}iox
end
module Extractor
# Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
# XMPP or magnet URIs an empty array will be returned.
#
# If a block is given then it will be called for each XMPP URI.
def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end
return [] unless text && text.index(":")
urls = []
text.to_s.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do
valid_uri_match_data = $~
start_position = valid_uri_match_data.char_begin(3)
end_position = valid_uri_match_data.char_end(3)
urls << {
:url => valid_uri_match_data[3],
:indices => [start_position, end_position]
}
end
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
urls
end
end
end