catstodon/config/initializers/twitter_regex.rb

98 lines
4.5 KiB
Ruby
Raw Normal View History

module Twitter
class Regex
REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
REGEXEN[:valid_url_balanced_parens] = /
\(
(?:
#{REGEXEN[:valid_general_url_path_chars]}+
|
# allow one nested level of balanced parentheses
(?:
#{REGEXEN[:valid_general_url_path_chars]}*
\(
#{REGEXEN[:valid_general_url_path_chars]}+
\)
#{REGEXEN[:valid_general_url_path_chars]}*
)
)
\)
/iox
REGEXEN[:valid_url_path] = /(?:
(?:
#{REGEXEN[:valid_general_url_path_chars]}*
(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
#{REGEXEN[:valid_url_path_ending_chars]}
)|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
)/iox
REGEXEN[:valid_url] = %r{
( # $1 total match
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
( # $3 URL
((?:https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)? # $4 Protocol (optional)
(#{REGEXEN[:valid_domain]}) # $5 Domain(s)
(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
)
)
}iox
REGEXEN[:validate_nodeid] = /(?:
#{REGEXEN[:validate_url_unreserved]}|
#{REGEXEN[:validate_url_pct_encoded]}|
[!$()*+,;=]
)/iox
REGEXEN[:validate_resid] = /(?:
#{REGEXEN[:validate_url_unreserved]}|
#{REGEXEN[:validate_url_pct_encoded]}|
#{REGEXEN[:validate_url_sub_delims]}
)/iox
2020-01-23 21:27:26 +01:00
REGEXEN[:xmpp_uri] = %r{
(xmpp:) # Protocol
(//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # Authority (optional)
(#{REGEXEN[:validate_nodeid]}+@)? # Username in path (optional)
(#{REGEXEN[:valid_domain]}) # Domain in path
(/#{REGEXEN[:validate_resid]}+)? # Resource in path (optional)
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
}iox
REGEXEN[:magnet_uri] = %r{
(magnet:) # Protocol
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String
}iox
REGEXEN[:valid_extended_uri] = %r{
( # $1 total match
(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
( # $3 URL
(#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
)
)
}iox
end
module Extractor
2020-01-23 21:27:26 +01:00
# Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
2020-01-23 21:27:26 +01:00
# XMPP or magnet URIs an empty array will be returned.
#
# If a block is given then it will be called for each XMPP URI.
2020-01-23 21:27:26 +01:00
def extract_extra_uris_with_indices(text, options = {}) # :yields: uri, start, end
return [] unless text && text.index(":")
urls = []
2020-01-23 21:27:26 +01:00
text.to_s.scan(Twitter::Regex[:valid_extended_uri]) do
valid_uri_match_data = $~
start_position = valid_uri_match_data.char_begin(3)
end_position = valid_uri_match_data.char_end(3)
urls << {
:url => valid_uri_match_data[3],
:indices => [start_position, end_position]
}
end
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
urls
end
end
end