mastodon/config/initializers/twitter_regex.rb

module Twitter
  # Reopens Twitter::Regex to replace or add URL-related entries in the REGEXEN
  # table. REGEXEN itself, and every key read below but not assigned in this
  # block (:valid_url_preceding_chars, :valid_domain, :valid_port_number,
  # :valid_url_query_chars, :valid_url_query_ending_chars,
  # :validate_url_unreserved, :validate_url_pct_encoded,
  # :validate_url_sub_delims), are defined outside this file -- presumably by
  # the twitter-text gem; confirm.
  #
  # Statement order matters: each pattern interpolates the entries that exist at
  # the moment its literal is evaluated.
  class Regex
    # One character allowed inside a URL path: anything except Unicode
    # whitespace, angle brackets, parentheses and "?".
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
    # What a URL path may end with: either a single character that is not
    # whitespace or one of the listed punctuation marks, or a balanced
    # parenthesised group.
    # NOTE(review): :valid_url_balanced_parens is interpolated here *before* it
    # is reassigned on the next statement, so this pattern embeds whatever
    # definition existed before this file ran, not the one below -- confirm
    # that this is intentional.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
    # A parenthesised group inside a path. The group contains either plain path
    # characters, or path characters wrapped around exactly one further
    # parenthesised run (one level of nesting).
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox
    # One repeatable unit of a URL path (it is repeated with "*" in :valid_url).
    # Two alternatives:
    #   1. optional path characters, then any number of
    #      (balanced parens + optional path characters), then a valid ending
    #      character;
    #   2. one or more path characters followed by "/".
    # The pattern uses the x flag, so the literal space between the two
    # interpolations in the first alternative is not part of the match.
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox
    # Full URL matcher: preceding character, optional protocol (http, https,
    # dat, dweb, ipfs, ipns, ssb, gopher or gemini followed by "://"), domain,
    # optional port, optional path and optional query string. The intended
    # capture groups are listed in the inline comments.
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
        (                                                                                   #   $3 URL
          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher|gemini):\/\/)?                           #   $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
        )
      )
    }iox
    # One unit of the node (user) part of an XMPP address: an unreserved
    # character, a percent-encoded sequence, or one of ! $ ( ) * + , ; =
    REGEXEN[:validate_nodeid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      [!$()*+,;=]
    )/iox
    # One unit of the resource part of an XMPP address: an unreserved
    # character, a percent-encoded sequence, or a sub-delimiter.
    REGEXEN[:validate_resid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}
    )/iox
    # An "xmpp:" URI: optional "//node@domain/" authority, optional "node@"
    # user part, a required domain, optional "/resource" and optional query.
    REGEXEN[:xmpp_uri] = %r{
      (xmpp:)                                                                           # Protocol
      (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     # Authority (optional)
      (#{REGEXEN[:validate_nodeid]}+@)?                                                 # Username in path (optional)
      (#{REGEXEN[:valid_domain]})                                                       # Domain in path
      (/#{REGEXEN[:validate_resid]}+)?                                                  # Resource in path (optional)
      (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
    }iox
    # A "magnet:" URI. Unlike the other patterns, the query string is required.
    REGEXEN[:magnet_uri] = %r{
      (magnet:)                                                                         # Protocol
      (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})  # Query String
    }iox
    # Matches either an XMPP or a magnet URI after a valid preceding character.
    # Capture group 3 holds the URI itself; it is the group read by
    # Extractor#extract_extra_uris_with_indices.
    REGEXEN[:valid_extended_uri] = %r{
      (                                                                                 #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                        #   $2 Preceding character
        (                                                                               #   $3 URL
          (#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
        )
      )
    }iox
  end

  module Extractor
    # Extracts every XMPP and magnet URI contained in the toot <tt>text</tt>,
    # together with the indices of each URI inside the text.
    #
    # text::     the text to scan. It is coerced with +to_s+ before anything
    #            else happens, so +nil+ and other non-String values are
    #            handled the same way as their string form.
    # _options:: accepted but not read by this method.
    #
    # Returns an array of hashes shaped like
    # <tt>{ url: "xmpp:user@example.com", indices: [start, end] }</tt>, in the
    # order the URIs appear. An empty array is returned when the text contains
    # no ":" at all or no XMPP/magnet URI.
    #
    # If a block is given, it is called once per extracted URI (XMPP or
    # magnet) with the URI, its start index and its end index. The block runs
    # after the whole text has been scanned.
    def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end
      source = text.to_s
      # Both supported schemes contain a colon; skip the regex scan without one.
      return [] unless source.include?(':')

      uris = []
      source.scan(Twitter::Regex[:valid_extended_uri]) do
        match = Regexp.last_match
        # Group 3 is the URI without the preceding character matched by group 2.
        # char_begin/char_end are not core MatchData methods -- presumably
        # provided by twitter-text; confirm.
        uris << { url: match[3], indices: [match.char_begin(3), match.char_end(3)] }
      end

      uris.each { |uri| yield uri[:url], *uri[:indices] } if block_given?
      uris
    end
  end
end
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`module Twitter`
			`class Regex`
Fix URL linkifier grabbing full-width spaces and quotations (#9997) Fix #9993 Fix #5654 2019-02-09 19:13:11 +00:00			`REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou`
			`REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\\|@]\|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`REGEXEN[:valid_url_balanced_parens] = /`
			`\(`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\|`
			`# allow one nested level of balanced parentheses`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`\(`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\)`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`)`
			`)`
			`\)`
			`/iox`
			`REGEXEN[:valid_url_path] = /(?:`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]})`
			`#{REGEXEN[:valid_url_path_ending_chars]}`
			`)\|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)`
			`)/iox`
			`REGEXEN[:valid_url] = %r{`
			`( # $1 total match`
Misc. typos (#8694) Found via `codespell -q 3 --skip="./app/javascript/mastodon/locales,./config/locales"` 2018-09-13 22:53:09 +00:00			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`( # $3 URL`
Add support for Gemini urls (#15013) This PR updates the `valid_url` regex and sanitizer allowlist to provide support for Gemini urls. Closes #14991 2020-10-19 15:02:13 +00:00			`((?:https?\|dat\|dweb\|ipfs\|ipns\|ssb\|gopher\|gemini):\/\/)? # $4 Protocol (optional)`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`(#{REGEXEN[:valid_domain]}) # $5 Domain(s)`
			`(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)`
			`(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String`
			`)`
			`)`
			`}iox`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`REGEXEN[:validate_nodeid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`[!$()*+,;=]`
			`)/iox`
			`REGEXEN[:validate_resid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`#{REGEXEN[:validate_url_sub_delims]}`
			`)/iox`
Add support for magnet: URIs (#12905) 2020-01-23 20:27:26 +00:00			`REGEXEN[:xmpp_uri] = %r{`
			`(xmpp:) # Protocol`
			`(//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # Authority (optional)`
			`(#{REGEXEN[:validate_nodeid]}+@)? # Username in path (optional)`
			`(#{REGEXEN[:valid_domain]}) # Domain in path`
			`(/#{REGEXEN[:validate_resid]}+)? # Resource in path (optional)`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String`
			`}iox`
			`REGEXEN[:magnet_uri] = %r{`
			`(magnet:) # Protocol`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String`
			`}iox`
			`REGEXEN[:valid_extended_uri] = %r{`
			`( # $1 total match`
			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
			`( # $3 URL`
			`(#{REGEXEN[:xmpp_uri]}) \| (#{REGEXEN[:magnet_uri]})`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`)`
			`)`
			`}iox`
			`end`

			`module Extractor`
Add support for magnet: URIs (#12905) 2020-01-23 20:27:26 +00:00			`# Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no`
Add support for magnet: URIs (#12905) 2020-01-23 20:27:26 +00:00			`# XMPP or magnet URIs an empty array will be returned.`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`#`
			`# If a block is given then it will be called for each XMPP URI.`
Fixed code quality issues (#15541) * Added .deepsource.toml * Removed bad use of `alias` * Fixed operand order in the binary expression * Prefixed unused method arguments with an underscore * Replaced the old OpenSSL algorithmic constants with the newer strings initializers. * Removed unnecessary UTF-8 encoding comment 2021-01-31 20:26:09 +00:00			`def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`return [] unless text && text.index(":")`
			`urls = []`

Add support for magnet: URIs (#12905) 2020-01-23 20:27:26 +00:00			`text.to_s.scan(Twitter::Regex[:valid_extended_uri]) do`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`valid_uri_match_data = $~`

			`start_position = valid_uri_match_data.char_begin(3)`
			`end_position = valid_uri_match_data.char_end(3)`

			`urls << {`
			`:url => valid_uri_match_data[3],`
			`:indices => [start_position, end_position]`
			`}`
			`end`
			`urls.each{\|url\| yield url[:url], url[:indices].first, url[:indices].last} if block_given?`
			`urls`
			`end`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`end`
			`end`