diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index 6603b8df1..0653214f5 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -199,12 +199,22 @@ class Formatter
result.flatten.join
end
+ UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
+
def utf8_friendly_extractor(text, options = {})
old_to_new_index = [0]
escaped = text.chars.map do |c|
- output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c
+ output = begin
+ if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil?
+ CGI.escape(c)
+ else
+ c
+ end
+ end
+
old_to_new_index << old_to_new_index.last + output.length
+
output
end.join
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
index 0e8f5bfeb..0ddbbee98 100644
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@@ -1,7 +1,7 @@
module Twitter
class Regex
- REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
- REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
+ REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
+ REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
REGEXEN[:valid_url_balanced_parens] = /
\(
(?:
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
index 8fb6695a9..96d2fc7e0 100644
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -115,6 +115,22 @@ RSpec.describe Formatter do
end
end
+ context 'given a URL in quotation marks' do
+ let(:text) { '"https://example.com/"' }
+
+ it 'does not match the quotation marks' do
+ is_expected.to include 'href="https://example.com/"'
+ end
+ end
+
+ context 'given a URL in angle brackets' do
+ let(:text) { '' }
+
+ it 'does not match the angle brackets' do
+ is_expected.to include 'href="https://example.com/"'
+ end
+ end
+
context 'given a URL with Japanese path string' do
let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
@@ -131,6 +147,22 @@ RSpec.describe Formatter do
end
end
+ context 'given a URL with a full-width space' do
+ let(:text) { 'https://example.com/ abc123' }
+
+ it 'does not match the full-width space' do
+ is_expected.to include 'href="https://example.com/"'
+ end
+ end
+
+ context 'given a URL in Japanese quotation marks' do
+ let(:text) { '「[https://example.org/」' }
+
+ it 'does not match the quotation marks' do
+ is_expected.to include 'href="https://example.org/"'
+ end
+ end
+
context 'given a URL with Simplified Chinese path string' do
let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
@@ -150,7 +182,11 @@ RSpec.describe Formatter do
context 'given a URL containing unsafe code (XSS attack, visible part)' do
let(:text) { %q{http://example.com/bb} }
- it 'escapes the HTML in the URL' do
+ it 'does not include the HTML in the URL' do
+ is_expected.to include '"http://example.com/b"'
+ end
+
+ it 'escapes the HTML' do
is_expected.to include '<del>b</del>'
end
end
@@ -158,7 +194,11 @@ RSpec.describe Formatter do
context 'given a URL containing unsafe code (XSS attack, invisible part)' do
let(:text) { %q{http://example.com/blahblahblahblah/a} }
- it 'escapes the HTML in the URL' do
+ it 'does not include the HTML in the URL' do
+ is_expected.to include '"http://example.com/blahblahblahblah/a"'
+ end
+
+ it 'escapes the HTML' do
is_expected.to include '<script>alert("Hello")</script>'
end
end