forked from fedi/mastodon
Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
This commit is contained in:
parent
d1e08bd38c
commit
d010e270e6
|
@ -13,6 +13,10 @@ class LanguageDetector
|
||||||
detected_language_code || default_locale.to_sym
|
detected_language_code || default_locale.to_sym
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def prepared_text
|
||||||
|
simplified_text.strip
|
||||||
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def detected_language_code
|
def detected_language_code
|
||||||
|
@ -20,18 +24,21 @@ class LanguageDetector
|
||||||
end
|
end
|
||||||
|
|
||||||
def result
|
def result
|
||||||
@result ||= @identifier.find_language(text_without_urls)
|
@result ||= @identifier.find_language(prepared_text)
|
||||||
end
|
end
|
||||||
|
|
||||||
def detected_language_reliable?
|
def detected_language_reliable?
|
||||||
result.reliable?
|
result.reliable?
|
||||||
end
|
end
|
||||||
|
|
||||||
def text_without_urls
|
def simplified_text
|
||||||
text.dup.tap do |new_text|
|
text.dup.tap do |new_text|
|
||||||
URI.extract(new_text).each do |url|
|
URI.extract(new_text).each do |url|
|
||||||
new_text.gsub!(url, '')
|
new_text.gsub!(url, '')
|
||||||
end
|
end
|
||||||
|
new_text.gsub!(Account::MENTION_RE, '')
|
||||||
|
new_text.gsub!(Tag::HASHTAG_RE, '')
|
||||||
|
new_text.gsub!(/\s+/, ' ')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,45 @@
|
||||||
# frozen_string_literal: true
|
# frozen_string_literal: true
|
||||||
|
|
||||||
require 'rails_helper'
|
require 'rails_helper'
|
||||||
|
|
||||||
describe LanguageDetector do
|
describe LanguageDetector do
|
||||||
|
describe 'prepared_text' do
|
||||||
|
it 'returns unmodified string without special cases' do
|
||||||
|
string = 'just a regular string'
|
||||||
|
result = described_class.new(string).prepared_text
|
||||||
|
|
||||||
|
expect(result).to eq string
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'collapses spacing in strings' do
|
||||||
|
string = 'The formatting in this is very odd'
|
||||||
|
|
||||||
|
result = described_class.new(string).prepared_text
|
||||||
|
expect(result).to eq 'The formatting in this is very odd'
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'strips usernames from strings before detection' do
|
||||||
|
string = '@username Yeah, very surreal...! also @friend'
|
||||||
|
|
||||||
|
result = described_class.new(string).prepared_text
|
||||||
|
expect(result).to eq 'Yeah, very surreal...! also'
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'strips URLs from strings before detection' do
|
||||||
|
string = 'Our website is https://example.com and also http://localhost.dev'
|
||||||
|
|
||||||
|
result = described_class.new(string).prepared_text
|
||||||
|
expect(result).to eq 'Our website is and also'
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'strips #hashtags from strings before detection' do
|
||||||
|
string = 'Hey look at all the #animals and #fish'
|
||||||
|
|
||||||
|
result = described_class.new(string).prepared_text
|
||||||
|
expect(result).to eq 'Hey look at all the and'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
describe 'to_iso_s' do
|
describe 'to_iso_s' do
|
||||||
it 'detects english language for basic strings' do
|
it 'detects english language for basic strings' do
|
||||||
strings = [
|
strings = [
|
||||||
|
|
Loading…
Reference in a new issue