diff --git a/Gemfile b/Gemfile
index afed1ac94..e869e5f7a 100644
--- a/Gemfile
+++ b/Gemfile
@@ -29,9 +29,7 @@ gem 'addressable', '~> 2.8'
gem 'bootsnap', '~> 1.10.2', require: false
gem 'browser'
gem 'charlock_holmes', '~> 0.7.7'
-gem 'iso-639'
gem 'chewy', '~> 7.2'
-gem 'cld3', '~> 3.4.4'
gem 'devise', '~> 4.8'
gem 'devise-two-factor', '~> 4.0'
diff --git a/Gemfile.lock b/Gemfile.lock
index 5ecddec12..f7dd292dd 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -152,8 +152,6 @@ GEM
elasticsearch (>= 7.12.0, < 7.14.0)
elasticsearch-dsl
chunky_png (1.4.0)
- cld3 (3.4.4)
- ffi (>= 1.1.0, < 1.16.0)
climate_control (0.2.0)
coderay (1.1.3)
color_diff (0.1)
@@ -301,7 +299,6 @@ GEM
terminal-table (>= 1.5.1)
idn-ruby (0.1.4)
ipaddress (0.8.3)
- iso-639 (0.3.5)
jmespath (1.5.0)
json (2.5.1)
json-canonicalization (0.3.0)
@@ -698,7 +695,6 @@ DEPENDENCIES
capybara (~> 3.36)
charlock_holmes (~> 0.7.7)
chewy (~> 7.2)
- cld3 (~> 3.4.4)
climate_control (~> 0.2)
color_diff (~> 0.1)
concurrent-ruby
@@ -725,7 +721,6 @@ DEPENDENCIES
httplog (~> 1.5.0)
i18n-tasks (~> 0.9)
idn-ruby
- iso-639
json-ld
json-ld-preloaded (~> 3.2)
kaminari (~> 1.2)
diff --git a/app/helpers/languages_helper.rb b/app/helpers/languages_helper.rb
index 730724208..f3ed7b314 100644
--- a/app/helpers/languages_helper.rb
+++ b/app/helpers/languages_helper.rb
@@ -1,94 +1,237 @@
# frozen_string_literal: true
module LanguagesHelper
- HUMAN_LOCALES = {
- af: 'Afrikaans',
- ar: 'العربية',
- ast: 'Asturianu',
- bg: 'Български',
- bn: 'বাংলা',
- br: 'Breton',
- ca: 'Català',
- co: 'Corsu',
- cs: 'Čeština',
- cy: 'Cymraeg',
- da: 'Dansk',
- de: 'Deutsch',
- el: 'Ελληνικά',
- en: 'English',
- eo: 'Esperanto',
+ ISO_639_1 = {
+ aa: ['Afar', 'Afaraf'].freeze,
+ ab: ['Abkhaz', 'аҧсуа бызшәа'].freeze,
+ ae: ['Avestan', 'avesta'].freeze,
+ af: ['Afrikaans', 'Afrikaans'].freeze,
+ ak: ['Akan', 'Akan'].freeze,
+ am: ['Amharic', 'አማርኛ'].freeze,
+ an: ['Aragonese', 'aragonés'].freeze,
+ ar: ['Arabic', 'اللغة العربية'].freeze,
+ as: ['Assamese', 'অসমীয়া'].freeze,
+ av: ['Avaric', 'авар мацӀ'].freeze,
+ ay: ['Aymara', 'aymar aru'].freeze,
+ az: ['Azerbaijani', 'azərbaycan dili'].freeze,
+ ba: ['Bashkir', 'башҡорт теле'].freeze,
+ be: ['Belarusian', 'беларуская мова'].freeze,
+ bg: ['Bulgarian', 'български език'].freeze,
+ bh: ['Bihari', 'भोजपुरी'].freeze,
+ bi: ['Bislama', 'Bislama'].freeze,
+ bm: ['Bambara', 'bamanankan'].freeze,
+ bn: ['Bengali', 'বাংলা'].freeze,
+ bo: ['Tibetan', 'བོད་ཡིག'].freeze,
+ br: ['Breton', 'brezhoneg'].freeze,
+ bs: ['Bosnian', 'bosanski jezik'].freeze,
+ ca: ['Catalan', 'Català'].freeze,
+ ce: ['Chechen', 'нохчийн мотт'].freeze,
+ ch: ['Chamorro', 'Chamoru'].freeze,
+ co: ['Corsican', 'corsu'].freeze,
+ cr: ['Cree', 'ᓀᐦᐃᔭᐍᐏᐣ'].freeze,
+ cs: ['Czech', 'čeština'].freeze,
+ cu: ['Old Church Slavonic', 'ѩзыкъ словѣньскъ'].freeze,
+ cv: ['Chuvash', 'чӑваш чӗлхи'].freeze,
+ cy: ['Welsh', 'Cymraeg'].freeze,
+ da: ['Danish', 'dansk'].freeze,
+ de: ['German', 'Deutsch'].freeze,
+ dv: ['Divehi', 'Dhivehi'].freeze,
+ dz: ['Dzongkha', 'རྫོང་ཁ'].freeze,
+ ee: ['Ewe', 'Eʋegbe'].freeze,
+ el: ['Greek', 'Ελληνικά'].freeze,
+ en: ['English', 'English'].freeze,
+ eo: ['Esperanto', 'Esperanto'].freeze,
+ es: ['Spanish', 'Español'].freeze,
+ et: ['Estonian', 'eesti'].freeze,
+ eu: ['Basque', 'euskara'].freeze,
+ fa: ['Persian', 'فارسی'].freeze,
+ ff: ['Fula', 'Fulfulde'].freeze,
+ fi: ['Finnish', 'suomi'].freeze,
+ fj: ['Fijian', 'Vakaviti'].freeze,
+ fo: ['Faroese', 'føroyskt'].freeze,
+ fr: ['French', 'Français'].freeze,
+ fy: ['Western Frisian', 'Frysk'].freeze,
+ ga: ['Irish', 'Gaeilge'].freeze,
+ gd: ['Scottish Gaelic', 'Gàidhlig'].freeze,
+ gl: ['Galician', 'galego'].freeze,
+ gu: ['Gujarati', 'ગુજરાતી'].freeze,
+ gv: ['Manx', 'Gaelg'].freeze,
+ ha: ['Hausa', 'هَوُسَ'].freeze,
+ he: ['Hebrew', 'עברית'].freeze,
+ hi: ['Hindi', 'हिन्दी'].freeze,
+ ho: ['Hiri Motu', 'Hiri Motu'].freeze,
+ hr: ['Croatian', 'Hrvatski'].freeze,
+ ht: ['Haitian', 'Kreyòl ayisyen'].freeze,
+ hu: ['Hungarian', 'magyar'].freeze,
+ hy: ['Armenian', 'Հայերեն'].freeze,
+ hz: ['Herero', 'Otjiherero'].freeze,
+ ia: ['Interlingua', 'Interlingua'].freeze,
+ id: ['Indonesian', 'Bahasa Indonesia'].freeze,
+ ie: ['Interlingue', 'Interlingue'].freeze,
+ ig: ['Igbo', 'Asụsụ Igbo'].freeze,
+ ii: ['Nuosu', 'ꆈꌠ꒿ Nuosuhxop'].freeze,
+ ik: ['Inupiaq', 'Iñupiaq'].freeze,
+ io: ['Ido', 'Ido'].freeze,
+ is: ['Icelandic', 'Íslenska'].freeze,
+ it: ['Italian', 'Italiano'].freeze,
+ iu: ['Inuktitut', 'ᐃᓄᒃᑎᑐᑦ'].freeze,
+ ja: ['Japanese', '日本語'].freeze,
+ jv: ['Javanese', 'basa Jawa'].freeze,
+ ka: ['Georgian', 'ქართული'].freeze,
+ kg: ['Kongo', 'Kikongo'].freeze,
+ ki: ['Kikuyu', 'Gĩkũyũ'].freeze,
+ kj: ['Kwanyama', 'Kuanyama'].freeze,
+ kk: ['Kazakh', 'қазақ тілі'].freeze,
+ kl: ['Kalaallisut', 'kalaallisut'].freeze,
+ km: ['Khmer', 'ខេមរភាសា'].freeze,
+ kn: ['Kannada', 'ಕನ್ನಡ'].freeze,
+ ko: ['Korean', '한국어'].freeze,
+ kr: ['Kanuri', 'Kanuri'].freeze,
+ ks: ['Kashmiri', 'कश्मीरी'].freeze,
+ ku: ['Kurdish', 'Kurdî'].freeze,
+ kv: ['Komi', 'коми кыв'].freeze,
+ kw: ['Cornish', 'Kernewek'].freeze,
+ ky: ['Kyrgyz', 'Кыргызча'].freeze,
+ la: ['Latin', 'latine'].freeze,
+ lb: ['Luxembourgish', 'Lëtzebuergesch'].freeze,
+ lg: ['Ganda', 'Luganda'].freeze,
+ li: ['Limburgish', 'Limburgs'].freeze,
+ ln: ['Lingala', 'Lingála'].freeze,
+ lo: ['Lao', 'ພາສາ'].freeze,
+ lt: ['Lithuanian', 'lietuvių kalba'].freeze,
+ lu: ['Luba-Katanga', 'Tshiluba'].freeze,
+ lv: ['Latvian', 'latviešu valoda'].freeze,
+ mg: ['Malagasy', 'fiteny malagasy'].freeze,
+ mh: ['Marshallese', 'Kajin M̧ajeļ'].freeze,
+ mi: ['Māori', 'te reo Māori'].freeze,
+ mk: ['Macedonian', 'македонски јазик'].freeze,
+ ml: ['Malayalam', 'മലയാളം'].freeze,
+ mn: ['Mongolian', 'Монгол хэл'].freeze,
+ mr: ['Marathi', 'मराठी'].freeze,
+ ms: ['Malay', 'Bahasa Malaysia'].freeze,
+ mt: ['Maltese', 'Malti'].freeze,
+ my: ['Burmese', 'ဗမာစာ'].freeze,
+ na: ['Nauru', 'Ekakairũ Naoero'].freeze,
+ nb: ['Norwegian Bokmål', 'Norsk bokmål'].freeze,
+ nd: ['Northern Ndebele', 'isiNdebele'].freeze,
+ ne: ['Nepali', 'नेपाली'].freeze,
+ ng: ['Ndonga', 'Owambo'].freeze,
+ nl: ['Dutch', 'Nederlands'].freeze,
+ nn: ['Norwegian Nynorsk', 'Norsk nynorsk'].freeze,
+ no: ['Norwegian', 'Norsk'].freeze,
+ nr: ['Southern Ndebele', 'isiNdebele'].freeze,
+ nv: ['Navajo', 'Diné bizaad'].freeze,
+ ny: ['Chichewa', 'chiCheŵa'].freeze,
+ oc: ['Occitan', 'occitan'].freeze,
+ oj: ['Ojibwe', 'ᐊᓂᔑᓈᐯᒧᐎᓐ'].freeze,
+ om: ['Oromo', 'Afaan Oromoo'].freeze,
+ or: ['Oriya', 'ଓଡ଼ିଆ'].freeze,
+ os: ['Ossetian', 'ирон æвзаг'].freeze,
+ pa: ['Panjabi', 'ਪੰਜਾਬੀ'].freeze,
+ pi: ['Pāli', 'पाऴि'].freeze,
+ pl: ['Polish', 'Polski'].freeze,
+ ps: ['Pashto', 'پښتو'].freeze,
+ pt: ['Portuguese', 'Português'].freeze,
+ qu: ['Quechua', 'Runa Simi'].freeze,
+ rm: ['Romansh', 'rumantsch grischun'].freeze,
+ rn: ['Kirundi', 'Ikirundi'].freeze,
+ ro: ['Romanian', 'Română'].freeze,
+ ru: ['Russian', 'Русский'].freeze,
+ rw: ['Kinyarwanda', 'Ikinyarwanda'].freeze,
+ sa: ['Sanskrit', 'संस्कृतम्'].freeze,
+ sc: ['Sardinian', 'sardu'].freeze,
+ sd: ['Sindhi', 'सिन्धी'].freeze,
+ se: ['Northern Sami', 'Davvisámegiella'].freeze,
+ sg: ['Sango', 'yângâ tî sängö'].freeze,
+ si: ['Sinhala', 'සිංහල'].freeze,
+ sk: ['Slovak', 'slovenčina'].freeze,
+ sl: ['Slovenian', 'slovenščina'].freeze,
+ sn: ['Shona', 'chiShona'].freeze,
+ so: ['Somali', 'Soomaaliga'].freeze,
+ sq: ['Albanian', 'Shqip'].freeze,
+ sr: ['Serbian', 'српски језик'].freeze,
+ ss: ['Swati', 'SiSwati'].freeze,
+ st: ['Southern Sotho', 'Sesotho'].freeze,
+ su: ['Sundanese', 'Basa Sunda'].freeze,
+ sv: ['Swedish', 'Svenska'].freeze,
+ sw: ['Swahili', 'Kiswahili'].freeze,
+ ta: ['Tamil', 'தமிழ்'].freeze,
+ te: ['Telugu', 'తెలుగు'].freeze,
+ tg: ['Tajik', 'тоҷикӣ'].freeze,
+ th: ['Thai', 'ไทย'].freeze,
+ ti: ['Tigrinya', 'ትግርኛ'].freeze,
+ tk: ['Turkmen', 'Türkmen'].freeze,
+ tl: ['Tagalog', 'Wikang Tagalog'].freeze,
+ tn: ['Tswana', 'Setswana'].freeze,
+ to: ['Tonga', 'faka Tonga'].freeze,
+ tr: ['Turkish', 'Türkçe'].freeze,
+ ts: ['Tsonga', 'Xitsonga'].freeze,
+ tt: ['Tatar', 'татар теле'].freeze,
+ tw: ['Twi', 'Twi'].freeze,
+ ty: ['Tahitian', 'Reo Tahiti'].freeze,
+ ug: ['Uyghur', 'ئۇيغۇرچە'].freeze,
+ uk: ['Ukrainian', 'Українська'].freeze,
+ ur: ['Urdu', 'اردو'].freeze,
+ uz: ['Uzbek', 'Ўзбек'].freeze,
+ ve: ['Venda', 'Tshivenḓa'].freeze,
+ vi: ['Vietnamese', 'Tiếng Việt'].freeze,
+ vo: ['Volapük', 'Volapük'].freeze,
+ wa: ['Walloon', 'walon'].freeze,
+ wo: ['Wolof', 'Wollof'].freeze,
+ xh: ['Xhosa', 'isiXhosa'].freeze,
+ yi: ['Yiddish', 'ייִדיש'].freeze,
+ yo: ['Yoruba', 'Yorùbá'].freeze,
+ za: ['Zhuang', 'Saɯ cueŋƅ'].freeze,
+ zh: ['Chinese', '中文'].freeze,
+ zu: ['Zulu', 'isiZulu'].freeze,
+ }.freeze
+
+ ISO_639_3 = {
+ ast: ['Asturian', 'Asturianu'].freeze,
+ kab: ['Kabyle', 'Taqbaylit'].freeze,
+ kmr: ['Northern Kurdish', 'Kurmancî'].freeze,
+ zgh: ['Standard Moroccan Tamazight', 'ⵜⴰⵎⴰⵣⵉⵖⵜ'].freeze,
+ }.freeze
+
+ SUPPORTED_LOCALES = {}.merge(ISO_639_1).merge(ISO_639_3).freeze
+
+ # For ISO-639-1 and ISO-639-3 language codes, we have their official
+ # names, but for some translations, we need the names of the
+ # regional variants specifically
+ REGIONAL_LOCALE_NAMES = {
'es-AR': 'Español (Argentina)',
'es-MX': 'Español (México)',
- es: 'Español',
- et: 'Eesti',
- eu: 'Euskara',
- fa: 'فارسی',
- fi: 'Suomi',
- fr: 'Français',
- ga: 'Gaeilge',
- gd: 'Gàidhlig',
- gl: 'Galego',
- he: 'עברית',
- hi: 'हिन्दी',
- hr: 'Hrvatski',
- hu: 'Magyar',
- hy: 'Հայերեն',
- id: 'Bahasa Indonesia',
- io: 'Ido',
- is: 'Íslenska',
- it: 'Italiano',
- ja: '日本語',
- ka: 'ქართული',
- kab: 'Taqbaylit',
- kk: 'Қазақша',
- kmr: 'Kurmancî',
- kn: 'ಕನ್ನಡ',
- ko: '한국어',
- ku: 'سۆرانی',
- lt: 'Lietuvių',
- lv: 'Latviešu',
- mk: 'Македонски',
- ml: 'മലയാളം',
- mr: 'मराठी',
- ms: 'Bahasa Melayu',
- nl: 'Nederlands',
- nn: 'Nynorsk',
- no: 'Norsk',
- oc: 'Occitan',
- pl: 'Polski',
'pt-BR': 'Português (Brasil)',
'pt-PT': 'Português (Portugal)',
- pt: 'Português',
- ro: 'Română',
- ru: 'Русский',
- sa: 'संस्कृतम्',
- sc: 'Sardu',
- si: 'සිංහල',
- sk: 'Slovenčina',
- sl: 'Slovenščina',
- sq: 'Shqip',
'sr-Latn': 'Srpski (latinica)',
- sr: 'Српски',
- sv: 'Svenska',
- ta: 'தமிழ்',
- te: 'తెలుగు',
- th: 'ไทย',
- tr: 'Türkçe',
- uk: 'Українська',
- ur: 'اُردُو',
- vi: 'Tiếng Việt',
- zgh: 'ⵜⴰⵎⴰⵣⵉⵖⵜ',
'zh-CN': '简体中文',
'zh-HK': '繁體中文(香港)',
'zh-TW': '繁體中文(臺灣)',
- zh: '中文',
}.freeze
+ # Returns the display name for a locale code: the second element of the
+ # matching SUPPORTED_LOCALES entry, else the REGIONAL_LOCALE_NAMES value,
+ # else the code itself. A blank code or 'und' returns I18n.t('generic.none').
def human_locale(locale)
- if locale == 'und'
+ # Guard blank first: nil does not respond to to_sym, so a missing
+ # language would otherwise raise NoMethodError in the lookups below.
+ if locale.blank? || locale == 'und'
I18n.t('generic.none')
+ elsif (supported_locale = SUPPORTED_LOCALES[locale.to_sym])
+ supported_locale[1]
+ elsif (regional_locale = REGIONAL_LOCALE_NAMES[locale.to_sym])
+ regional_locale
else
- HUMAN_LOCALES[locale.to_sym] || locale
+ locale
end
end
+
+ # Reduces a locale string such as "en_US" or "ja-JP" to its leading
+ # language code and returns that code when it is a key of
+ # SUPPORTED_LOCALES; returns nil otherwise (including blank input).
+ def valid_locale_or_nil(str)
+ return if str.blank?
+
+ code, = str.to_s.split(/[_-]/) # Strip out the region from e.g. en_US or ja-JP
+
+ # Separator-only input such as "-" splits to an empty array, leaving
+ # code nil; bail out here instead of calling to_sym on nil.
+ return if code.blank?
+ return unless valid_locale?(code)
+
+ code
+ end
+
+ # True when locale (anything responding to to_sym) is a key of
+ # SUPPORTED_LOCALES. nil and empty values are rejected up front so
+ # that to_sym is never called on nil.
+ def valid_locale?(locale)
+ locale.present? && SUPPORTED_LOCALES.key?(locale.to_sym)
+ end
end
diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb
index 23739d1cd..3d5592867 100644
--- a/app/helpers/settings_helper.rb
+++ b/app/helpers/settings_helper.rb
@@ -2,7 +2,7 @@
module SettingsHelper
def filterable_languages
- LanguageDetector.instance.language_names.select(&LanguagesHelper::HUMAN_LOCALES.method(:key?))
+ LanguagesHelper::SUPPORTED_LOCALES.keys
end
def hash_to_object(hash)
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index 33998c477..ea8d146d4 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -112,7 +112,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
url: @status_parser.url || @status_parser.uri,
account: @account,
text: converted_object_type? ? converted_text : (@status_parser.text || ''),
- language: @status_parser.language || detected_language,
+ language: @status_parser.language,
spoiler_text: converted_object_type? ? '' : (@status_parser.spoiler_text || ''),
created_at: @status_parser.created_at,
edited_at: @status_parser.edited_at,
@@ -370,10 +370,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
Formatter.instance.linkify([@status_parser.title.presence, @status_parser.spoiler_text.presence, @status_parser.url || @status_parser.uri].compact.join("\n\n"))
end
- def detected_language
- LanguageDetector.instance.detect(@status_parser.text, @account) if supported_object_type?
- end
-
def unsupported_media_type?(mime_type)
mime_type.present? && !MediaAttachment.supported_mime_types.include?(mime_type)
end
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
deleted file mode 100644
index 40452eddc..000000000
--- a/app/lib/language_detector.rb
+++ /dev/null
@@ -1,101 +0,0 @@
-# frozen_string_literal: true
-
-class LanguageDetector
- include Singleton
-
- WORDS_THRESHOLD = 4
- RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\p{Thai}]+/m
-
- def initialize
- @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
- end
-
- def detect(text, account)
- input_text = prepare_text(text)
-
- return if input_text.blank?
-
- detect_language_code(input_text) || default_locale(account)
- end
-
- def language_names
- @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq
- end
-
- private
-
- def prepare_text(text)
- simplify_text(text).strip
- end
-
- def unreliable_input?(text)
- !reliable_input?(text)
- end
-
- def reliable_input?(text)
- sufficient_text_length?(text) || language_specific_character_set?(text)
- end
-
- def sufficient_text_length?(text)
- text.split(/\s+/).size >= WORDS_THRESHOLD
- end
-
- def language_specific_character_set?(text)
- words = text.scan(RELIABLE_CHARACTERS_RE)
-
- if words.present?
- words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size > 0.3
- else
- false
- end
- end
-
- def detect_language_code(text)
- return if unreliable_input?(text)
-
- result = @identifier.find_language(text)
-
- iso6391(result.language.to_s).to_sym if result&.reliable?
- end
-
- def iso6391(bcp47)
- iso639 = bcp47.split('-').first
-
- # CLD3 returns grandfathered language code for Hebrew
- return 'he' if iso639 == 'iw'
-
- ISO_639.find(iso639).alpha2
- end
-
- def simplify_text(text)
- new_text = remove_html(text)
- new_text.gsub!(FetchLinkCardService::URL_PATTERN, '\1')
- new_text.gsub!(Account::MENTION_RE, '')
- new_text.gsub!(Tag::HASHTAG_RE) { |string| string.gsub(/[#_]/, '#' => '', '_' => ' ').gsub(/[a-z][A-Z]|[a-zA-Z][\d]/) { |s| s.insert(1, ' ') }.downcase }
- new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
- new_text.gsub!(/\s+/, ' ')
- new_text
- end
-
- def new_scrubber
- scrubber = Rails::Html::PermitScrubber.new
- scrubber.tags = %w(br p)
- scrubber
- end
-
- def scrubber
- @scrubber ||= new_scrubber
- end
-
- def remove_html(text)
- text = Loofah.fragment(text).scrub!(scrubber).to_s
- text.gsub!('<br>', "\n")
- text.gsub!('</p><p>', "\n\n")
- text.gsub!(/(^<p>|<\/p>$)/, '')
- text
- end
-
- def default_locale(account)
- account.user_locale&.to_sym || I18n.default_locale if account.local?
- end
-end
diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb
index d2bcf0c25..fabbd244d 100644
--- a/app/lib/link_details_extractor.rb
+++ b/app/lib/link_details_extractor.rb
@@ -2,6 +2,7 @@
class LinkDetailsExtractor
include ActionView::Helpers::TagHelper
+ include LanguagesHelper
# Some publications wrap their JSON-LD data in their