From d010e270e613f6299397601289158bd2acedbe8e Mon Sep 17 00:00:00 2001
From: Matt Jankowski
Date: Thu, 1 Jun 2017 09:29:14 -0400
Subject: [PATCH] Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector

* Remove usernames and hashtags from text before language detection

* Handle multiple instances of special case, and reduce whitespace
---
 app/lib/language_detector.rb       | 11 +++++++--
 spec/lib/language_detector_spec.rb | 38 ++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index 1c22a9cc..0d087f8d 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -13,6 +13,10 @@ class LanguageDetector
     detected_language_code || default_locale.to_sym
   end
 
+  def prepared_text
+    simplified_text.strip
+  end
+
   private
 
   def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
   end
 
   def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
   end
 
   def detected_language_reliable?
     result.reliable?
   end
 
-  def text_without_urls
+  def simplified_text
     text.dup.tap do |new_text|
       URI.extract(new_text).each do |url|
         new_text.gsub!(url, '')
       end
+      new_text.gsub!(Account::MENTION_RE, '')
+      new_text.gsub!(Tag::HASHTAG_RE, '')
+      new_text.gsub!(/\s+/, ' ')
     end
   end
 
diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb
index e543edd4..ace7a326 100644
--- a/spec/lib/language_detector_spec.rb
+++ b/spec/lib/language_detector_spec.rb
@@ -1,7 +1,45 @@
 # frozen_string_literal: true
+
 require 'rails_helper'
 
 describe LanguageDetector do
+  describe 'prepared_text' do
+    it 'returns unmodified string without special cases' do
+      string = 'just a regular string'
+      result = described_class.new(string).prepared_text
+
+      expect(result).to eq string
+    end
+
+    it 'collapses spacing in strings' do
+      string = 'The formatting in this is very odd'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'The formatting in this is very odd'
+    end
+
+    it 'strips usernames from strings before detection' do
+      string = '@username Yeah, very surreal...! also @friend'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Yeah, very surreal...! also'
+    end
+
+    it 'strips URLs from strings before detection' do
+      string = 'Our website is https://example.com and also http://localhost.dev'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Our website is and also'
+    end
+
+    it 'strips #hashtags from strings before detection' do
+      string = 'Hey look at all the #animals and #fish'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Hey look at all the and'
+    end
+  end
+
   describe 'to_iso_s' do
     it 'detects english language for basic strings' do
       strings = [