From 3816943e6b5e86b22c35f3c068521f7a9007deec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=B5=E3=81=81=E3=81=BC=E5=8E=9F?= Date: Fri, 15 Sep 2017 01:03:20 +0900 Subject: [PATCH] Enable to recognize most kinds of characters as URL paths (#4941) --- app/lib/formatter.rb | 2 +- app/services/fetch_link_card_service.rb | 14 +++++-- config/initializers/twitter_regex.rb | 42 +++++++++++++++++++ spec/lib/formatter_spec.rb | 32 ++++++++++++++ spec/services/fetch_link_card_service_spec.rb | 11 +++++ 5 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 config/initializers/twitter_regex.rb diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index cacc0364..d9f843f4 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -131,7 +131,7 @@ class Formatter end def link_html(url) - url = Addressable::URI.parse(url).display_uri.to_s + url = Addressable::URI.parse(url).to_s prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s text = url[prefix.length, 30] suffix = url[prefix.length + 30..-1] diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb index 215c69fe..4acbfae7 100644 --- a/app/services/fetch_link_card_service.rb +++ b/app/services/fetch_link_card_service.rb @@ -1,9 +1,15 @@ # frozen_string_literal: true class FetchLinkCardService < BaseService - include ActionView::Helpers::TagHelper - - URL_PATTERN = %r{https?://\S+} + URL_PATTERN = %r{ + ( # $1 URL + (https?:\/\/)? # $2 Protocol (optional) + (#{Twitter::Regex[:valid_domain]}) # $3 Domain(s) + (?::(#{Twitter::Regex[:valid_port_number]}))? # $4 Port number (optional) + (/#{Twitter::Regex[:valid_url_path]}*)? # $5 URL Path and anchor + (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? # $6 Query String + ) + }iox def call(status) @status = status @@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService def parse_urls if @status.local? 
- urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize } + urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize } else html = Nokogiri::HTML(@status.text) links = html.css('a') diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb new file mode 100644 index 00000000..5a0723d2 --- /dev/null +++ b/config/initializers/twitter_regex.rb @@ -0,0 +1,42 @@ +module Twitter + class Regex + + REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou + REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou + REGEXEN[:valid_url_balanced_parens] = / + \( + (?: + #{REGEXEN[:valid_general_url_path_chars]}+ + | + # allow one nested level of balanced parentheses + (?: + #{REGEXEN[:valid_general_url_path_chars]}* + \( + #{REGEXEN[:valid_general_url_path_chars]}+ + \) + #{REGEXEN[:valid_general_url_path_chars]}* + ) + ) + \) + /iox + REGEXEN[:valid_url_path] = /(?: + (?: + #{REGEXEN[:valid_general_url_path_chars]}* + (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)* + #{REGEXEN[:valid_url_path_ending_chars]} + )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/) + )/iox + REGEXEN[:valid_url] = %r{ + ( # $1 total match + (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character + ( # $3 URL + (https?:\/\/)? # $4 Protocol (optional) + (#{REGEXEN[:valid_domain]}) # $5 Domain(s) + (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional) + (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor + (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? 
# $8 Query String + ) + ) + }iox + end +end diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index ab04ccba..f9b7efac 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -89,6 +89,38 @@ RSpec.describe Formatter do end end + context 'matches a URL with Japanese path string' do + let(:text) { 'https://ja.wikipedia.org/wiki/日本' } + + it 'has valid URL' do + is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"' + end + end + + context 'matches a URL with Korean path string' do + let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' } + + it 'has valid URL' do + is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"' + end + end + + context 'matches a URL with Simplified Chinese path string' do + let(:text) { 'https://baike.baidu.com/item/中华人民共和国' } + + it 'has valid URL' do + is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"' + end + end + + context 'matches a URL with Traditional Chinese path string' do + let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' } + + it 'has valid URL' do + is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"' + end + end + context 'contains HTML (script tag)' do let(:text) { '' } diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb index b0aa740a..ba61d22c 100644 --- a/spec/services/fetch_link_card_service_spec.rb +++ b/spec/services/fetch_link_card_service_spec.rb @@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt')) stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' }) stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt')) + stub_request(:head, 
'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' }) + stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt')) stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404) subject.call(status) @@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.") end end + + context do + let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') } + + it 'works with Japanese path string' do + expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once + expect(status.preview_cards.first.title).to eq("SJISのページ") + end + end end context 'in a remote status' do