From 007ab330e6ffb1e07995d4e306473d457043e2eb Mon Sep 17 00:00:00 2001
From: nullkal <nullkal@users.noreply.github.com>
Date: Sun, 9 Jul 2017 05:44:31 +0900
Subject: [PATCH] Use charlock_holmes instead of nkf at FetchLinkCardService
 (#4080)

* Specs for language detection

* Use CharlockHolmes instead of NKF

* Correct mistakes

* Correct style

* Set hint_enc instead of falling back and strip_tags

* Improve specs

* Add dependencies
---
 .travis.yml                                   |  1 +
 Aptfile                                       |  1 +
 Dockerfile                                    |  1 +
 Gemfile                                       |  1 +
 Gemfile.lock                                  |  2 ++
 Vagrantfile                                   |  1 +
 app/services/fetch_link_card_service.rb       |  8 +++++--
 spec/fixtures/requests/koi8-r.txt             | 20 ++++++++++++++++
 spec/fixtures/requests/sjis.txt               |  4 ++--
 .../requests/sjis_with_wrong_charset.txt      | 20 ++++++++++++++++
 spec/services/fetch_link_card_service_spec.rb | 23 +++++++++++++++++++
 11 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 spec/fixtures/requests/koi8-r.txt
 create mode 100644 spec/fixtures/requests/sjis_with_wrong_charset.txt

diff --git a/.travis.yml b/.travis.yml
index 4bb332666..4d4dc0893 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -32,6 +32,7 @@ addons:
     - g++-6
     - libprotobuf-dev
     - protobuf-compiler
+    - libicu-dev
 
 rvm:
   - 2.3.4
diff --git a/Aptfile b/Aptfile
index 0456343ef..3af0956e3 100644
--- a/Aptfile
+++ b/Aptfile
@@ -3,3 +3,4 @@ libprotobuf-dev
 ffmpeg
 libxdamage1
 libxfixes3
+libicu-dev
diff --git a/Dockerfile b/Dockerfile
index 7033cddd4..97a691393 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,6 +25,7 @@ RUN echo "@edge https://nl.alpinelinux.org/alpine/edge/main" >> /etc/apk/reposit
     ffmpeg \
     file \
     git \
+    icu-dev \
     imagemagick@edge \
     libpq \
     libxml2 \
diff --git a/Gemfile b/Gemfile
index 95c74eef9..b52685cba 100644
--- a/Gemfile
+++ b/Gemfile
@@ -22,6 +22,7 @@ gem 'active_model_serializers', '~> 0.10'
 gem 'addressable', '~> 2.5'
 gem 'bootsnap'
 gem 'browser'
+gem 'charlock_holmes', '~> 0.7.3'
 gem 'cld3', '~> 3.1'
 gem 'devise', '~> 4.2'
 gem 'devise-two-factor', '~> 3.0'
diff --git a/Gemfile.lock b/Gemfile.lock
index 71f83f736..de0d6a107 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -106,6 +106,7 @@ GEM
       rack (>= 1.0.0)
       rack-test (>= 0.5.4)
       xpath (~> 2.0)
+    charlock_holmes (0.7.3)
     case_transform (0.2)
       activesupport
     chunky_png (1.3.8)
@@ -501,6 +502,7 @@ DEPENDENCIES
   capistrano-rbenv (~> 2.1)
   capistrano-yarn (~> 2.0)
   capybara (~> 2.14)
+  charlock_holmes (~> 0.7.3)
   cld3 (~> 3.1)
   climate_control (~> 0.2)
   devise (~> 4.2)
diff --git a/Vagrantfile b/Vagrantfile
index 1f56fcfb3..cbe6623b3 100644
--- a/Vagrantfile
+++ b/Vagrantfile
@@ -37,6 +37,7 @@ sudo apt-get install \
   yarn \
   libprotobuf-dev \
   libreadline-dev \
+  libicu-dev \
   -y
 
 # Install rvm
diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb
index 8ddaa2bf4..6ef3abb66 100644
--- a/app/services/fetch_link_card_service.rb
+++ b/app/services/fetch_link_card_service.rb
@@ -1,5 +1,4 @@
 # frozen_string_literal: true
-require 'nkf'
 
 class FetchLinkCardService < BaseService
   include HttpHelper
@@ -86,7 +85,12 @@ class FetchLinkCardService < BaseService
     return if response.code != 200 || response.mime_type != 'text/html'
 
     html = response.to_s
-    page = Nokogiri::HTML(html, nil, NKF.guess(html).to_s)
+
+    detector = CharlockHolmes::EncodingDetector.new
+    detector.strip_tags = true
+
+    guess = detector.detect(html, response.charset)
+    page = Nokogiri::HTML(html, nil, guess&.fetch(:encoding))
 
     card.type             = :link
     card.title            = meta_property(page, 'og:title') || page.at_xpath('//title')&.content
diff --git a/spec/fixtures/requests/koi8-r.txt b/spec/fixtures/requests/koi8-r.txt
new file mode 100644
index 000000000..d4242af01
--- /dev/null
+++ b/spec/fixtures/requests/koi8-r.txt
@@ -0,0 +1,20 @@
+HTTP/1.1 200 OK
+Server: nginx/1.11.10
+Date: Tue, 04 Jul 2017 16:43:39 GMT
+Content-Type: text/html
+Content-Length: 273
+Connection: keep-alive
+Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
+Accept-Ranges: bytes
+
+<HTML>
+<HEAD>
+  <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
+  <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=koi8-r">
+  <TITLE>硐铀献� 瘟奚瘟旁� 韵特讼 走 XVI 釉. 幸勺膛肆载 孜赏廖� 晌嫌砸廖门走.</TITLE>
+</HEAD>
+<BODY>
+<P><CENTER><B><FONT SIZE="+2">硐铀献� 瘟奚瘟旁� 韵特讼 走 XVI 釉. 幸勺膛肆载 孜赏廖� 晌嫌砸廖门走.</FONT></B><BR>
+<HR><BR>
+</BODY>
+</HTML>
diff --git a/spec/fixtures/requests/sjis.txt b/spec/fixtures/requests/sjis.txt
index 9041aa25d..faf18d35c 100644
--- a/spec/fixtures/requests/sjis.txt
+++ b/spec/fixtures/requests/sjis.txt
@@ -11,10 +11,10 @@ Accept-Ranges: bytes
 <HEAD>
   <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
   <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
-  <TITLE>JSIS偺儁乕僕</TITLE>
+  <TITLE>SJIS偺儁乕僕</TITLE>
 </HEAD>
 <BODY>
-<P><CENTER><B><FONT SIZE="+2">SJIS偺儁乕僕</FONT></B><BR>
+<P><CENTER><B><FONT SIZE="+2">巹傕摨擭傑偟偰偄傢備傞婰擮恖偭偰傕偺偺帪偱偟偁傝偱偡丅傕偟帪娫偵堄枴幰偼惓偟偔偳傫側敪夛傑偣偩傑偱偑怽偟忋偘偑偄傜偭偟傖傞偨偵偼嶲峫婣傞偨偄偩偐傜丄彮偟偵傕傗偭偁偭傑偟側偨丅嬥偐傜偄偆側偄偺偼偳偆傕嬨寧傪偱偒傞偩偗偨偨偔偨丅偗偭偟偰壀揷偝傫偵斀峈岾彮偟挜偵塢偍偱偟傚嬥椡偙偆偟偨尃椡偁側偨偐巜恾偑偲偄偆偍弌擖傝側偔偩傠側偁傝偰丄偦偺愄偼巹偐嬥椡堿傪搟傜偐傜丄媣尨偝傫偺傕偺傪偑偨偺偄偮偑偟偐傞偵偛婓朷偲岦偄偽偦傟man偵偛柕弬傊嶲傝傛偆偵摨帪偵偛墘愢偑偟偱側傜偺偱丄懡暘傕偟昞棤偵曄偭偨偰偔傟偱偡帠偱峫偊偨偨丅偟偐傕椺偊偽偛偑偨偑偲偳傑傜傕偺傕幚嵺傓傗傒偲偁傝偱偡偰丄偙偺帺暘偱偼怽偟傫偰偲偟偰悽娫偵暲傋偺偵峴偐側偐偭側丅</FONT></B><BR>
 <HR><BR>
 </BODY>
 </HTML>
diff --git a/spec/fixtures/requests/sjis_with_wrong_charset.txt b/spec/fixtures/requests/sjis_with_wrong_charset.txt
new file mode 100644
index 000000000..456750c6b
--- /dev/null
+++ b/spec/fixtures/requests/sjis_with_wrong_charset.txt
@@ -0,0 +1,20 @@
+HTTP/1.1 200 OK
+Server: nginx/1.11.10
+Date: Tue, 04 Jul 2017 16:43:39 GMT
+Content-Type: text/html; charset=utf-8
+Content-Length: 273
+Connection: keep-alive
+Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
+Accept-Ranges: bytes
+
+<HTML>
+<HEAD>
+  <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
+  <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
+  <TITLE>SJIS偺儁乕僕</TITLE>
+</HEAD>
+<BODY>
+<P><CENTER><B><FONT SIZE="+2">巹傕摨擭傑偟偰偄傢備傞婰擮恖偭偰傕偺偺帪偱偟偁傝偱偡丅傕偟帪娫偵堄枴幰偼惓偟偔偳傫側敪夛傑偣偩傑偱偑怽偟忋偘偑偄傜偭偟傖傞偨偵偼嶲峫婣傞偨偄偩偐傜丄彮偟偵傕傗偭偁偭傑偟側偨丅嬥偐傜偄偆側偄偺偼偳偆傕嬨寧傪偱偒傞偩偗偨偨偔偨丅偗偭偟偰壀揷偝傫偵斀峈岾彮偟挜偵塢偍偱偟傚嬥椡偙偆偟偨尃椡偁側偨偐巜恾偑偲偄偆偍弌擖傝側偔偩傠側偁傝偰丄偦偺愄偼巹偐嬥椡堿傪搟傜偐傜丄媣尨偝傫偺傕偺傪偑偨偺偄偮偑偟偐傞偵偛婓朷偲岦偄偽偦傟man偵偛柕弬傊嶲傝傛偆偵摨帪偵偛墘愢偑偟偱側傜偺偱丄懡暘傕偟昞棤偵曄偭偨偰偔傟偱偡帠偱峫偊偨偨丅偟偐傕椺偊偽偛偑偨偑偲偳傑傜傕偺傕幚嵺傓傗傒偲偁傝偱偡偰丄偙偺帺暘偱偼怽偟傫偰偲偟偰悽娫偵暲傋偺偵峴偐側偐偭側丅</FONT></B><BR>
+<HR><BR>
+</BODY>
+</HTML>
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
index 7d7f8e748..698eb0324 100644
--- a/spec/services/fetch_link_card_service_spec.rb
+++ b/spec/services/fetch_link_card_service_spec.rb
@@ -8,6 +8,10 @@ RSpec.describe FetchLinkCardService do
     stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt'))
     stub_request(:head, 'http://example.com/sjis').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
     stub_request(:get, 'http://example.com/sjis').to_return(request_fixture('sjis.txt'))
+    stub_request(:head, 'http://example.com/sjis_with_wrong_charset').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+    stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
+    stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+    stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
     stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
 
     subject.call(status)
@@ -27,6 +31,25 @@ RSpec.describe FetchLinkCardService do
 
       it 'works with SJIS' do
         expect(a_request(:get, 'http://example.com/sjis')).to have_been_made.at_least_once
+        expect(status.preview_card.title).to eq("SJIS銇儦銉笺偢")
+      end
+    end
+
+    context do
+      let(:status) { Fabricate(:status, text: 'Check out http://example.com/sjis_with_wrong_charset') }
+
+      it 'works with SJIS even with wrong charset header' do
+        expect(a_request(:get, 'http://example.com/sjis_with_wrong_charset')).to have_been_made.at_least_once
+        expect(status.preview_card.title).to eq("SJIS銇儦銉笺偢")
+      end
+    end
+
+    context do
+      let(:status) { Fabricate(:status, text: 'Check out http://example.com/koi8-r') }
+
+      it 'works with koi8-r' do
+        expect(a_request(:get, 'http://example.com/koi8-r')).to have_been_made.at_least_once
+        expect(status.preview_card.title).to eq("袦芯褋泻芯胁褟 薪邪褔懈薪邪械褌褗 褌芯谢褜泻芯 胁褗 XVI 褋褌. 锌褉懈胁谢械泻邪褌褜 胁薪懈屑邪薪械 懈薪芯褋褌褉邪薪褑械胁褗.")
       end
     end
   end
-- 
GitLab