title parse improvement

2020-01-29 11:13:34 +03:00 · 2020-01-29 11:13:34 +03:00 · 1f4fbe9d98
parent f1d5c0f079
commit 1f4fbe9d98
3 changed files with 244 additions and 1 deletion
--- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
+++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
@ -48,6 +48,6 @@ defp maybe_put_title(meta, html) when meta != %{} do
  defp maybe_put_title(meta, _), do: meta

  defp get_page_title(html) do
-    Floki.find(html, "title") |> List.first() |> Floki.text()
+    Floki.find(html, "html head title") |> List.first() |> Floki.text()
  end
 end
--- a/test/fixtures/nypd-facial-recognition-children-teenagers4.html
+++ b/test/fixtures/nypd-facial-recognition-children-teenagers4.html
--- a/test/web/rich_media/parsers/twitter_card_test.exs
+++ b/test/web/rich_media/parsers/twitter_card_test.exs
@ -85,4 +85,19 @@ test "respect only first title tag on the page" do
                image: image_path
              }}
  end
+
+  test "takes first founded title in html head if there is html markup error" do
+    html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
+
+    assert TwitterCard.parse(html, %{}) ==
+             {:ok,
+              %{
+                site: nil,
+                title:
+                  "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times",
+                "app:id:googleplay": "com.nytimes.android",
+                "app:name:googleplay": "NYTimes",
+                "app:url:googleplay": "nytimes://reader/id/100000006583622"
+              }}
+  end
 end