[epiphany] Update readability.js to the official mozilla version

From: Jan-Michael Brummer <jbrummer src gnome org>
To: commits-list gnome org
Cc:
Subject: [epiphany] Update readability.js to the official mozilla version
Date: Sun, 21 Apr 2019 17:07:27 +0000 (UTC)
commit b2b804715c4fd603568c2d5321e8e6fb040690d0
Author: Jan-Michael Brummer <jan brummer tabos org>
Date:   Fri Apr 19 14:47:45 2019 +0200

    Update readability.js to the official mozilla version
    
    As a side effect this also solves readability warnings, e.g. on apple.com
    
    Fixes: https://gitlab.gnome.org/GNOME/epiphany/issues/718

 embed/ephy-web-view.c                            |   2 +-
 src/resources/{readability.js => Readability.js} | 194 ++++++++++-------------
 src/resources/epiphany.gresource.xml             |   2 +-
 3 files changed, 89 insertions(+), 109 deletions(-)
---
diff --git a/embed/ephy-web-view.c b/embed/ephy-web-view.c
index 55675a2ef..a55e47490 100644
--- a/embed/ephy-web-view.c
+++ b/embed/ephy-web-view.c
@@ -1064,7 +1064,7 @@ run_readability_js_if_needed (gpointer data)
   /* Internal pages should never receive reader mode. */
   if (!ephy_embed_utils_is_no_show_address (web_view->address)) {
     webkit_web_view_run_javascript_from_gresource (WEBKIT_WEB_VIEW (web_view),
-                                                   "/org/gnome/epiphany/readability.js",
+                                                   "/org/gnome/epiphany/Readability.js",
                                                    NULL,
                                                    readability_js_finish_cb,
                                                    web_view);
diff --git a/src/resources/readability.js b/src/resources/Readability.js
similarity index 93%
rename from src/resources/readability.js
rename to src/resources/Readability.js
index 6206d6923..2c716b164 100644
--- a/src/resources/readability.js
+++ b/src/resources/Readability.js
@@ -1,3 +1,8 @@
+//////////////////////////////////////////////////////////////////////////
+// Warning: Epiphany changes at the bottom of the file.                 //
+// https://github.com/mozilla/readability distributed under Apache V2.0 //
+//////////////////////////////////////////////////////////////////////////
+
 /*eslint-env es6:false*/
 /*
  * Copyright (c) 2010 Arc90 Inc
@@ -39,6 +44,7 @@ function Readability(doc, options) {
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
+  this._articleSiteName = null;
   this._attempts = [];
 
   // Configurable options
@@ -111,10 +117,13 @@ Readability.prototype = {
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+    // NOTE: These two regular expressions are duplicated in
+    // Readability-readerable.js. Please keep both copies in sync.
+    unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
+
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby|p-author/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
@@ -320,7 +329,7 @@ Readability.prototype = {
       return uri;
     }
 
-    var links = articleContent.getElementsByTagName("a");
+    var links = this._getAllNodesWithTag(articleContent, ["a"]);
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
       if (href) {
@@ -335,7 +344,7 @@ Readability.prototype = {
       }
     });
 
-    var imgs = articleContent.getElementsByTagName("img");
+    var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
     this._forEachNode(imgs, function(img) {
       var src = img.getAttribute("src");
       if (src) {
@@ -408,7 +417,7 @@ Readability.prototype = {
         curTitle = this._getInnerText(hOnes[0]);
     }
 
-    curTitle = curTitle.trim();
+    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
     // If we now have 4 words or fewer as our title, and either no
     // 'hierarchical' separators (\, /, > or ») were found in the original
     // title or we decreased the number of words by more than 1 word, use
@@ -534,7 +543,16 @@ Readability.prototype = {
       replacement.readability = node.readability;
 
     for (var i = 0; i < node.attributes.length; i++) {
-      replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+      try {
+        replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+      } catch (ex) {
+        /* it's possible for setAttribute() to throw if the attribute name
+         * isn't a valid XML Name. Such attributes can however be parsed from
+         * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
+         * so we can hit them here and then throw. We don't care about such
+         * attributes so we ignore them.
+         */
+      }
     }
     return replacement;
   },
@@ -564,10 +582,15 @@ Readability.prototype = {
     this._clean(articleContent, "link");
     this._clean(articleContent, "aside");
 
-    // Clean out elements have "share" in their id/class combinations from final top candidates,
+    // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
-    this._forEachNode(articleContent.children, function(topCandidate) {
-      this._cleanMatchedNodes(topCandidate, /share/);
+
+    var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
+
+    this._forEachNode(articleContent.children, function (topCandidate) {
+      this._cleanMatchedNodes(topCandidate, function (node, matchString) {
+        return /share/.test(matchString) && node.textContent.length < shareElementThreshold;
+      });
     });
 
     // If there is only one h2 and its text content substantially equals article title,
@@ -718,9 +741,10 @@ Readability.prototype = {
 
     if (node.getAttribute !== undefined) {
       var rel = node.getAttribute("rel");
+      var itemprop = node.getAttribute("itemprop");
     }
 
-    if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
+    if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
       this._articleByline = node.textContent.trim();
       return true;
     }
@@ -789,6 +813,7 @@ Readability.prototype = {
         if (stripUnlikelyCandidates) {
           if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
+              !this._hasAncestorTag(node, "table") &&
               node.tagName !== "BODY" &&
               node.tagName !== "A") {
             this.log("Removing unlikely candidate - " + matchString);
@@ -1141,7 +1166,7 @@ Readability.prototype = {
           this._attempts.push({articleContent: articleContent, textLength: textLength});
           // No luck after removing flags, just return the longest text we found during the different loops
           this._attempts.sort(function (a, b) {
-            return a.textLength < b.textLength;
+            return b.textLength - a.textLength;
           });
 
           // But first check if we actually have something
@@ -1199,16 +1224,19 @@ Readability.prototype = {
     var metaElements = this._doc.getElementsByTagName("meta");
 
     // property is a space-separated list of values
-    var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi;
+    var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
 
     // name is a single value
-    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title)\s*$/i;
+    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
 
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
       var content = element.getAttribute("content");
+      if (!content) {
+        return;
+      }
       var matches = null;
       var name = null;
 
@@ -1262,6 +1290,9 @@ Readability.prototype = {
                        values["description"] ||
                        values["twitter:description"];
 
+    // get site name
+    metadata.siteName = values["og:site_name"];
+
     return metadata;
   },
 
@@ -1462,17 +1493,17 @@ Readability.prototype = {
     this._removeNodes(e.getElementsByTagName(tag), function(element) {
       // Allow youtube and vimeo videos through as people usually want to see those.
       if (isEmbed) {
-        var attributeValues = [].map.call(element.attributes, function(attr) {
-          return attr.value;
-        }).join("|");
-
         // First, check the elements attributes to see if any of them contain youtube or vimeo
-        if (this.REGEXPS.videos.test(attributeValues))
-          return false;
+        for (var i = 0; i < element.attributes.length; i++) {
+          if (this.REGEXPS.videos.test(element.attributes[i].value)) {
+            return false;
+          }
+        }
 
-        // Then check the elements inside this element for the same.
-        if (this.REGEXPS.videos.test(element.innerHTML))
+        // For embed with <object> tag, check inner HTML as well.
+        if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
           return false;
+        }
       }
 
       return true;
@@ -1608,11 +1639,16 @@ Readability.prototype = {
     //
     // TODO: Consider taking into account original contentScore here.
     this._removeNodes(e.getElementsByTagName(tag), function(node) {
-      // First check if we're in a data table, in which case don't remove us.
+      // First check if this node IS data table, in which case don't remove it.
       var isDataTable = function(t) {
         return t._readabilityDataTable;
       };
 
+      if (tag === "table" && isDataTable(node)) {
+        return false;
+      }
+
+      // Next check if we're inside a data table, in which case don't remove it as well.
       if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
         return false;
       }
@@ -1636,10 +1672,25 @@ Readability.prototype = {
         var input = node.getElementsByTagName("input").length;
 
         var embedCount = 0;
-        var embeds = node.getElementsByTagName("embed");
-        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
-          if (!this.REGEXPS.videos.test(embeds[ei].src))
-            embedCount += 1;
+        var embeds = this._concatNodeLists(
+          node.getElementsByTagName("object"),
+          node.getElementsByTagName("embed"),
+          node.getElementsByTagName("iframe"));
+
+        for (var i = 0; i < embeds.length; i++) {
+          // If this embed has attribute that matches video regex, don't delete it.
+          for (var j = 0; j < embeds[i].attributes.length; j++) {
+            if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
+              return false;
+            }
+          }
+
+          // For embed with <object> tag, check inner HTML as well.
+          if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
+            return false;
+          }
+
+          embedCount++;
         }
 
         var linkDensity = this._getLinkDensity(node);
@@ -1660,17 +1711,17 @@ Readability.prototype = {
   },
 
   /**
-   * Clean out elements whose id/class combinations match specific string.
+   * Clean out elements that match the specified conditions
    *
    * @param Element
-   * @param RegExp match id/class combination.
+   * @param Function determines whether a node should be removed
    * @return void
    **/
-  _cleanMatchedNodes: function(e, regex) {
+  _cleanMatchedNodes: function(e, filter) {
     var endOfSearchMarkerNode = this._getNextNode(e, true);
     var next = this._getNextNode(e);
     while (next && next != endOfSearchMarkerNode) {
-      if (regex.test(next.className + " " + next.id)) {
+      if (filter(next, next.className + " " + next.id)) {
         next = this._removeAndGetNext(next);
       } else {
         next = this._getNextNode(next);
@@ -1701,66 +1752,7 @@ Readability.prototype = {
   },
 
   _isProbablyVisible: function(node) {
-    return node.style.display != "none" && !node.hasAttribute("hidden");
-  },
-
-  /**
-   * Decides whether or not the document is reader-able without parsing the whole thing.
-   *
-   * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
-   */
-  isProbablyReaderable: function(helperIsVisible) {
-    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
-
-    // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
-    // Some articles' DOM structures might look like
-    // <div>
-    //   Sentences<br>
-    //   <br>
-    //   Sentences<br>
-    // </div>
-    var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
-    if (brNodes.length) {
-      var set = new Set();
-      [].forEach.call(brNodes, function(node) {
-        set.add(node.parentNode);
-      });
-      nodes = [].concat.apply(Array.from(set), nodes);
-    }
-
-    if (!helperIsVisible) {
-      helperIsVisible = this._isProbablyVisible;
-    }
-
-    var score = 0;
-    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
-    // this callback:
-    return this._someNode(nodes, function(node) {
-      if (helperIsVisible && !helperIsVisible(node))
-        return false;
-      var matchString = node.className + " " + node.id;
-
-      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
-          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
-        return false;
-      }
-
-      if (node.matches && node.matches("li p")) {
-        return false;
-      }
-
-      var textContentLength = node.textContent.trim().length;
-      if (textContentLength < 140) {
-        return false;
-      }
-
-      score += Math.sqrt(textContentLength - 140);
-
-      if (score > 20) {
-        return true;
-      }
-      return false;
-    });
+    return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
   },
 
   /**
@@ -1819,6 +1811,7 @@ Readability.prototype = {
       textContent: textContent,
       length: textContent.length,
       excerpt: metadata.excerpt,
+      siteName: metadata.siteName || this._articleSiteName
     };
   }
 };
@@ -1827,20 +1820,7 @@ if (typeof module === "object") {
   module.exports = Readability;
 }
 
-var loc = document.location;
-var uri = {
-  spec: loc.href,
-  host: loc.host,
-  prePath: loc.protocol + "//" + loc.host,
-  scheme: loc.protocol.substr(0, loc.protocol.indexOf(":")),
-  pathBase: loc.protocol + "//" + loc.host + loc.pathname.substr(0, loc.pathname.lastIndexOf("/") + 1)
-};
-
-if (typeof document !== 'undefined') {
-    var documentClone = document.cloneNode(true);
-    reader = new Readability(uri, documentClone);
-    article = reader.parse();
-    var previous_title = document.title;
-    document.title=previous_title;
-    article
-}
+// Added for Epiphany
+var documentClone = document.cloneNode(true);
+reader = new Readability(documentClone);
+reader.parse();
diff --git a/src/resources/epiphany.gresource.xml b/src/resources/epiphany.gresource.xml
index 6a36df47f..e030ffe38 100644
--- a/src/resources/epiphany.gresource.xml
+++ b/src/resources/epiphany.gresource.xml
@@ -35,7 +35,7 @@
     <file preprocess="xml-stripblanks" compressed="true">gtk/shortcuts-dialog.ui</file>
     <file preprocess="xml-stripblanks" compressed="true">gtk/tab-label.ui</file>
     <file preprocess="xml-stripblanks" compressed="true">gtk/webapp-additional-urls-dialog.ui</file>
-    <file compressed="true">readability.js</file>
+    <file compressed="true">Readability.js</file>
     <file compressed="true">reader.css</file>
     <file compressed="true">fonts/Merriweather-Regular.otf</file>
     <file compressed="true">fonts/Merriweather-Bold.otf</file>
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]