[epiphany] Update Readability.js

From: Jan-Michael Brummer <jbrummer src gnome org>
To: commits-list gnome org
Cc:
Subject: [epiphany] Update Readability.js
Date: Thu, 4 Jun 2020 18:06:25 +0000 (UTC)
commit 2683bba39a3033cf90a82c558588fa08a00801d4
Author: Jan-Michael Brummer <jan brummer tabos org>
Date:   Thu Jun 4 19:06:56 2020 +0200

    Update Readability.js
    
    Update Readability.js to the latest revision and move it to third-party
    with its own README.epiphany.

 embed/ephy-web-view.c                              |   4 +-
 embed/meson.build                                  |   2 +-
 src/meson.build                                    |   3 +-
 src/resources/epiphany.gresource.xml               |   2 -
 tests/meson.build                                  |   1 +
 third-party/meson.build                            |   7 +
 third-party/readability/README.epiphany            |  18 ++
 .../readability}/Readability.js                    | 322 ++++++++++++++++++---
 third-party/readability/readability.gresource.xml  |   7 +
 .../readability}/reader.css                        |   0
 10 files changed, 324 insertions(+), 42 deletions(-)
---
diff --git a/embed/ephy-web-view.c b/embed/ephy-web-view.c
index 8132ac011..6ebcad0b5 100644
--- a/embed/ephy-web-view.c
+++ b/embed/ephy-web-view.c
@@ -731,7 +731,7 @@ run_readability_js_if_needed (gpointer data)
   /* Internal pages should never receive reader mode. */
   if (!ephy_embed_utils_is_no_show_address (web_view->address)) {
     webkit_web_view_run_javascript_from_gresource (WEBKIT_WEB_VIEW (web_view),
-                                                   "/org/gnome/epiphany/Readability.js",
+                                                   "/org/gnome/epiphany/readability/Readability.js",
                                                    web_view->cancellable,
                                                    readability_js_finish_cb,
                                                    web_view);
@@ -3454,7 +3454,7 @@ ephy_web_view_toggle_reader_mode (EphyWebView *view,
 
   view->reader_url = g_strdup (ephy_web_view_get_address (view));
   html = g_string_new ("");
-  style_css = g_resources_lookup_data ("/org/gnome/epiphany/reader.css", G_RESOURCE_LOOKUP_FLAGS_NONE, NULL);
+  style_css = g_resources_lookup_data ("/org/gnome/epiphany/readability/reader.css", G_RESOURCE_LOOKUP_FLAGS_NONE, NULL);
   title = webkit_web_view_get_title (web_view);
   font_style = enum_nick (EPHY_TYPE_PREFS_READER_FONT_STYLE,
                           g_settings_get_enum (EPHY_SETTINGS_READER,
diff --git a/embed/meson.build b/embed/meson.build
index 917334c05..d78c261c4 100644
--- a/embed/meson.build
+++ b/embed/meson.build
@@ -43,7 +43,7 @@ libephyembed_deps = [
   libsecret_dep,
   libsoup_dep,
   m_dep,
-  webkit2gtk_dep,
+  webkit2gtk_dep
 ]
 
 libephyembed_includes = include_directories(
diff --git a/src/meson.build b/src/meson.build
index 50234bf6f..03999ca59 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -136,7 +136,8 @@ epiphany_sources = [
   'ephy-main.c',
   resources,
   pdfjs_resources,
-  highlightjs_resources
+  highlightjs_resources,
+  readability_resources,
 ]
 
 executable('epiphany',
diff --git a/src/resources/epiphany.gresource.xml b/src/resources/epiphany.gresource.xml
index de5e2b5dc..055015b80 100644
--- a/src/resources/epiphany.gresource.xml
+++ b/src/resources/epiphany.gresource.xml
@@ -42,8 +42,6 @@
     <file preprocess="xml-stripblanks" compressed="true">gtk/shortcuts-dialog.ui</file>
     <file preprocess="xml-stripblanks" compressed="true">gtk/tab-label.ui</file>
     <file preprocess="xml-stripblanks" compressed="true">gtk/webapp-additional-urls-dialog.ui</file>
-    <file compressed="true">Readability.js</file>
-    <file compressed="true">reader.css</file>
   </gresource>
   <gresource prefix="/org/gnome/Epiphany/icons">
     <file compressed="true" alias="scalable/actions/ephy-download-symbolic.svg" preprocess="xml-stripblanks">ephy-download-symbolic.svg</file>
diff --git a/tests/meson.build b/tests/meson.build
index d879d6aea..00b8f5d14 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -178,6 +178,7 @@ if get_option('unit_tests').enabled()
   web_view_test = executable('test-ephy-web-view',
     'ephy-web-view-test.c',
     resources,
+    readability_resources,
     dependencies: ephymain_dep
   )
   test('Web view test',
diff --git a/third-party/meson.build b/third-party/meson.build
index 72027a07f..43bd89311 100644
--- a/third-party/meson.build
+++ b/third-party/meson.build
@@ -12,6 +12,13 @@ highlightjs_resources = gnome.compile_resources('highlightjs-resources',
     source_dir: 'highlightjs'
 )
 
+readability_resource_files = files('readability/readability.gresource.xml')
+readability_resources = gnome.compile_resources('readability-resources',
+    readability_resource_files,
+    c_name: 'readability',
+    source_dir: 'readability'
+)
+
 libgvdb_sources = [
   'gvdb/gvdb-builder.c',
   'gvdb/gvdb-reader.c'
diff --git a/third-party/readability/README.epiphany b/third-party/readability/README.epiphany
new file mode 100644
index 000000000..731426a81
--- /dev/null
+++ b/third-party/readability/README.epiphany
@@ -0,0 +1,18 @@
+# Embedded readability mode based on Readability.js
+
+This directory contains an official readability.js release version, distributed at: https://github.com/mozilla/readability
+
+## Update process
+
+$ wget https://raw.githubusercontent.com/mozilla/readability/master/Readability.js
+
+Copy Readability.js to <epiphany-source>/third-party/readability/
+
+# Added the following to the bottom of the js file:
+
+// Added for Epiphany
+var documentClone = document.cloneNode(true);
+reader = new Readability(documentClone);
+reader.parse();
+
+# Documentation created by Jan-Michael Brummer <jan brummer tabos org>
\ No newline at end of file
diff --git a/src/resources/Readability.js b/third-party/readability/Readability.js
similarity index 84%
rename from src/resources/Readability.js
rename to third-party/readability/Readability.js
index 2c716b164..8af4f072e 100644
--- a/src/resources/Readability.js
+++ b/third-party/readability/Readability.js
@@ -1,8 +1,3 @@
-//////////////////////////////////////////////////////////////////////////
-// Warning: Epiphany changes at the bottom of the file.                 //
-// https://github.com/mozilla/readability distributed under Apache V2.0 //
-//////////////////////////////////////////////////////////////////////////
-
 /*eslint-env es6:false*/
 /*
  * Copyright (c) 2010 Arc90 Inc
@@ -41,6 +36,7 @@ function Readability(doc, options) {
   options = options || {};
 
   this._doc = doc;
+  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
@@ -53,6 +49,7 @@ function Readability(doc, options) {
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
+  this._keepClasses = !!options.keepClasses;
 
   // Start with all flags set
   this._flags = this.FLAG_STRIP_UNLIKELYS |
@@ -119,8 +116,8 @@ Readability.prototype = {
   REGEXPS: {
     // NOTE: These two regular expressions are duplicated in
     // Readability-readerable.js. Please keep both copies in sync.
-    unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
-    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
+    unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+    okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
 
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -129,10 +126,13 @@ Readability.prototype = {
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
+    shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
+    srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
+    b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
@@ -157,6 +157,15 @@ Readability.prototype = {
   // These are the classes that readability sets itself.
   CLASSES_TO_PRESERVE: [ "page" ],
 
+  // This is the list of HTML entities that need to be unescaped.
+  HTML_ESCAPE_MAP: {
+    "lt": "<",
+    "gt": ">",
+    "amp": "&",
+    "quot": '"',
+    "apos": "'",
+  },
+
   /**
    * Run any post-process modifications to article content as necessary.
    *
@@ -167,8 +176,10 @@ Readability.prototype = {
     // Readability cannot open relative uris so we convert them to absolute uris.
     this._fixRelativeUris(articleContent);
 
-    // Remove classes.
-    this._cleanClasses(articleContent);
+    if (!this._keepClasses) {
+      // Remove classes.
+      this._cleanClasses(articleContent);
+    }
   },
 
   /**
@@ -182,6 +193,10 @@ Readability.prototype = {
    * @return void
    */
   _removeNodes: function(nodeList, filterFn) {
+    // Avoid ever operating on live node lists.
+    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
+      throw new Error("Do not pass live node lists to _removeNodes");
+    }
     for (var i = nodeList.length - 1; i >= 0; i--) {
       var node = nodeList[i];
       var parentNode = node.parentNode;
@@ -201,6 +216,10 @@ Readability.prototype = {
    * @return void
    */
   _replaceNodeTags: function(nodeList, newTagName) {
+    // Avoid ever operating on live node lists.
+    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
+      throw new Error("Do not pass live node lists to _replaceNodeTags");
+    }
     for (var i = nodeList.length - 1; i >= 0; i--) {
       var node = nodeList[i];
       this._setNodeTag(node, newTagName);
@@ -320,6 +339,7 @@ Readability.prototype = {
       if (baseURI == documentURI && uri.charAt(0) == "#") {
         return uri;
       }
+
       // Otherwise, resolve against base URI:
       try {
         return new URL(uri, baseURI).href;
@@ -333,22 +353,50 @@ Readability.prototype = {
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
       if (href) {
-        // Replace links with javascript: URIs with text content, since
+        // Remove links with javascript: URIs, since
         // they won't work after scripts have been removed from the page.
         if (href.indexOf("javascript:") === 0) {
-          var text = this._doc.createTextNode(link.textContent);
-          link.parentNode.replaceChild(text, link);
+          // if the link only contains simple text content, it can be converted to a text node
+          if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
+            var text = this._doc.createTextNode(link.textContent);
+            link.parentNode.replaceChild(text, link);
+          } else {
+            // if the link has multiple children, they should all be preserved
+            var container = this._doc.createElement("span");
+            while (link.childNodes.length > 0) {
+              container.appendChild(link.childNodes[0]);
+            }
+            link.parentNode.replaceChild(container, link);
+          }
         } else {
           link.setAttribute("href", toAbsoluteURI(href));
         }
       }
     });
 
-    var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
-    this._forEachNode(imgs, function(img) {
-      var src = img.getAttribute("src");
+    var medias = this._getAllNodesWithTag(articleContent, [
+      "img", "picture", "figure", "video", "audio", "source"
+    ]);
+
+    this._forEachNode(medias, function(media) {
+      var src = media.getAttribute("src");
+      var poster = media.getAttribute("poster");
+      var srcset = media.getAttribute("srcset");
+
       if (src) {
-        img.setAttribute("src", toAbsoluteURI(src));
+        media.setAttribute("src", toAbsoluteURI(src));
+      }
+
+      if (poster) {
+        media.setAttribute("poster", toAbsoluteURI(poster));
+      }
+
+      if (srcset) {
+        var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
+          return toAbsoluteURI(p1) + (p2 || "") + p3;
+        });
+
+        media.setAttribute("srcset", newSrcset);
       }
     });
   },
@@ -442,13 +490,13 @@ Readability.prototype = {
     var doc = this._doc;
 
     // Remove all style tags in head
-    this._removeNodes(doc.getElementsByTagName("style"));
+    this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
 
     if (doc.body) {
       this._replaceBrs(doc.body);
     }
 
-    this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
+    this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
   },
 
   /**
@@ -528,7 +576,7 @@ Readability.prototype = {
 
   _setNodeTag: function (node, tag) {
     this.log("_setNodeTag", node, tag);
-    if (node.__JSDOMParser__) {
+    if (this._docJSDOMParser) {
       node.localName = tag.toLowerCase();
       node.tagName = tag.toUpperCase();
       return node;
@@ -572,6 +620,8 @@ Readability.prototype = {
     // visually linked to other content-ful elements (text, images, etc.).
     this._markDataTables(articleContent);
 
+    this._fixLazyImages(articleContent);
+
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._cleanConditionally(articleContent, "fieldset");
@@ -589,7 +639,7 @@ Readability.prototype = {
 
     this._forEachNode(articleContent.children, function (topCandidate) {
       this._cleanMatchedNodes(topCandidate, function (node, matchString) {
-        return /share/.test(matchString) && node.textContent.length < shareElementThreshold;
+        return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
       });
     });
 
@@ -626,7 +676,7 @@ Readability.prototype = {
     this._cleanConditionally(articleContent, "div");
 
     // Remove extra paragraphs
-    this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
+    this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
       var imgCount = paragraph.getElementsByTagName("img").length;
       var embedCount = paragraph.getElementsByTagName("embed").length;
       var objectCount = paragraph.getElementsByTagName("object").length;
@@ -820,6 +870,12 @@ Readability.prototype = {
             node = this._removeAndGetNext(node);
             continue;
           }
+
+          if (node.getAttribute("role") == "complementary") {
+            this.log("Removing complementary content - " + matchString);
+            node = this._removeAndGetNext(node);
+            continue;
+          }
         }
 
         // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
@@ -1213,6 +1269,26 @@ Readability.prototype = {
     return false;
   },
 
+  /**
+   * Converts some of the common HTML entities in string to their corresponding characters.
+   *
+   * @param str {string} - a string to unescape.
+   * @return string without HTML entity.
+   */
+  _unescapeHtmlEntities: function(str) {
+    if (!str) {
+      return str;
+    }
+
+    var htmlEscapeMap = this.HTML_ESCAPE_MAP;
+    return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
+      return htmlEscapeMap[tag];
+    }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
+      var num = parseInt(hex || numStr, hex ? 16 : 10);
+      return String.fromCharCode(num);
+    });
+  },
+
   /**
    * Attempts to get excerpt and byline metadata for the article.
    *
@@ -1293,21 +1369,123 @@ Readability.prototype = {
     // get site name
     metadata.siteName = values["og:site_name"];
 
+    // in many sites the meta value is escaped with HTML entities,
+    // so here we need to unescape it
+    metadata.title = this._unescapeHtmlEntities(metadata.title);
+    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
+    metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
+    metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
+
     return metadata;
   },
 
+  /**
+   * Check if node is image, or if node contains exactly only one image
+   * whether as a direct child or as its descendants.
+   *
+   * @param Element
+  **/
+  _isSingleImage: function(node) {
+    if (node.tagName === "IMG") {
+      return true;
+    }
+
+    if (node.children.length !== 1 || node.textContent.trim() !== "") {
+      return false;
+    }
+
+    return this._isSingleImage(node.children[0]);
+  },
+
+  /**
+   * Find all <noscript> that are located after <img> nodes, and which contain only one
+   * <img> element. Replace the first image with the image from inside the <noscript> tag,
+   * and remove the <noscript> tag. This improves the quality of the images we use on
+   * some sites (e.g. Medium).
+   *
+   * @param Element
+  **/
+  _unwrapNoscriptImages: function(doc) {
+    // Find img elements without a source or any attribute that might contain an image, and remove them.
+    // This prevents a placeholder img from being replaced by the img from noscript in the next step.
+    var imgs = Array.from(doc.getElementsByTagName("img"));
+    this._forEachNode(imgs, function(img) {
+      for (var i = 0; i < img.attributes.length; i++) {
+        var attr = img.attributes[i];
+        switch (attr.name) {
+          case "src":
+          case "srcset":
+          case "data-src":
+          case "data-srcset":
+            return;
+        }
+
+        if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+          return;
+        }
+      }
+
+      img.parentNode.removeChild(img);
+    });
+
+    // Next find noscript and try to extract its image
+    var noscripts = Array.from(doc.getElementsByTagName("noscript"));
+    this._forEachNode(noscripts, function(noscript) {
+      // Parse content of noscript and make sure it only contains image
+      var tmp = doc.createElement("div");
+      tmp.innerHTML = noscript.innerHTML;
+      if (!this._isSingleImage(tmp)) {
+        return;
+      }
+
+      // If noscript has a previous sibling that only contains an image,
+      // replace it with the noscript content. However, we also keep old
+      // attributes that might contain an image.
+      var prevElement = noscript.previousElementSibling;
+      if (prevElement && this._isSingleImage(prevElement)) {
+        var prevImg = prevElement;
+        if (prevImg.tagName !== "IMG") {
+          prevImg = prevElement.getElementsByTagName("img")[0];
+        }
+
+        var newImg = tmp.getElementsByTagName("img")[0];
+        for (var i = 0; i < prevImg.attributes.length; i++) {
+          var attr = prevImg.attributes[i];
+          if (attr.value === "") {
+            continue;
+          }
+
+          if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+            if (newImg.getAttribute(attr.name) === attr.value) {
+              continue;
+            }
+
+            var attrName = attr.name;
+            if (newImg.hasAttribute(attrName)) {
+              attrName = "data-old-" + attrName;
+            }
+
+            newImg.setAttribute(attrName, attr.value);
+          }
+        }
+
+        noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
+      }
+    });
+  },
+
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
   _removeScripts: function(doc) {
-    this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
+    this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) {
       scriptNode.nodeValue = "";
       scriptNode.removeAttribute("src");
       return true;
     });
-    this._removeNodes(doc.getElementsByTagName("noscript"));
+    this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
   },
 
   /**
@@ -1490,7 +1668,7 @@ Readability.prototype = {
   _clean: function(e, tag) {
     var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
 
-    this._removeNodes(e.getElementsByTagName(tag), function(element) {
+    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
       // Allow youtube and vimeo videos through as people usually want to see those.
       if (isEmbed) {
         // First, check the elements attributes to see if any of them contain youtube or vimeo
@@ -1621,6 +1799,76 @@ Readability.prototype = {
     }
   },
 
+  /* convert images and figures that have properties like data-src into images that can be loaded without JS */
+  _fixLazyImages: function (root) {
+    this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
+      // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
+      // So, here we check if the data uri is too short; if it is, we might as well remove it.
+      if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
+        // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
+        var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
+        if (parts[1] === "image/svg+xml") {
+          return;
+        }
+
+        // Make sure this element has other attributes which contain an image.
+        // If it doesn't, then this src is important and shouldn't be removed.
+        var srcCouldBeRemoved = false;
+        for (var i = 0; i < elem.attributes.length; i++) {
+          var attr = elem.attributes[i];
+          if (attr.name === "src") {
+            continue;
+          }
+
+          if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
+            srcCouldBeRemoved = true;
+            break;
+          }
+        }
+
+        // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
+        // it will be too small, therefore it might be placeholder image.
+        if (srcCouldBeRemoved) {
+          var b64starts = elem.src.search(/base64\s*/i) + 7;
+          var b64length = elem.src.length - b64starts;
+          if (b64length < 133) {
+            elem.removeAttribute("src");
+          }
+        }
+      }
+
+      // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
+      if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
+        return;
+      }
+
+      for (var j = 0; j < elem.attributes.length; j++) {
+        attr = elem.attributes[j];
+        if (attr.name === "src" || attr.name === "srcset") {
+          continue;
+        }
+        var copyTo = null;
+        if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
+          copyTo = "srcset";
+        } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
+          copyTo = "src";
+        }
+        if (copyTo) {
+          //if this is an img or picture, set the attribute directly
+          if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
+            elem.setAttribute(copyTo, attr.value);
+          } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
+            //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
+            //see the nytimes-3 testcase for an example
+            var img = this._doc.createElement("img");
+            img.setAttribute(copyTo, attr.value);
+            elem.appendChild(img);
+          }
+        }
+      }
+    });
+  },
+
   /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
@@ -1638,7 +1886,7 @@ Readability.prototype = {
     // without effecting the traversal.
     //
     // TODO: Consider taking into account original contentScore here.
-    this._removeNodes(e.getElementsByTagName(tag), function(node) {
+    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
       // First check if this node IS data table, in which case don't remove it.
       var isDataTable = function(t) {
         return t._readabilityDataTable;
@@ -1672,10 +1920,7 @@ Readability.prototype = {
         var input = node.getElementsByTagName("input").length;
 
         var embedCount = 0;
-        var embeds = this._concatNodeLists(
-          node.getElementsByTagName("object"),
-          node.getElementsByTagName("embed"),
-          node.getElementsByTagName("iframe"));
+        var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
 
         for (var i = 0; i < embeds.length; i++) {
           // If this embed has attribute that matches video regex, don't delete it.
@@ -1721,7 +1966,7 @@ Readability.prototype = {
     var endOfSearchMarkerNode = this._getNextNode(e, true);
     var next = this._getNextNode(e);
     while (next && next != endOfSearchMarkerNode) {
-      if (filter(next, next.className + " " + next.id)) {
+      if (filter.call(this, next, next.className + " " + next.id)) {
         next = this._removeAndGetNext(next);
       } else {
         next = this._getNextNode(next);
@@ -1736,11 +1981,9 @@ Readability.prototype = {
    * @return void
   **/
   _cleanHeaders: function(e) {
-    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
-      this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
-        return this._getClassWeight(header) < 0;
-      });
-    }
+    this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) {
+      return this._getClassWeight(header) < 0;
+    });
   },
 
   _flagIsActive: function(flag) {
@@ -1752,7 +1995,11 @@ Readability.prototype = {
   },
 
   _isProbablyVisible: function(node) {
-    return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
+    // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
+    return (!node.style || node.style.display != "none")
+      && !node.hasAttribute("hidden")
+      //check for "fallback-image" so that wikimedia math images are displayed
+      && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
   },
 
   /**
@@ -1776,6 +2023,9 @@ Readability.prototype = {
       }
     }
 
+    // Unwrap image from noscript
+    this._unwrapNoscriptImages(this._doc);
+
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
diff --git a/third-party/readability/readability.gresource.xml b/third-party/readability/readability.gresource.xml
new file mode 100644
index 000000000..b3216fd5f
--- /dev/null
+++ b/third-party/readability/readability.gresource.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<gresources>
+  <gresource prefix="/org/gnome/epiphany/readability">
+    <file compressed="true">Readability.js</file>
+    <file compressed="true">reader.css</file>
+  </gresource>
+</gresources>
diff --git a/src/resources/reader.css b/third-party/readability/reader.css
similarity index 100%
rename from src/resources/reader.css
rename to third-party/readability/reader.css
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]