[epiphany] Update readability.js
- From: Michael Catanzaro <mcatanzaro@src.gnome.org>
- To: commits-list@gnome.org
- Cc:
- Subject: [epiphany] Update readability.js
- Date: Sun, 7 Apr 2019 20:11:17 +0000 (UTC)
commit a6d08113d73e5cbb732a1b74b47a8b60c87a8c04
Author: Jan-Michael Brummer <jan.brummer@tabos.org>
Date: Sun Apr 7 21:50:48 2019 +0200
Update readability.js
src/resources/readability.js | 995 ++++++++++++++++---------------------------
1 file changed, 359 insertions(+), 636 deletions(-)
---
diff --git a/src/resources/readability.js b/src/resources/readability.js
index aa15ecc91..6206d6923 100644
--- a/src/resources/readability.js
+++ b/src/resources/readability.js
@@ -22,42 +22,37 @@
/**
* Public constructor.
- * @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-function Readability(uri, doc, options) {
+function Readability(doc, options) {
+ // In some older versions, people passed a URI as the first argument. Cope:
+ if (options && options.documentElement) {
+ doc = options;
+ options = arguments[2];
+ } else if (!doc || !doc.documentElement) {
+ throw new Error("First argument to Readability constructor should be a document object.");
+ }
options = options || {};
- this._uri = uri;
this._doc = doc;
- this._biggestFrame = false;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
+ this._attempts = [];
- // Configureable options
+ // Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+ this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
+ this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
- // The list of pages we've parsed in this call of readability,
- // for autopaging. As a key store for easier searching.
- this._parsedPages = {};
-
- // A list of the ETag headers of pages we've parsed, in case they happen to match,
- // we'll know it's a duplicate.
- this._pageETags = {};
-
- // Make an AJAX request for each page and append it to the document.
- this._curPageNum = 1;
-
var logEl;
// Control whether log messages are sent to the console
@@ -96,6 +91,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
+ // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3,
+
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
@@ -103,25 +102,24 @@ Readability.prototype = {
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
- // The maximum number of pages to loop through before we call
- // it quits and just show a link.
- DEFAULT_MAX_PAGES: 5,
-
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
+ // The default number of chars an article must have in order to return a result
+ DEFAULT_CHAR_THRESHOLD: 500,
+
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
- negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+ negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
- videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+ videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
@@ -132,6 +130,24 @@ Readability.prototype = {
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
+ PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+ DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ PHRASING_ELEMS: [
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
+ "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
+ "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
+ "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
+ "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
+ ],
+
+ // These are the classes that readability sets itself.
+ CLASSES_TO_PRESERVE: [ "page" ],
+
/**
* Run any post-process modifications to article content as necessary.
*
@@ -141,6 +157,9 @@ Readability.prototype = {
_postProcessContent: function(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
+
+ // Remove classes.
+ this._cleanClasses(articleContent);
},
/**
@@ -209,6 +228,21 @@ Readability.prototype = {
return Array.prototype.some.call(nodeList, fn, this);
},
+ /**
+ * Iterate over a NodeList, return true if all of the provided iterate
+ * function calls return true, false otherwise.
+ *
+ * For convenience, the current object context is applied to the
+ * provided iterate function.
+ *
+ * @param NodeList nodeList The NodeList.
+ * @param Function fn The iterate function.
+ * @return Boolean
+ */
+ _everyNode: function(nodeList, fn) {
+ return Array.prototype.every.call(nodeList, fn, this);
+ },
+
/**
* Concat all nodelists passed as arguments.
*
@@ -226,7 +260,7 @@ Readability.prototype = {
_getAllNodesWithTag: function(node, tagNames) {
if (node.querySelectorAll) {
- return node.querySelectorAll(tagNames.join(','));
+ return node.querySelectorAll(tagNames.join(","));
}
return [].concat.apply([], tagNames.map(function(tag) {
var collection = node.getElementsByTagName(tag);
@@ -234,6 +268,34 @@ Readability.prototype = {
}));
},
+ /**
+ * Removes the class="" attribute from every element in the given
+ * subtree, except those that match CLASSES_TO_PRESERVE and
+ * the classesToPreserve array from the options object.
+ *
+ * @param Element
+ * @return void
+ */
+ _cleanClasses: function(node) {
+ var classesToPreserve = this._classesToPreserve;
+ var className = (node.getAttribute("class") || "")
+ .split(/\s+/)
+ .filter(function(cls) {
+ return classesToPreserve.indexOf(cls) != -1;
+ })
+ .join(" ");
+
+ if (className) {
+ node.setAttribute("class", className);
+ } else {
+ node.removeAttribute("class");
+ }
+
+ for (node = node.firstElementChild; node; node = node.nextElementSibling) {
+ this._cleanClasses(node);
+ }
+ },
+
/**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
@@ -242,34 +304,20 @@ Readability.prototype = {
* @return void
*/
_fixRelativeUris: function(articleContent) {
- var scheme = this._uri.scheme;
- var prePath = this._uri.prePath;
- var pathBase = this._uri.pathBase;
-
+ var baseURI = this._doc.baseURI;
+ var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
- // If this is already an absolute URI, return it.
- if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
+ // Leave hash links alone if the base URI matches the document URI:
+ if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
-
- // Scheme-rooted relative URI.
- if (uri.substr(0, 2) == "//")
- return scheme + "://" + uri.substr(2);
-
- // Prepath-rooted relative URI.
- if (uri[0] == "/")
- return prePath + uri;
-
- // Dotslash relative URI.
- if (uri.indexOf("./") === 0)
- return pathBase + uri.slice(2);
-
- // Ignore hash URIs:
- if (uri[0] == "#")
- return uri;
-
- // Standard relative URI; add entire path. pathBase already includes a
- // trailing "/".
- return pathBase + uri;
+ }
+ // Otherwise, resolve against base URI:
+ try {
+ return new URL(uri, baseURI).href;
+ } catch (ex) {
+ // Something went wrong, just return the original:
+ }
+ return uri;
}
var links = articleContent.getElementsByTagName("a");
@@ -307,11 +355,11 @@ Readability.prototype = {
var origTitle = "";
try {
- curTitle = origTitle = doc.title;
+ curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
- curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
+ curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
} catch (e) {/* ignore exceptions setting the title. */}
var titleHadHierarchicalSeparators = false;
@@ -322,33 +370,39 @@ Readability.prototype = {
// If there's a separator in the title, first remove the final part
if ((/ [\|\-\\\/>»] /).test(curTitle)) {
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
- curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
+ curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
// If the resulting title is too short (3 words or fewer), remove
// the first part instead:
if (wordCount(curTitle) < 3)
- curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
- } else if (curTitle.indexOf(': ') !== -1) {
+ curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
+ } else if (curTitle.indexOf(": ") !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._concatNodeLists(
- doc.getElementsByTagName('h1'),
- doc.getElementsByTagName('h2')
+ doc.getElementsByTagName("h1"),
+ doc.getElementsByTagName("h2")
);
+ var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
- return heading.textContent === curTitle;
+ return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
if (!match) {
- curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
+ curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
// If the title is now too short, try the first colon instead:
- if (wordCount(curTitle) < 3)
- curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+ if (wordCount(curTitle) < 3) {
+ curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
+ // But if we have too many words before the colon there's something weird
+ // with the titles and the H tags so let's just use the original title instead
+ } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
+ curTitle = origTitle;
+ }
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
- var hOnes = doc.getElementsByTagName('h1');
+ var hOnes = doc.getElementsByTagName("h1");
if (hOnes.length === 1)
curTitle = this._getInnerText(hOnes[0]);
@@ -396,7 +450,7 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
- && (next.nodeType != Node.ELEMENT_NODE)
+ && (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
@@ -439,16 +493,26 @@ Readability.prototype = {
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
- var nextElem = this._nextElement(next);
+ var nextElem = this._nextElement(next.nextSibling);
if (nextElem && nextElem.tagName == "BR")
break;
}
+ if (!this._isPhrasingContent(next))
+ break;
+
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
+
+ while (p.lastChild && this._isWhitespace(p.lastChild)) {
+ p.removeChild(p.lastChild);
+ }
+
+ if (p.parentNode.tagName === "P")
+ this._setNodeTag(p.parentNode, "DIV");
}
});
},
@@ -497,6 +561,8 @@ Readability.prototype = {
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
+ this._clean(articleContent, "link");
+ this._clean(articleContent, "aside");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
@@ -507,13 +573,19 @@ Readability.prototype = {
// If there is only one h2 and its text content substantially equals article title,
// they are probably using it as a header and not a subheader,
// so remove it since we already extract the title separately.
- var h2 = articleContent.getElementsByTagName('h2');
+ var h2 = articleContent.getElementsByTagName("h2");
if (h2.length === 1) {
var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) /
this._articleTitle.length;
- if (Math.abs(lengthSimilarRate) < 0.5 &&
- (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
- this._articleTitle.includes(h2[0].textContent))) {
- this._clean(articleContent, "h2");
+ if (Math.abs(lengthSimilarRate) < 0.5) {
+ var titlesMatch = false;
+ if (lengthSimilarRate > 0) {
+ titlesMatch = h2[0].textContent.includes(this._articleTitle);
+ } else {
+ titlesMatch = this._articleTitle.includes(h2[0].textContent);
+ }
+ if (titlesMatch) {
+ this._clean(articleContent, "h2");
+ }
}
}
@@ -531,12 +603,12 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
- this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
- var imgCount = paragraph.getElementsByTagName('img').length;
- var embedCount = paragraph.getElementsByTagName('embed').length;
- var objectCount = paragraph.getElementsByTagName('object').length;
+ this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
+ var imgCount = paragraph.getElementsByTagName("img").length;
+ var embedCount = paragraph.getElementsByTagName("embed").length;
+ var objectCount = paragraph.getElementsByTagName("object").length;
// At this point, nasty iframes have been removed, only remain embedded video ones.
- var iframeCount = paragraph.getElementsByTagName('iframe').length;
+ var iframeCount = paragraph.getElementsByTagName("iframe").length;
var totalCount = imgCount + embedCount + objectCount + iframeCount;
return totalCount === 0 && !this._getInnerText(paragraph, false);
@@ -547,6 +619,19 @@ Readability.prototype = {
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
+
+ // Remove single-cell tables
+ this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
+ var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
+ if (this._hasSingleTagInsideElement(tbody, "TR")) {
+ var row = tbody.firstElementChild;
+ if (this._hasSingleTagInsideElement(row, "TD")) {
+ var cell = row.firstElementChild;
+ cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
+ table.parentNode.replaceChild(cell, table);
+ }
+ }
+ });
},
/**
@@ -560,34 +645,34 @@ Readability.prototype = {
node.readability = {"contentScore": 0};
switch (node.tagName) {
- case 'DIV':
+ case "DIV":
node.readability.contentScore += 5;
break;
- case 'PRE':
- case 'TD':
- case 'BLOCKQUOTE':
+ case "PRE":
+ case "TD":
+ case "BLOCKQUOTE":
node.readability.contentScore += 3;
break;
- case 'ADDRESS':
- case 'OL':
- case 'UL':
- case 'DL':
- case 'DD':
- case 'DT':
- case 'LI':
- case 'FORM':
+ case "ADDRESS":
+ case "OL":
+ case "UL":
+ case "DL":
+ case "DD":
+ case "DT":
+ case "LI":
+ case "FORM":
node.readability.contentScore -= 3;
break;
- case 'H1':
- case 'H2':
- case 'H3':
- case 'H4':
- case 'H5':
- case 'H6':
- case 'TH':
+ case "H1":
+ case "H2":
+ case "H3":
+ case "H4":
+ case "H5":
+ case "H6":
+ case "TH":
node.readability.contentScore -= 5;
break;
}
@@ -626,37 +711,6 @@ Readability.prototype = {
return node && node.nextElementSibling;
},
- /**
- * Like _getNextNode, but for DOM implementations with no
- * firstElementChild/nextElementSibling functionality...
- */
- _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
- function nextSiblingEl(n) {
- do {
- n = n.nextSibling;
- } while (n && n.nodeType !== n.ELEMENT_NODE);
- return n;
- }
- // First check for kids if those aren't being ignored
- if (!ignoreSelfAndKids && node.children[0]) {
- return node.children[0];
- }
- // Then for siblings...
- var next = nextSiblingEl(node);
- if (next) {
- return next;
- }
- // And finally, move up the parent chain *and* find a sibling
- // (because this is depth-first traversal, we will have already
- // seen the parent nodes themselves).
- do {
- node = node.parentNode;
- if (node)
- next = nextSiblingEl(node);
- } while (node && !next);
- return node && next;
- },
-
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
@@ -719,6 +773,12 @@ Readability.prototype = {
while (node) {
var matchString = node.className + " " + node.id;
+ if (!this._isProbablyVisible(node)) {
+ this.log("Removing hidden node - " + matchString);
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
@@ -752,28 +812,40 @@ Readability.prototype = {
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
+ // Put phrasing content into paragraphs.
+ var p = null;
+ var childNode = node.firstChild;
+ while (childNode) {
+ var nextSibling = childNode.nextSibling;
+ if (this._isPhrasingContent(childNode)) {
+ if (p !== null) {
+ p.appendChild(childNode);
+ } else if (!this._isWhitespace(childNode)) {
+ p = doc.createElement("p");
+ node.replaceChild(p, childNode);
+ p.appendChild(childNode);
+ }
+ } else if (p !== null) {
+ while (p.lastChild && this._isWhitespace(p.lastChild)) {
+ p.removeChild(p.lastChild);
+ }
+ p = null;
+ }
+ childNode = nextSibling;
+ }
+
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
- if (this._hasSinglePInsideElement(node)) {
+ if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
+ elementsToScore.push(node);
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
- } else {
- // EXPERIMENTAL
- this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
- var p = doc.createElement('p');
- p.textContent = childNode.textContent;
- p.style.display = 'inline';
- p.className = 'readability-styled';
- node.replaceChild(p, childNode);
- }
- });
}
}
node = this._getNextNode(node);
@@ -787,7 +859,7 @@ Readability.prototype = {
**/
var candidates = [];
this._forEachNode(elementsToScore, function(elementToScore) {
- if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
+ if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
return;
// If this paragraph is less than 25 characters, don't even count it.
@@ -806,17 +878,17 @@ Readability.prototype = {
contentScore += 1;
// Add points for any commas within this paragraph.
- contentScore += innerText.split(',').length;
+ contentScore += innerText.split(",").length;
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName)
+ if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
return;
- if (typeof(ancestor.readability) === 'undefined') {
+ if (typeof(ancestor.readability) === "undefined") {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
@@ -847,7 +919,7 @@ Readability.prototype = {
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore;
- this.log('Candidate:', candidate, "with score " + candidateScore);
+ this.log("Candidate:", candidate, "with score " + candidateScore);
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
@@ -966,8 +1038,8 @@ Readability.prototype = {
var sibling = siblings[s];
var append = false;
- this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " +
sibling.readability.contentScore) : '');
- this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
+ this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " +
sibling.readability.contentScore) : "");
+ this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
if (sibling === topCandidate) {
append = true;
@@ -1001,7 +1073,7 @@ Readability.prototype = {
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
- this.log("Altering sibling:", sibling, 'to div.');
+ this.log("Altering sibling:", sibling, "to div.");
sibling = this._setNodeTag(sibling, "DIV");
}
@@ -1023,47 +1095,66 @@ Readability.prototype = {
if (this._debug)
this.log("Article content post-prep: " + articleContent.innerHTML);
- if (this._curPageNum === 1) {
- if (neededToCreateTopCandidate) {
- // We already created a fake div thing, and there wouldn't have been any siblings left
- // for the previous loop, so there's no point trying to create a new div, and then
- // move all the children over. Just assign IDs and class names here. No need to append
- // because that already happened anyway.
- topCandidate.id = "readability-page-1";
- topCandidate.className = "page";
- } else {
- var div = doc.createElement("DIV");
- div.id = "readability-page-1";
- div.className = "page";
- var children = articleContent.childNodes;
- while (children.length) {
- div.appendChild(children[0]);
- }
- articleContent.appendChild(div);
+ if (neededToCreateTopCandidate) {
+ // We already created a fake div thing, and there wouldn't have been any siblings left
+ // for the previous loop, so there's no point trying to create a new div, and then
+ // move all the children over. Just assign IDs and class names here. No need to append
+ // because that already happened anyway.
+ topCandidate.id = "readability-page-1";
+ topCandidate.className = "page";
+ } else {
+ var div = doc.createElement("DIV");
+ div.id = "readability-page-1";
+ div.className = "page";
+ var children = articleContent.childNodes;
+ while (children.length) {
+ div.appendChild(children[0]);
}
+ articleContent.appendChild(div);
}
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);
+ var parseSuccessful = true;
+
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < 500) {
+ var textLength = this._getInnerText(articleContent, true).length;
+ if (textLength < this._charThreshold) {
+ parseSuccessful = false;
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else {
- return null;
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
+ // No luck after removing flags, just return the longest text we found during the different loops
+ this._attempts.sort(function (a, b) {
+ return a.textLength < b.textLength;
+ });
+
+ // But first check if we actually have something
+ if (!this._attempts[0].textLength) {
+ return null;
+ }
+
+ articleContent = this._attempts[0].articleContent;
+ parseSuccessful = true;
}
- } else {
+ }
+
+ if (parseSuccessful) {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
@@ -1090,7 +1181,7 @@ Readability.prototype = {
* @return Boolean - whether the input string is a byline.
*/
_isValidByline: function(byline) {
- if (typeof byline == 'string' || byline instanceof String) {
+ if (typeof byline == "string" || byline instanceof String) {
byline = byline.trim();
return (byline.length > 0) && (byline.length < 100);
}
@@ -1107,62 +1198,70 @@ Readability.prototype = {
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
- // Match "description", or Twitter's "twitter:description" (Cards)
- // in name attribute.
- var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
+ // property is a space-separated list of values
+ var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi;
- // Match Facebook's Open Graph title & description properties.
- var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
+ // name is a single value
+ var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title)\s*$/i;
// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
+ var content = element.getAttribute("content");
+ var matches = null;
+ var name = null;
- if ([elementName, elementProperty].indexOf("author") !== -1) {
- metadata.byline = element.getAttribute("content");
- return;
+ if (elementProperty) {
+ matches = elementProperty.match(propertyPattern);
+ if (matches) {
+ for (var i = matches.length - 1; i >= 0; i--) {
+ // Convert to lowercase, and remove any whitespace
+ // so we can match below.
+ name = matches[i].toLowerCase().replace(/\s/g, "");
+ // multiple authors
+ values[name] = content.trim();
+ }
+ }
}
-
- var name = null;
- if (namePattern.test(elementName)) {
+ if (!matches && elementName && namePattern.test(elementName)) {
name = elementName;
- } else if (propertyPattern.test(elementProperty)) {
- name = elementProperty;
- }
-
- if (name) {
- var content = element.getAttribute("content");
if (content) {
- // Convert to lowercase and remove any whitespace
- // so we can match below.
- name = name.toLowerCase().replace(/\s/g, '');
+ // Convert to lowercase, remove any whitespace, and convert dots
+ // to colons so we can match below.
+ name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
values[name] = content.trim();
}
}
});
- if ("description" in values) {
- metadata.excerpt = values["description"];
- } else if ("og:description" in values) {
- // Use facebook open graph description.
- metadata.excerpt = values["og:description"];
- } else if ("twitter:description" in values) {
- // Use twitter cards description.
- metadata.excerpt = values["twitter:description"];
- }
+ // get title
+ metadata.title = values["dc:title"] ||
+ values["dcterm:title"] ||
+ values["og:title"] ||
+ values["weibo:article:title"] ||
+ values["weibo:webpage:title"] ||
+ values["title"] ||
+ values["twitter:title"];
- metadata.title = this._getArticleTitle();
if (!metadata.title) {
- if ("og:title" in values) {
- // Use facebook open graph title.
- metadata.title = values["og:title"];
- } else if ("twitter:title" in values) {
- // Use twitter cards title.
- metadata.title = values["twitter:title"];
- }
+ metadata.title = this._getArticleTitle();
}
+ // get author
+ metadata.byline = values["dc:creator"] ||
+ values["dcterm:creator"] ||
+ values["author"];
+
+ // get description
+ metadata.excerpt = values["dc:description"] ||
+ values["dcterm:description"] ||
+ values["og:description"] ||
+ values["weibo:article:description"] ||
+ values["weibo:webpage:description"] ||
+ values["description"] ||
+ values["twitter:description"];
+
return metadata;
},
@@ -1172,36 +1271,37 @@ Readability.prototype = {
* @param Element
**/
_removeScripts: function(doc) {
- this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
+ this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
scriptNode.nodeValue = "";
- scriptNode.removeAttribute('src');
+ scriptNode.removeAttribute("src");
return true;
});
- this._removeNodes(doc.getElementsByTagName('noscript'));
+ this._removeNodes(doc.getElementsByTagName("noscript"));
},
/**
- * Check if this node has only whitespace and a single P element
+ * Check if this node has only whitespace and a single element with given tag
* Returns false if the DIV node contains non-empty text nodes
- * or if it contains no P or more than 1 element.
+ * or if it contains no element with given tag or more than 1 element.
*
* @param Element
+ * @param string tag of child element
**/
- _hasSinglePInsideElement: function(element) {
- // There should be exactly 1 element child which is a P:
- if (element.children.length != 1 || element.children[0].tagName !== "P") {
+ _hasSingleTagInsideElement: function(element, tag) {
+ // There should be exactly 1 element child with given tag
+ if (element.children.length != 1 || element.children[0].tagName !== tag) {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
- return node.nodeType === Node.TEXT_NODE &&
+ return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
- return node.nodeType === Node.ELEMENT_NODE &&
+ return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
@@ -1219,6 +1319,21 @@ Readability.prototype = {
});
},
+ /***
+ * Determine if a node qualifies as phrasing content.
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+ **/
+ _isPhrasingContent: function(node) {
+ return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
+ ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
+ this._everyNode(node.childNodes, this._isPhrasingContent));
+ },
+
+ _isWhitespace: function(node) {
+ return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
+ (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
+ },
+
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
@@ -1228,7 +1343,7 @@ Readability.prototype = {
* @return string
**/
_getInnerText: function(e, normalizeSpaces) {
- normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
+ normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
var textContent = e.textContent.trim();
if (normalizeSpaces) {
@@ -1257,26 +1372,23 @@ Readability.prototype = {
* @return void
**/
_cleanStyles: function(e) {
- e = e || this._doc;
- if (!e)
+ if (!e || e.tagName.toLowerCase() === "svg")
return;
- var cur = e.firstChild;
-
- // Remove any root styles, if we're able.
- if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
- e.removeAttribute('style');
- // Go until there are no more child nodes
- while (cur !== null) {
- if (cur.nodeType === cur.ELEMENT_NODE) {
- // Remove style attribute(s) :
- if (cur.className !== "readability-styled")
- cur.removeAttribute("style");
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- this._cleanStyles(cur);
- }
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute("width");
+ e.removeAttribute("height");
+ }
- cur = cur.nextSibling;
+ var cur = e.firstElementChild;
+ while (cur !== null) {
+ this._cleanStyles(cur);
+ cur = cur.nextElementSibling;
}
},
@@ -1302,363 +1414,6 @@ Readability.prototype = {
return linkLength / textLength;
},
- /**
- * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
- *
- * @author Dan Lacy
- * @return string the base url
- **/
- _findBaseUrl: function() {
- var uri = this._uri;
- var noUrlParams = uri.path.split("?")[0];
- var urlSlashes = noUrlParams.split("/").reverse();
- var cleanedSegments = [];
- var possibleType = "";
-
- for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
- var segment = urlSlashes[i];
-
- // Split off and save anything that looks like a file type.
- if (segment.indexOf(".") !== -1) {
- possibleType = segment.split(".")[1];
-
- // If the type isn't alpha-only, it's probably not actually a file extension.
- if (!possibleType.match(/[^a-zA-Z]/))
- segment = segment.split(".")[0];
- }
-
- // If our first or second segment has anything looking like a page number, remove it.
- if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
- segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
- var del = false;
-
- // If this is purely a number, and it's the first or second segment,
- // it's probably a page number. Remove it.
- if (i < 2 && segment.match(/^\d{1,2}$/))
- del = true;
-
- // If this is the first segment and it's just "index", remove it.
- if (i === 0 && segment.toLowerCase() === "index")
- del = true;
-
- // If our first or second segment is smaller than 3 characters,
- // and the first segment was purely alphas, remove it.
- if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
- del = true;
-
- // If it's not marked for deletion, push it to cleanedSegments.
- if (!del)
- cleanedSegments.push(segment);
- }
-
- // This is our final, cleaned, base article URL.
- return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
- },
-
- /**
- * Look for any paging links that may occur within the document.
- *
- * @param body
- * @return object (array)
- **/
- _findNextPageLink: function(elem) {
- var uri = this._uri;
- var possiblePages = {};
- var allLinks = elem.getElementsByTagName('a');
- var articleBaseUrl = this._findBaseUrl();
-
- // Loop through all links, looking for hints that they may be next-page links.
- // Things like having "page" in their textContent, className or id, or being a child
- // of a node with a page-y className or id.
- //
- // Also possible: levenshtein distance? longest common subsequence?
- //
- // After we do that, assign each page a score, and
- for (var i = 0, il = allLinks.length; i < il; i += 1) {
- var link = allLinks[i];
- var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
- // If we've already seen this page, ignore it.
- if (linkHref === "" ||
- linkHref === articleBaseUrl ||
- linkHref === uri.spec ||
- linkHref in this._parsedPages) {
- continue;
- }
-
- // If it's on a different domain, skip it.
- if (uri.host !== linkHref.split(/\/+/g)[1])
- continue;
-
- var linkText = this._getInnerText(link);
-
- // If the linkText looks like it's not the next page, skip it.
- if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
- continue;
-
- // If the leftovers of the URL after removing the base URL don't contain
- // any digits, it's certainly not a next page link.
- var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
- if (!linkHrefLeftover.match(/\d/))
- continue;
-
- if (!(linkHref in possiblePages)) {
- possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
- } else {
- possiblePages[linkHref].linkText += ' | ' + linkText;
- }
-
- var linkObj = possiblePages[linkHref];
-
- // If the articleBaseUrl isn't part of this URL, penalize this link. It could
- // still be the link, but the odds are lower.
- // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
- if (linkHref.indexOf(articleBaseUrl) !== 0)
- linkObj.score -= 25;
-
- var linkData = linkText + ' ' + link.className + ' ' + link.id;
- if (linkData.match(this.REGEXPS.nextLink))
- linkObj.score += 50;
-
- if (linkData.match(/pag(e|ing|inat)/i))
- linkObj.score += 25;
-
- if (linkData.match(/(first|last)/i)) {
- // -65 is enough to negate any bonuses gotten from a > or » in the text,
- // If we already matched on "next", last is probably fine.
- // If we didn't, then it's bad. Penalize.
- if (!linkObj.linkText.match(this.REGEXPS.nextLink))
- linkObj.score -= 65;
- }
-
- if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
- linkObj.score -= 50;
-
- if (linkData.match(this.REGEXPS.prevLink))
- linkObj.score -= 200;
-
- // If a parentNode contains page or paging or paginat
- var parentNode = link.parentNode;
- var positiveNodeMatch = false;
- var negativeNodeMatch = false;
-
- while (parentNode) {
- var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
- if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
- positiveNodeMatch = true;
- linkObj.score += 25;
- }
-
- if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
- // If this is just something like "footer", give it a negative.
- // If it's something like "body-and-footer", leave it be.
- if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
- linkObj.score -= 25;
- negativeNodeMatch = true;
- }
- }
-
- parentNode = parentNode.parentNode;
- }
-
- // If the URL looks like it has paging in it, add to the score.
- // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
- if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
- linkObj.score += 25;
-
- // If the URL contains negative values, give a slight decrease.
- if (linkHref.match(this.REGEXPS.extraneous))
- linkObj.score -= 15;
-
- /**
- * Minor punishment to anything that doesn't match our current URL.
- * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
- * Dan, can you show me a counterexample where this is necessary?
- * if (linkHref.indexOf(window.location.href) !== 0) {
- * linkObj.score -= 1;
- * }
- **/
-
- // If the link text can be parsed as a number, give it a minor bonus, with a slight
- // bias towards lower numbered pages. This is so that pages that might not have 'next'
- // in their text can still get scored, and sorted properly by score.
- var linkTextAsNumber = parseInt(linkText, 10);
- if (linkTextAsNumber) {
- // Punish 1 since we're either already there, or it's probably
- // before what we want anyways.
- if (linkTextAsNumber === 1) {
- linkObj.score -= 10;
- } else {
- linkObj.score += Math.max(0, 10 - linkTextAsNumber);
- }
- }
- }
-
- // Loop thrugh all of our possible pages from above and find our top
- // candidate for the next page URL. Require at least a score of 50, which
- // is a relatively high confidence that this page is the next link.
- var topPage = null;
- for (var page in possiblePages) {
- if (possiblePages.hasOwnProperty(page)) {
- if (possiblePages[page].score >= 50 &&
- (!topPage || topPage.score < possiblePages[page].score))
- topPage = possiblePages[page];
- }
- }
-
- var nextHref = null;
- if (topPage) {
- nextHref = topPage.href.replace(/\/$/, '');
-
- this.log('NEXT PAGE IS ' + nextHref);
- this._parsedPages[nextHref] = true;
- }
- return nextHref;
- },
-
- _successfulRequest: function(request) {
- return (request.status >= 200 && request.status < 300) ||
- request.status === 304 ||
- (request.status === 0 && request.responseText);
- },
-
- _ajax: function(url, options) {
- var request = new XMLHttpRequest();
-
- function respondToReadyState(readyState) {
- if (request.readyState === 4) {
- if (this._successfulRequest(request)) {
- if (options.success)
- options.success(request);
- } else if (options.error) {
- options.error(request);
- }
- }
- }
-
- if (typeof options === 'undefined')
- options = {};
-
- request.onreadystatechange = respondToReadyState;
-
- request.open('get', url, true);
- request.setRequestHeader('Accept', 'text/html');
-
- try {
- request.send(options.postBody);
- } catch (e) {
- if (options.error)
- options.error();
- }
-
- return request;
- },
-
- _appendNextPage: function(nextPageLink) {
- var doc = this._doc;
- this._curPageNum += 1;
-
- var articlePage = doc.createElement("DIV");
- articlePage.id = 'readability-page-' + this._curPageNum;
- articlePage.className = 'page';
- articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">§</p>';
-
- doc.getElementById("readability-content").appendChild(articlePage);
-
- if (this._curPageNum > this._maxPages) {
- var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
- articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
- return;
- }
-
- // Now that we've built the article page DOM element, get the page content
- // asynchronously and load the cleaned content into the div we created for it.
- (function(pageUrl, thisPage) {
- this._ajax(pageUrl, {
- success: function(r) {
-
- // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
- var eTag = r.getResponseHeader('ETag');
- if (eTag) {
- if (eTag in this._pageETags) {
- this.log("Exact duplicate page found via ETag. Aborting.");
- articlePage.style.display = 'none';
- return;
- }
- this._pageETags[eTag] = 1;
- }
-
- // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
- var page = doc.createElement("DIV");
-
- // Do some preprocessing to our HTML to make it ready for appending.
- // - Remove any script tags. Swap and reswap newlines with a unicode
- // character because multiline regex doesn't work in javascript.
- // - Turn any noscript tags into divs so that we can parse them. This
- // allows us to find any next page links hidden via javascript.
- // - Turn all double br's into p's - was handled by prepDocument in the original view.
- // Maybe in the future abstract out prepDocument to work for both the original document
- // and AJAX-added pages.
- var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
- responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
- page.innerHTML = responseHtml;
- this._replaceBrs(page);
-
- // Reset all flags for the next page, as they will search through it and
- // disable as necessary at the end of grabArticle.
- this._flags = 0x1 | 0x2 | 0x4;
-
- var secondNextPageLink = this._findNextPageLink(page);
-
- // NOTE: if we end up supporting _appendNextPage(), we'll need to
- // change this call to be async
- var content = this._grabArticle(page);
-
- if (!content) {
- this.log("No content found in page to append. Aborting.");
- return;
- }
-
- // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
- // Compare it against all of the the previous document's we've gotten. If the previous
- // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
- var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
- if (firstP && firstP.innerHTML.length > 100) {
- for (var i = 1; i <= this._curPageNum; i += 1) {
- var rPage = doc.getElementById('readability-page-' + i);
- if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
- this.log('Duplicate of page ' + i + ' - skipping.');
- articlePage.style.display = 'none';
- this._parsedPages[pageUrl] = true;
- return;
- }
- }
- }
-
- this._removeScripts(content);
-
- thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
- // After the page has rendered, post process the content. This delay is necessary because,
- // in webkit at least, offsetWidth is not set in time to determine image width. We have to
- // wait a little bit for reflow to finish before we can fix floating images.
- setTimeout((function() {
- this._postProcessContent(thisPage);
- }).bind(this), 500);
-
-
- if (secondNextPageLink)
- this._appendNextPage(secondNextPageLink);
- }
- });
- }).bind(this)(nextPageLink, articlePage);
- },
-
/**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
@@ -1673,7 +1428,7 @@ Readability.prototype = {
var weight = 0;
// Look for a special classname
- if (typeof(e.className) === 'string' && e.className !== '') {
+ if (typeof(e.className) === "string" && e.className !== "") {
if (this.REGEXPS.negative.test(e.className))
weight -= 25;
@@ -1682,7 +1437,7 @@ Readability.prototype = {
}
// Look for a special ID
- if (typeof(e.id) === 'string' && e.id !== '') {
+ if (typeof(e.id) === "string" && e.id !== "") {
if (this.REGEXPS.negative.test(e.id))
weight -= 25;
@@ -1871,7 +1626,7 @@ Readability.prototype = {
return true;
}
- if (this._getCharCount(node, ',') < 10) {
+ if (this._getCharCount(node, ",") < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
@@ -1931,7 +1686,7 @@ Readability.prototype = {
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
- this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+ this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
return this._getClassWeight(header) < 0;
});
}
@@ -1941,14 +1696,14 @@ Readability.prototype = {
return (this._flags & flag) > 0;
},
- _addFlag: function(flag) {
- this._flags = this._flags | flag;
- },
-
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
+ _isProbablyVisible: function(node) {
+ return node.style.display != "none" && !node.hasAttribute("hidden");
+ },
+
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
@@ -1973,9 +1728,9 @@ Readability.prototype = {
nodes = [].concat.apply(Array.from(set), nodes);
}
- // FIXME we should have a fallback for helperIsVisible, but this is
- // problematic because of jsdom's elem.style handling - see
- // https://github.com/mozilla/readability/pull/186 for context.
+ if (!helperIsVisible) {
+ helperIsVisible = this._isProbablyVisible;
+ }
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
@@ -2029,27 +1784,9 @@ Readability.prototype = {
}
}
- if (!this._doc) {
- return null;
- }
-
- if (typeof this._doc.documentElement.firstElementChild === "undefined") {
- this._getNextNode = this._getNextNodeNoElementProperties;
- }
-
// Remove script tags from the document.
this._removeScripts(this._doc);
- // FIXME: Disabled multi-page article support for now as it
- // needs more work on infrastructure.
-
- // Make sure this document is added to the list of parsed pages first,
- // so we don't double up on the first page.
- // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
- // Pull out any possible next page link first.
- // var nextPageLink = this._findNextPageLink(doc.body);
-
this._prepDocument();
var metadata = this._getArticleMetadata();
@@ -2063,14 +1800,6 @@ Readability.prototype = {
this._postProcessContent(articleContent);
- // if (nextPageLink) {
- // // Append any additional pages after a small timeout so that people
- // // can start reading without having to wait for this to finish processing.
- // setTimeout((function() {
- // this._appendNextPage(nextPageLink);
- // }).bind(this), 500);
- // }
-
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.
@@ -2083,7 +1812,6 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
- uri: this._uri,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
@@ -2099,7 +1827,6 @@ if (typeof module === "object") {
module.exports = Readability;
}
-
var loc = document.location;
var uri = {
spec: loc.href,
@@ -2110,14 +1837,10 @@ var uri = {
};
if (typeof document !== 'undefined') {
- reader = new Readability(uri, document);
- if (reader.isProbablyReaderable(false))
- // Youtube fails if we do a document.cloneNode(), so only do this if isProbablyReaderable()
- var documentClone = document.cloneNode(true);
- reader = new Readability(uri, documentClone);
- var previous_title = document.title;
- article = reader.parse();
- document.title=previous_title;
- //article.content
- article
+ var documentClone = document.cloneNode(true);
+ reader = new Readability(uri, documentClone);
+ article = reader.parse();
+ var previous_title = document.title;
+ document.title=previous_title;
+ article
}
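
For readers following the constructor change above, here is a minimal usage sketch of the new calling convention, assuming a browser context. Only Readability, parse(), and the charThreshold option are taken from the diff; the default value and the result fields shown in the trailing comment are illustrative:

    // Hypothetical usage after this change: the document comes first and the URI
    // argument is gone. Old callers that still pass a URI keep working thanks to
    // the compatibility shim at the top of the constructor.
    var documentClone = document.cloneNode(true);
    var reader = new Readability(documentClone, { charThreshold: 500 });
    var article = reader.parse();
    // article.title, article.byline and article.content then hold the extracted result.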