From d2fc686740d88ae601c67628f70a79474b197d3a Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Mon, 11 Mar 2024 09:39:17 -0400 Subject: [PATCH 01/16] [unlikelyCandidatesFix] testing Unlikely, and likely candidates against the nodeName instead of the className and id --- Readability.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Readability.js b/Readability.js index 0bbe02e1..3e2e6ef0 100644 --- a/Readability.js +++ b/Readability.js @@ -934,8 +934,8 @@ Readability.prototype = { // Remove unlikely candidates if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + if (this.REGEXPS.unlikelyCandidates.test(node.tagName) && + !this.REGEXPS.okMaybeItsACandidate.test(node.tagName) && !this._hasAncestorTag(node, "table") && !this._hasAncestorTag(node, "code") && node.tagName !== "BODY" && From 1cf0b50d8b07b38f187d2204e9cc5d3ee16b577a Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Mon, 11 Mar 2024 15:25:47 -0400 Subject: [PATCH 02/16] [Logs] added more logs around some of the other events that happen --- Readability.js | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Readability.js b/Readability.js index 3e2e6ef0..ddeac870 100644 --- a/Readability.js +++ b/Readability.js @@ -915,12 +915,14 @@ Readability.prototype = { // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { + this.log('Removing Modals and Dialogs - ' + matchString) node = this._removeAndGetNext(node); continue; } // Check to see if this node is a byline, and remove it if it is. if (this._checkByline(node, matchString)) { + this.log('Removing byline - ' + matchString); node = this._removeAndGetNext(node); continue; } @@ -957,6 +959,7 @@ Readability.prototype = { node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && this._isElementWithoutContent(node)) { + this.log("Removing empty node - " + matchString); node = this._removeAndGetNext(node); continue; } @@ -1014,18 +1017,24 @@ Readability.prototype = { **/ var candidates = []; this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") { + this.log("NOT SCORING: Element has no parent node - ", matchString); return; + } // If this paragraph is less than 25 characters, don't even count it. var innerText = this._getInnerText(elementToScore); - if (innerText.length < 25) + if (innerText.length < 25) { + this.log("NOT SCORING: Paragraph too short - ", matchString); return; + } // Exclude nodes with no ancestor. var ancestors = this._getNodeAncestors(elementToScore, 5); - if (ancestors.length === 0) + if (ancestors.length === 0) { + this.log("NOT SCORING: No ancestors - ", matchString); return; + } var contentScore = 0; From b2b5e6adc0e2217825fd809961e4b82349151f0b Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Mon, 11 Mar 2024 15:51:28 -0400 Subject: [PATCH 03/16] [toLowerCase] lower case the tagName check --- Readability.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Readability.js b/Readability.js index ddeac870..ca8f9d44 100644 --- a/Readability.js +++ b/Readability.js @@ -936,8 +936,8 @@ Readability.prototype = { // Remove unlikely candidates if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(node.tagName) && - !this.REGEXPS.okMaybeItsACandidate.test(node.tagName) && + if (this.REGEXPS.unlikelyCandidates.test(node.tagName.toLowerCase()) && + !this.REGEXPS.okMaybeItsACandidate.test(node.tagName.toLowerCase()) && !this._hasAncestorTag(node, "table") && !this._hasAncestorTag(node, "code") && node.tagName !== "BODY" && From 23eef7e72d74b7d297a8637f95453b0597772c1d Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Tue, 12 Mar 2024 08:52:40 -0400 Subject: [PATCH 04/16] Adding back nodeName check instead of class and id check --- Readability.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Readability.js b/Readability.js index ca8f9d44..ff4f5185 100644 --- a/Readability.js +++ b/Readability.js @@ -936,8 +936,8 @@ Readability.prototype = { // Remove unlikely candidates if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(node.tagName.toLowerCase()) && - !this.REGEXPS.okMaybeItsACandidate.test(node.tagName.toLowerCase()) && + if (this.REGEXPS.unlikelyCandidates.test(node.nodeName.toLowerCase()) && + !this.REGEXPS.okMaybeItsACandidate.test(node.nodeName.toLowerCase()) && !this._hasAncestorTag(node, "table") && !this._hasAncestorTag(node, "code") && node.tagName !== "BODY" && @@ -1098,6 +1098,7 @@ Readability.prototype = { } var topCandidate = topCandidates[0] || null; + this.log('Top candidate:', topCandidate); var neededToCreateTopCandidate = false; var parentOfTopCandidate; From 73d938df1a749de8815fd9ced1d62bed7afd8963 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Tue, 12 Mar 2024 09:17:49 -0400 Subject: [PATCH 05/16] [PR completeness] Fixed linting issues, and a test --- Readability.js | 6 +++--- test/test-pages/daringfireball-1/expected.html | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Readability.js b/Readability.js index ff4f5185..53ecc2d0 100644 --- a/Readability.js +++ b/Readability.js @@ -915,14 +915,14 @@ Readability.prototype = { // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") { - this.log('Removing Modals and Dialogs - ' + matchString) + this.log("Removing Modals and Dialogs - " + matchString); node = this._removeAndGetNext(node); continue; } // Check to see if this node is a byline, and remove it if it is. if (this._checkByline(node, matchString)) { - this.log('Removing byline - ' + matchString); + this.log("Removing byline - " + matchString); node = this._removeAndGetNext(node); continue; } @@ -1098,7 +1098,7 @@ Readability.prototype = { } var topCandidate = topCandidates[0] || null; - this.log('Top candidate:', topCandidate); + this.log("Top candidate: ", topCandidate); var neededToCreateTopCandidate = false; var parentOfTopCandidate; diff --git a/test/test-pages/daringfireball-1/expected.html b/test/test-pages/daringfireball-1/expected.html index af4fafae..973278c7 100644 --- a/test/test-pages/daringfireball-1/expected.html +++ b/test/test-pages/daringfireball-1/expected.html @@ -1,5 +1,5 @@
-
+

About This Site

Daring Fireball is written and produced by John Gruber.

From 11df4642e5849d2933f1ae83bb51e4a6d1a492d8 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 21 Mar 2024 16:03:21 -0400 Subject: [PATCH 06/16] [price commas fix] Adding a fix to not count commas in the prices of things --- Readability.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index 53ecc2d0..15fc5da4 100644 --- a/Readability.js +++ b/Readability.js @@ -143,7 +143,7 @@ Readability.prototype = { b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, // Commas as used in Latin, Sindhi, Chinese and various other scripts. // see: https://en.wikipedia.org/wiki/Comma#Comma_variants - commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g, + commas: /[\s\D][\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C][\s\D]/g, // See: https://schema.org/Article jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ }, From 2b7125f5b7b3331a4f478117aaafe7e7fdbf914a Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 21 Mar 2024 16:10:39 -0400 Subject: [PATCH 07/16] [naming] changing the package name as not to get conflicts --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 75a143b6..68d46d0e 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "@mozilla/readability", + "name": "@panda01/readability", "version": "0.5.0", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", From 3329331a80f3fe08f34c92d863c61e682bccca86 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 21 Mar 2024 16:17:10 -0400 Subject: [PATCH 08/16] 0.5.1 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index d449c639..878c1ece 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mozilla/readability", - "version": "0.5.0", + "version": "0.5.1", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@mozilla/readability", - "version": "0.5.0", + "version": "0.5.1", "license": "Apache-2.0", "devDependencies": { "@release-it/keep-a-changelog": "5.0.0", diff --git a/package.json b/package.json index 68d46d0e..32a8dc9d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@panda01/readability", - "version": "0.5.0", + "version": "0.5.1", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", "types": "index.d.ts", From b67b74df2cf47008a2329426d2f76814da17a635 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 21 Mar 2024 16:17:22 -0400 Subject: [PATCH 09/16] 0.5.2 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 878c1ece..f79b4553 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mozilla/readability", - "version": "0.5.1", + "version": "0.5.2", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@mozilla/readability", - "version": "0.5.1", + "version": "0.5.2", "license": "Apache-2.0", "devDependencies": { "@release-it/keep-a-changelog": "5.0.0", diff --git a/package.json b/package.json index 32a8dc9d..a0532ffc 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@panda01/readability", - "version": "0.5.1", + "version": "0.5.2", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", "types": "index.d.ts", From 88cc372894520f4cdc5481b2c6fb760224cff4fd Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Wed, 27 Mar 2024 22:02:02 -0400 Subject: [PATCH 10/16] [readability positivelist] adding cda-round-up to positive class names in order to fix certain pages like https://www.consumerreports.org/appliances/air-purifiers/best-air-purifiers-of-the-year-a1197763201/ --- Readability.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index 15fc5da4..ec0ea8ef 100644 --- a/Readability.js +++ b/Readability.js @@ -125,7 +125,7 @@ Readability.prototype = { unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, - positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + positive: /article|body|cda-round-up|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, From 09af4f95a012e501b88b5cbdffeb749a44f5079e Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Wed, 27 Mar 2024 22:02:09 -0400 Subject: [PATCH 11/16] 0.5.3 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index f79b4553..7f0e5012 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mozilla/readability", - "version": "0.5.2", + "version": "0.5.3", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@mozilla/readability", - "version": "0.5.2", + "version": "0.5.3", "license": "Apache-2.0", "devDependencies": { "@release-it/keep-a-changelog": "5.0.0", diff --git a/package.json b/package.json index a0532ffc..967bb9ae 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@panda01/readability", - "version": "0.5.2", + "version": "0.5.3", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", "types": "index.d.ts", From 1fd2f03ee0781bceaeafd8f1d8d5ba35c9c57342 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 28 Mar 2024 08:15:06 -0400 Subject: [PATCH 12/16] [logs] adding logging around Cleanconditionally Remove Nodes --- Readability.js | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/Readability.js b/Readability.js index ec0ea8ef..fbf1632a 100644 --- a/Readability.js +++ b/Readability.js @@ -2151,14 +2151,23 @@ Readability.prototype = { var linkDensity = this._getLinkDensity(node); var contentLength = this._getInnerText(node).length; + var linkDensity = this._getLinkDensity(node); + var contentLength = this._getInnerText(node).length; + var lessParagraphsThanImages = (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")); + var isNotListAndMoreListItemsThanParagraphs = (!isList && li > p); + var moreInputsThanPs = (input > Math.floor(p/3)); + var headingDensityAndContentLengthOff = (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")); + var weightAndLinkDensityIsLow = (!isList && weight < 25 && linkDensity > 0.2); + var weightAndLinkDensityTooHigh = (weight >= 25 && linkDensity > 0.5); + var embedCountAndContentLengthOff = ((embedCount === 1 && contentLength < 75) || embedCount > 1); var haveToRemove = - (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || - (!isList && li > p) || - (input > Math.floor(p/3)) || - (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || - (!isList && weight < 25 && linkDensity > 0.2) || - (weight >= 25 && linkDensity > 0.5) || - ((embedCount === 1 && contentLength < 75) || embedCount > 1); + lessParagraphsThanImages || + isNotListAndMoreListItemsThanParagraphs || + moreInputsThanPs || + headingDensityAndContentLengthOff || + weightAndLinkDensityIsLow || + weightAndLinkDensityTooHigh || + embedCountAndContentLengthOff; // Allow simple lists of images to remain in pages if (isList && haveToRemove) { for (var x = 0; x < node.children.length; x++) { @@ -2174,6 +2183,30 @@ Readability.prototype = { return false; } } + if (haveToRemove) { + this.log("haveToRemove will remove node", node, 'with value: ', haveToRemove); + if (lessParagraphsThanImages) { + this.log("lessParagraphsThanImages", node); + } + if (isNotListAndMoreListItemsThanParagraphs) { + this.log("isNotListAndMoreListItemsThanParagraphs", node); + } + if (moreInputsThanPs) { + this.log("moreInputsThanPs", node); + } + if (headingDensityAndContentLengthOff) { + this.log("headingDensityAndContentLengthOff", node); + } + if (weightAndLinkDensityIsLow) { + this.log("weightAndLinkDensityIsLow", node); + } + if (weightAndLinkDensityTooHigh) { + this.log("weightAndLinkDensityTooHigh", node); + } + if (embedCountAndContentLengthOff) { + this.log("embedCountAndContentLengthOff", node); + } + } return haveToRemove; } return false; From c6fbabd5108d293d5838cfff421ffd4598956292 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 28 Mar 2024 08:17:30 -0400 Subject: [PATCH 13/16] [cleanConditionally LinkDensity] Increading the linkDensity trigger to 25% for paragraphs to make pages like https://www.consumerreports.org/cars/car-reliability-owner-satisfaction/10-most-reliable-cars-a6569295379/ and https://www.consumerreports.org/cars/car-reliability-owner-satisfaction/10-most-satisfying-cars-owner-satisfaction-a2239167129/ --- Readability.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index fbf1632a..02149413 100644 --- a/Readability.js +++ b/Readability.js @@ -2157,7 +2157,7 @@ Readability.prototype = { var isNotListAndMoreListItemsThanParagraphs = (!isList && li > p); var moreInputsThanPs = (input > Math.floor(p/3)); var headingDensityAndContentLengthOff = (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")); - var weightAndLinkDensityIsLow = (!isList && weight < 25 && linkDensity > 0.2); + var weightAndLinkDensityIsLow = (!isList && weight < 25 && linkDensity > 0.25); var weightAndLinkDensityTooHigh = (weight >= 25 && linkDensity > 0.5); var embedCountAndContentLengthOff = ((embedCount === 1 && contentLength < 75) || embedCount > 1); var haveToRemove = From c16d0bc78f012e37c015b9188a0d09ddf030229b Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 28 Mar 2024 08:17:37 -0400 Subject: [PATCH 14/16] 0.5.4 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 7f0e5012..bedc3520 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mozilla/readability", - "version": "0.5.3", + "version": "0.5.4", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@mozilla/readability", - "version": "0.5.3", + "version": "0.5.4", "license": "Apache-2.0", "devDependencies": { "@release-it/keep-a-changelog": "5.0.0", diff --git a/package.json b/package.json index 967bb9ae..6fdd4ce0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@panda01/readability", - "version": "0.5.3", + "version": "0.5.4", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", "types": "index.d.ts", From a2e312158b0a18703e12a24972b3a012168ea482 Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 28 Mar 2024 09:26:49 -0400 Subject: [PATCH 15/16] [snafu] adding back product to the list of negative classnames --- Readability.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index 02149413..75f40ea6 100644 --- a/Readability.js +++ b/Readability.js @@ -126,7 +126,7 @@ Readability.prototype = { okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, positive: /article|body|cda-round-up|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|product|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, From 64d9a1f1ee233fe7728679a7833f01d706df72ec Mon Sep 17 00:00:00 2001 From: Khalah Jones-Golden Date: Thu, 28 Mar 2024 09:26:54 -0400 Subject: [PATCH 16/16] 0.5.5 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index bedc3520..81d086c6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@mozilla/readability", - "version": "0.5.4", + "version": "0.5.5", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "@mozilla/readability", - "version": "0.5.4", + "version": "0.5.5", "license": "Apache-2.0", "devDependencies": { "@release-it/keep-a-changelog": "5.0.0", diff --git a/package.json b/package.json index 6fdd4ce0..01a3c84a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@panda01/readability", - "version": "0.5.4", + "version": "0.5.5", "description": "A standalone version of the readability library used for Firefox Reader View.", "main": "index.js", "types": "index.d.ts",