[955]readability网页内容提取器

2023-11-23 19:03:02

文章目录

- 相关阅读1
- 相关阅读2
- 相关阅读3

相关阅读3

从网页中提取出主要内容，一直是一个比较有挑战的算法。Readability是其中一个很不错的实现，它通过遍历Dom对象，通过标签和常用文字的加减权，来重新整合出页面的内容。

JS版本的Readability是最好用的，它可以直接在浏览器完成分析，于是用户还可以人工对分析出来的内容进行修改和校正。

GET社区的Chrome插件就使用了这个算法，在你遇到读起来不爽的网页的时候，点一下，世界就清爽了。

比如Breach浏览器的文档页面，看起来很酷，但是阅读久了会让人泪流不止。

但当你点过插件后，这个页面会变成这个样子：

是不是觉得世界更美好了。

那么，接下来我们就简单看看这个算法是如何实现的。

首先，它定义了一系列正则：

regexps: {unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,replaceBrs:            /(]*>[ \n\r\t]*){2,}/gi,replaceFonts:          /<(\/?)font[^>]*>/gi,trim:                  /^\s+|\s+$/g,normalize:             /\s{2,}/g,killBreaks:            /((\s| ?)*){1,}/g,videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.prevLink:              /(prev|earl|old|new|<|«)/i},

可以看到，标签和文字都有加权或降权分组。整个内容分析是通过grabArticle函数来实现的。

首先开始遍历节点

for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1)

然后将不像内容的元素去掉

if (stripUnlikelyCandidates) 
{var unlikelyMatchString = node.className + node.id;if ((unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&node.tagName !== "BODY")){dbg("Removing unlikely candidate - " + unlikelyMatchString);node.parentNode.removeChild(node);nodeIndex-=1;continue;}               
}

将DIV替换为P标签后，再对目标节点进行遍历，进行计分:

var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {var parentNode      = nodesToScore[pt].parentNode;var grandParentNode = parentNode ? parentNode.parentNode : null;var innerText       = readability.getInnerText(nodesToScore[pt]);if(!parentNode || typeof(parentNode.tagName) === 'undefined') {continue;}/* If this paragraph is less than 25 characters, don't even count it. */if(innerText.length < 25) {continue; }/* Initialize readability data for the parent. */if(typeof parentNode.readability === 'undefined') {readability.initializeNode(parentNode);candidates.push(parentNode);}/* Initialize readability data for the grandparent. */if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {readability.initializeNode(grandParentNode);candidates.push(grandParentNode);}var contentScore = 0;/* Add a point for the paragraph itself as a base. */contentScore+=1;/* Add points for any commas within this paragraph */contentScore += innerText.split(',').length;/* For every 100 characters in this paragraph, add another point. Up to 3 points. */contentScore += Math.min(Math.floor(innerText.length / 100), 3);/* Add the score to the parent. The grandparent gets half. */parentNode.readability.contentScore += contentScore;if(grandParentNode) {grandParentNode.readability.contentScore += contentScore/2;             }
}

最后根据分值，重新拼接内容

var articleContent        = document.createElement("DIV");
if (isPaging) {articleContent.id     = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes          = topCandidate.parentNode.childNodes;for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {var siblingNode = siblingNodes[s];var append      = false;/*** Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.* Example of error visible here: http://www.esquire.com/features/honesty0707**/if(!siblingNode) {continue;}dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));if(siblingNode === topCandidate){append = true;}var contentBonus = 0;/* Give a bonus if sibling nodes and top candidates have the example same classname */if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {contentBonus += topCandidate.readability.contentScore * 0.2;}if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold){append = true;}if(siblingNode.nodeName === "P") {var linkDensity = readability.getLinkDensity(siblingNode);var nodeContent = readability.getInnerText(siblingNode);var nodeLength  = nodeContent.length;if(nodeLength > 80 && linkDensity < 0.25){append = true;}else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1){append = true;}}if(append) {dbg("Appending node: " + siblingNode);var nodeToAppend = null;if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');nodeToAppend = document.createElement("DIV");try {nodeToAppend.id = siblingNode.id;nodeToAppend.innerHTML = siblingNode.innerHTML;}catch(er) {dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");nodeToAppend = siblingNode;s-=1;sl-=1;}} else {nodeToAppend = siblingNode;s-=1;sl-=1;}/* To ensure a node does not interfere with readability styles, remove its classnames */nodeToAppend.className = "";/* Append sibling and subtract from our list because it removes the node when you append to another node */articleContent.appendChild(nodeToAppend);}
}

可以看到，里边用到了很多很trick的技巧，比如25字以下的段落不计分。

整个读下来，还是很有趣的。

由于Readability解决的需求很通用，于是其他语言的程序员纷纷移植了该算法。

PHP版本：https://github.com/feelinglucky/php-readability
Java版本：https://github.com/wuman/JReadability
当然会有Node版本了：https://www.npmjs.org/package/node-readability

参考：https://www.jianshu.com/p/b9cbb843e807
https://blog.csdn.net/qq_40659982/article/details/88071546
http://get.ftqq.com/130.get

本文来自互联网用户投稿，文章观点仅代表作者本人，不代表本站立场，不承担相关法律责任。如若转载，请注明出处。 如若内容造成侵权/违法违规/事实不符，请点击【内容举报】进行投诉反馈！

标签：技术

上一篇 > Blob对象以及网页文件导出下载
下一篇 > push和pull技术对比

Duilib中list控件支持ctrl和shif多行选中的实现

[ICML2015]Batch Normalization:Accelerating Deep Network Training by Reducing Internal Covariate Shif

win10系统微软输入法于eclipse ctrl+shif+f冲突间接处理办法

Codeforces Round #259 (Div. 2) B. Little Pony and Sort by Shif

读LDD3，内存映射与DMA--PAGE_SHIF…

VMware虚拟机安装XP【要先分区，再设置BOOT 启动CD，shif+上移】

更换iBus五笔的左与右Shif

sublime ctrl+shif+f 没用解决办法

idea 对 ctrl + z 的撤销是 ctrl + shif + z

计算机最早的设计师应用于,计算机应用基础选择题doc.doc

win10自带截图神器：Win+Shift+S

Python基础之文件目录操作

python简述目录_Python基础之文件目录操作(示例代码)

tp5 如何做数据采集

任务2-7(服务器字体+阿里巴巴矢量库)

html标签（1)：h1~h6,p,br,pre,hr

TI 电量计介绍与芯片选型指南

几款TI电源芯片简介

TI DSP芯片C2000系列读取FLASH数据

德州仪器(Ti)平台嵌入式开发基础

TI三相电机智能栅极驱动芯片特点分类

省选模拟（12.08） T3 圈圈圈圈圈圈圈圈

Hadoop生态圈技术栈（上）

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之6.Impala交互式查询

小猿圈之Linux下Mysql 操作命令

大数据Hadoop生态圈常用面试题

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之4.Hive DDL、DQL和数据操作

备战Noip2018模拟赛11（B组）T3 Monogatari 物语

【智能优化算法-圆圈搜索算法】基于圆圈搜索算法Circle Search Algorithm求解单目标优化问题附matlab代码

NYOJ 78 圈水池

递归问题跑道汽车绕圈问题 Python实现

Hadoop生态圈（三）：MapReduce

[955]readability网页内容提取器

文章目录

相关阅读1

Example Domain

相关阅读2

相关阅读3

相关文章