/** * 文本差异化比对工具,用于比对两段文本之间所存在的差异; * 对HTML文本的比对中,会完全忽略标签内的差异,将<...> * 中的内容当做一个整体的单词; * 注1:现有版本还未对执行效率进行优化; * 注2:原始算法来自于网络:http://www.rohland.co.za/index.php/2009/10/31/csharp-html-diff-algorithm/ * 注3:效率真低啊..........╮(╯▽╰)╭ * @author zhoujq | http://weibo.com/buxiaodewa */ (function () { var root = self; /** * 匹配描述块,一个用来表示相同内容块在新旧文档内位置的描述对象 * @param {Number} startInOld [相同部分在旧文档中的起始位置] * @param {Number} startInNew [相同部分在新文档中的起始位置] * @param {Number} size [相同部分的连续词元个数] */ var Match = function (startInOld, startInNew, size) { this.size = size; this.startInOld = startInOld; this.startInNew = startInNew; this.endInOld = this.startInOld + this.size; this.endInNew = this.startInNew + this.size; }; /** * 操作单元,一个操作单元描述了在具体位置所要 * 执行的具体操作(如:insert、delete、equal与replace) * @param {Function} action * @param {Number} startInOld [相同部分在旧文档中的起始位置] * @param {Number} endInOld [相同部分在旧文档中的结束位置] * @param {Number} startInNew [相同部分在新文档中的起始位置] * @param {Number} endInNew [相同部分在新文档中的结束位置] */ var Operation = function (action, startInOld, endInOld, startInNew, endInNew) { this.action = action; this.endInOld = endInOld; this.endInNew = endInNew; this.startInOld = startInOld; this.startInNew = startInNew; }; /** * 将HTML解析成单词列表 * 例: Hello World ---> [""," ", "Hello", " ", "World", " ", ""] * @param {String} html * @return {Array} */ var convertHtml2Words = function (html) { return html.match(/<[^>]+>|[^<|>|\w]|\w+\b|\s+/mg); }; /** * 差异对比构建类 * @param {String} oldVersion 老文档 * @param {String} newVersion 新文档 */ var DiffBuilder = function (oldVersion, newVersion) { this.oldVersion = oldVersion; this.newVersion = newVersion; this.operation = null; this.wordIndices = {}; this.oldWords = []; this.newWords = []; this.content = []; }; DiffBuilder.prototype = { build: function () { var sd = new Date(); this.splitInputs2Words(); this.indexNewWords(); this.operations = this.getOperations(); this.performOperation(); return this.content.join(''); }, performOperation: function () { var opt; for (var i = 0, len = this.operations.length; i < len; i++) { opt = this.operations[i]; this.operation = opt; opt.action.call(this, opt); } }, /** * 将传入的文本切割成词元 */ splitInputs2Words: function () { if (this.oldVersion == "") this.oldWords = []; else this.oldWords = convertHtml2Words(this.oldVersion); if (this.newVersion == "") this.newWords = []; else this.newWords = convertHtml2Words(this.newVersion); }, /** * 构建一个newWords队列对应的索引表 * 例如: * this.newWords = ["", " ", "Hello", " ", "World", " ", ""]; * * 构建成---> * * this.wordIndices = { * "" : [0], * " " : [1, 3, 5] * "Hello" : [2], * "World" : [4], * "" : [6] * }; */ indexNewWords: function () { var newWords = this.newWords; var wordIndices = this.wordIndices; var key, len = newWords.length; var toString = Object.prototype.toString; for (var i = 0; i < len; i++) { key = newWords[i]; if (toString.call(wordIndices[key]) !== '[object Array]') { wordIndices[key] = []; } wordIndices[key].push(i); } }, /** * 将文档抽象为操作描述队列 * 例如:[euqal, insert, equal],一篇文档可以被抽象为由多个 * “操作命令与位置信息”组成的操作队列; * @return {Array} */ getOperations: function () { var optInOld = 0; var optInNew = 0; var operations = []; var match, action; var matchStartsInOld; var matchStartsInNew; var matchs = this.matchingBlocks(); matchs.push(new Match(this.oldWords.length, this.newWords.length, 0)); for (var i = 0, len = matchs.length; i < len; i++) { action = null; match = matchs[i]; matchStartsInOld = (optInOld === match.startInOld); matchStartsInNew = (optInNew === match.startInNew); if (!matchStartsInOld && !matchStartsInNew) { action = this.replace; } else if (matchStartsInOld && !matchStartsInNew) { action = this.insert; } else if (!matchStartsInOld && matchStartsInNew) { action = this.del; } if (action) { operations.push(new Operation(action, optInOld, match.startInOld, optInNew, match.startInNew)); } if (match.size) { operations.push(new Operation(this.equal, match.startInOld, match.endInOld, match.startInNew, match.endInNew)); } optInOld = match.endInOld; optInNew = match.endInNew; } return operations; }, /** * 获取一个用于描述新旧文档内全部相同内容的匹配描述列表 * @return {Array} 文档匹配描述列表 */ matchingBlocks: function () { var matchingBlocks = []; this.findMatchingBlocks(0, this.oldWords.length, 0, this.newWords.length, matchingBlocks); return matchingBlocks; }, /** * 递归查找匹配项 * @param {Number} startInOld * @param {Number} endInOld * @param {Number} startInNew * @param {Number} endInNew * @param {Array} matchingBlocks */ findMatchingBlocks: function (startInOld, endInOld, startInNew, endInNew, matchingBlocks) { var match = this.findMatch(startInOld, endInOld, startInNew, endInNew); if (!match) { return; } if (startInOld < match.startInOld && startInNew < match.startInNew) { this.findMatchingBlocks(startInOld, match.startInOld, startInNew, match.startInNew, matchingBlocks); } matchingBlocks.push(match); if (match.endInOld < endInOld && match.endInNew < endInNew) { this.findMatchingBlocks(match.endInOld, endInOld, match.endInNew, endInNew, matchingBlocks); } }, /** * 从指定位置开始查询第一块匹配的文本块 * @param {Number} startInOld * @param {Number} endInOld * @param {Number} startInNew * @param {Number} endInNew */ findMatch: function (startInOld, endInOld, startInNew, endInNew) { var bestMatchInOld = startInOld; var bestMatchInNew = startInNew; var bestMatchSize = 0; var matchLengthAt = {}; var newMatchLengthAt; var newMatchLength, wordIndexList, matchIndex, len; for (var idxOld = startInOld; idxOld < endInOld; idxOld++) { newMatchLengthAt = {}; wordIndexList = this.wordIndices[this.oldWords[idxOld]]; len = wordIndexList ? wordIndexList.length : 0; for (var i = 0; i < len; i++) { matchIndex = wordIndexList[i]; if (matchIndex < startInNew) { continue; } if (matchIndex >= endInNew) { break; } newMatchLength = (matchLengthAt[matchIndex - 1] || 0) + 1; newMatchLengthAt[matchIndex] = newMatchLength; if (newMatchLength > bestMatchSize) { bestMatchInOld = idxOld - newMatchLength + 1; bestMatchInNew = matchIndex - newMatchLength + 1; bestMatchSize = newMatchLength; } } matchLengthAt = newMatchLengthAt; } return bestMatchSize ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize) : null; }, insert: function (opt, tagCls) { //this.insertTag('ins', tagCls || 'diffins', this.newWords.slice(opt.startInNew, opt.endInNew)); }, del: function (opt, tagCls) { this.insertTag('span', tagCls || 'diff', this.oldWords.slice(opt.startInOld, opt.endInOld)); }, equal: function (opt) { this.content = this.content.concat(this.newWords.slice(opt.startInNew, opt.endInNew)); }, replace: function (opt) { this.del(opt, 'diffmod'); this.insert(opt, 'diffmod'); }, /** * 添加标签 * @param {String} tagName * @param {String} cssCls * @param {String} words */ insertTag: function (tagName, cssCls, words) { var nonTags; var that = this; while (words.length) { //获取words内“连续”的非标签字符 nonTags = this.extractConsecutiveWords(words, true); if (nonTags.length) { this.content.push( this.warpText(nonTags.join(''), tagName, cssCls)); } if (words.length) { //获取words内“连续”的标签字符 this.content = this.content.concat( this.extractConsecutiveWords(words, false)); } } }, /** * 获取words内连续的“文本”或“标签” * @param {Array} words * @param {Boolean} isTag * @return {Array} */ extractConsecutiveWords: function (words, isTag) { var idxOfFirstTag = null; for (var i = 0, len = words.length; i < len; i++) { //注:是取判定条件的前一位 if (this.isTag(words[i]) == isTag) { idxOfFirstTag = i; break; } } return words.splice(0, idxOfFirstTag !== null ? idxOfFirstTag : words.length); }, warpText: function (text, tagName, cssCls) { return '<' + tagName + ' class="' + cssCls + '">' + text + ''; }, isOpeningTag: function (item) { return /^\s*<[^>]+>\s*$/ig.test(item); }, isClosingTag: function (item) { return /^\s*<\/[^>]+>\s*$/ig.test(item); }, isTag: function (item) { return this.isOpeningTag(item) ? 1 : this.isClosingTag(item) ? 2 : 0; } }; root.getHTMLDiff = function (oldVersion, newVersion) { return new DiffBuilder(oldVersion, newVersion).build(); }; /** * 针对高级浏览器开启webWorker支持 */ if (typeof postMessage === "function") { onmessage = function (evt) { var data = evt.data; var diff = getHTMLDiff(data.oldVersion, data.newVersion); postMessage(diff); }; } })();