/**
* 文本差异化比对工具,用于比对两段文本之间所存在的差异;
* 对HTML文本的比对中,会完全忽略标签内的差异,将<...>
* 中的内容当做一个整体的单词;
* 注1:现有版本还未对执行效率进行优化;
* 注2:原始算法来自于网络:http://www.rohland.co.za/index.php/2009/10/31/csharp-html-diff-algorithm/
* 注3:效率真低啊..........╮(╯▽╰)╭
* @author zhoujq | http://weibo.com/buxiaodewa
*/
(function () {
var root = self;
/**
* 匹配描述块,一个用来表示相同内容块在新旧文档内位置的描述对象
* @param {Number} startInOld [相同部分在旧文档中的起始位置]
* @param {Number} startInNew [相同部分在新文档中的起始位置]
* @param {Number} size [相同部分的连续词元个数]
*/
var Match = function (startInOld, startInNew, size) {
this.size = size;
this.startInOld = startInOld;
this.startInNew = startInNew;
this.endInOld = this.startInOld + this.size;
this.endInNew = this.startInNew + this.size;
};
/**
* 操作单元,一个操作单元描述了在具体位置所要
* 执行的具体操作(如:insert、delete、equal与replace)
* @param {Function} action
* @param {Number} startInOld [相同部分在旧文档中的起始位置]
* @param {Number} endInOld [相同部分在旧文档中的结束位置]
* @param {Number} startInNew [相同部分在新文档中的起始位置]
* @param {Number} endInNew [相同部分在新文档中的结束位置]
*/
var Operation = function (action, startInOld, endInOld, startInNew, endInNew) {
this.action = action;
this.endInOld = endInOld;
this.endInNew = endInNew;
this.startInOld = startInOld;
this.startInNew = startInNew;
};
/**
* 将HTML解析成单词列表
* 例: Hello World ---> [""," ", "Hello", " ", "World", " ", ""]
* @param {String} html
* @return {Array}
*/
var convertHtml2Words = function (html) {
return html.match(/<[^>]+>|[^<|>|\w]|\w+\b|\s+/mg);
};
/**
* 差异对比构建类
* @param {String} oldVersion 老文档
* @param {String} newVersion 新文档
*/
var DiffBuilder = function (oldVersion, newVersion) {
this.oldVersion = oldVersion;
this.newVersion = newVersion;
this.operation = null;
this.wordIndices = {};
this.oldWords = [];
this.newWords = [];
this.content = [];
};
DiffBuilder.prototype = {
build: function () {
var sd = new Date();
this.splitInputs2Words();
this.indexNewWords();
this.operations = this.getOperations();
this.performOperation();
return this.content.join('');
},
performOperation: function () {
var opt;
for (var i = 0, len = this.operations.length; i < len; i++) {
opt = this.operations[i];
this.operation = opt;
opt.action.call(this, opt);
}
},
/**
* 将传入的文本切割成词元
*/
splitInputs2Words: function () {
if (this.oldVersion == "")
this.oldWords = [];
else
this.oldWords = convertHtml2Words(this.oldVersion);
if (this.newVersion == "")
this.newWords = [];
else
this.newWords = convertHtml2Words(this.newVersion);
},
/**
* 构建一个newWords队列对应的索引表
* 例如:
* this.newWords = ["", " ", "Hello", " ", "World", " ", ""];
*
* 构建成--->
*
* this.wordIndices = {
* "" : [0],
* " " : [1, 3, 5]
* "Hello" : [2],
* "World" : [4],
* "" : [6]
* };
*/
indexNewWords: function () {
var newWords = this.newWords;
var wordIndices = this.wordIndices;
var key, len = newWords.length;
var toString = Object.prototype.toString;
for (var i = 0; i < len; i++) {
key = newWords[i];
if (toString.call(wordIndices[key]) !== '[object Array]') {
wordIndices[key] = [];
}
wordIndices[key].push(i);
}
},
/**
* 将文档抽象为操作描述队列
* 例如:[euqal, insert, equal],一篇文档可以被抽象为由多个
* “操作命令与位置信息”组成的操作队列;
* @return {Array}
*/
getOperations: function () {
var optInOld = 0;
var optInNew = 0;
var operations = [];
var match, action;
var matchStartsInOld;
var matchStartsInNew;
var matchs = this.matchingBlocks();
matchs.push(new Match(this.oldWords.length, this.newWords.length, 0));
for (var i = 0, len = matchs.length; i < len; i++) {
action = null;
match = matchs[i];
matchStartsInOld = (optInOld === match.startInOld);
matchStartsInNew = (optInNew === match.startInNew);
if (!matchStartsInOld && !matchStartsInNew) {
action = this.replace;
} else if (matchStartsInOld && !matchStartsInNew) {
action = this.insert;
} else if (!matchStartsInOld && matchStartsInNew) {
action = this.del;
}
if (action) {
operations.push(new Operation(action, optInOld, match.startInOld, optInNew, match.startInNew));
}
if (match.size) {
operations.push(new Operation(this.equal, match.startInOld, match.endInOld, match.startInNew, match.endInNew));
}
optInOld = match.endInOld;
optInNew = match.endInNew;
}
return operations;
},
/**
* 获取一个用于描述新旧文档内全部相同内容的匹配描述列表
* @return {Array} 文档匹配描述列表
*/
matchingBlocks: function () {
var matchingBlocks = [];
this.findMatchingBlocks(0, this.oldWords.length, 0, this.newWords.length, matchingBlocks);
return matchingBlocks;
},
/**
* 递归查找匹配项
* @param {Number} startInOld
* @param {Number} endInOld
* @param {Number} startInNew
* @param {Number} endInNew
* @param {Array} matchingBlocks
*/
findMatchingBlocks: function (startInOld, endInOld, startInNew, endInNew, matchingBlocks) {
var match = this.findMatch(startInOld, endInOld, startInNew, endInNew);
if (!match) {
return;
}
if (startInOld < match.startInOld && startInNew < match.startInNew) {
this.findMatchingBlocks(startInOld, match.startInOld, startInNew, match.startInNew, matchingBlocks);
}
matchingBlocks.push(match);
if (match.endInOld < endInOld && match.endInNew < endInNew) {
this.findMatchingBlocks(match.endInOld, endInOld, match.endInNew, endInNew, matchingBlocks);
}
},
/**
* 从指定位置开始查询第一块匹配的文本块
* @param {Number} startInOld
* @param {Number} endInOld
* @param {Number} startInNew
* @param {Number} endInNew
*/
findMatch: function (startInOld, endInOld, startInNew, endInNew) {
var bestMatchInOld = startInOld;
var bestMatchInNew = startInNew;
var bestMatchSize = 0;
var matchLengthAt = {};
var newMatchLengthAt;
var newMatchLength, wordIndexList, matchIndex, len;
for (var idxOld = startInOld; idxOld < endInOld; idxOld++) {
newMatchLengthAt = {};
wordIndexList = this.wordIndices[this.oldWords[idxOld]];
len = wordIndexList ? wordIndexList.length : 0;
for (var i = 0; i < len; i++) {
matchIndex = wordIndexList[i];
if (matchIndex < startInNew) {
continue;
}
if (matchIndex >= endInNew) {
break;
}
newMatchLength = (matchLengthAt[matchIndex - 1] || 0) + 1;
newMatchLengthAt[matchIndex] = newMatchLength;
if (newMatchLength > bestMatchSize) {
bestMatchInOld = idxOld - newMatchLength + 1;
bestMatchInNew = matchIndex - newMatchLength + 1;
bestMatchSize = newMatchLength;
}
}
matchLengthAt = newMatchLengthAt;
}
return bestMatchSize ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize) : null;
},
insert: function (opt, tagCls) {
//this.insertTag('ins', tagCls || 'diffins', this.newWords.slice(opt.startInNew, opt.endInNew));
},
del: function (opt, tagCls) {
this.insertTag('span', tagCls || 'diff', this.oldWords.slice(opt.startInOld, opt.endInOld));
},
equal: function (opt) {
this.content = this.content.concat(this.newWords.slice(opt.startInNew, opt.endInNew));
},
replace: function (opt) {
this.del(opt, 'diffmod');
this.insert(opt, 'diffmod');
},
/**
* 添加标签
* @param {String} tagName
* @param {String} cssCls
* @param {String} words
*/
insertTag: function (tagName, cssCls, words) {
var nonTags;
var that = this;
while (words.length) {
//获取words内“连续”的非标签字符
nonTags = this.extractConsecutiveWords(words, true);
if (nonTags.length) {
this.content.push(
this.warpText(nonTags.join(''), tagName, cssCls));
}
if (words.length) {
//获取words内“连续”的标签字符
this.content = this.content.concat(
this.extractConsecutiveWords(words, false));
}
}
},
/**
* 获取words内连续的“文本”或“标签”
* @param {Array} words
* @param {Boolean} isTag
* @return {Array}
*/
extractConsecutiveWords: function (words, isTag) {
var idxOfFirstTag = null;
for (var i = 0, len = words.length; i < len; i++) {
//注:是取判定条件的前一位
if (this.isTag(words[i]) == isTag) {
idxOfFirstTag = i;
break;
}
}
return words.splice(0, idxOfFirstTag !== null ? idxOfFirstTag : words.length);
},
warpText: function (text, tagName, cssCls) {
return '<' + tagName + ' class="' + cssCls + '">' + text + '' + tagName + '>';
},
isOpeningTag: function (item) {
return /^\s*<[^>]+>\s*$/ig.test(item);
},
isClosingTag: function (item) {
return /^\s*<\/[^>]+>\s*$/ig.test(item);
},
isTag: function (item) {
return this.isOpeningTag(item) ? 1 : this.isClosingTag(item) ? 2 : 0;
}
};
root.getHTMLDiff = function (oldVersion, newVersion) {
return new DiffBuilder(oldVersion, newVersion).build();
};
/**
* 针对高级浏览器开启webWorker支持
*/
if (typeof postMessage === "function") {
onmessage = function (evt) {
var data = evt.data;
var diff = getHTMLDiff(data.oldVersion, data.newVersion);
postMessage(diff);
};
}
})();