Python “编辑距离”(Levens
短信预约 -IT技能 免费直播动态提醒
本文搜集了网上比较常用的几种计算Levenshtein distance的函数,
其中函数(1)为调用数学工具包Numpy, 函数(2)和(1)算法类似,都是采用DP, (3)来自wiki(4)是直接调用python的第三方库Levenshtein
源码和结果如下:
import time
from functools import wraps
import cProfile
import numpy
import Levenshtein
def fn_timer(function):
@wraps(function)
def function_timer(*args, **kwargs):
t0 = time.time()
result = function(*args, **kwargs)
t1 = time.time()
print ("Total time running %s: %s seconds" %
(function.func_name, str(t1-t0))
)
return result
return function_timer
def levenshtein1(source, target):
if len(source) < len(target):
return levenshtein1(target, source)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)
# We call tuple() to force strings to be used as sequences
# ('c', 'a', 't', 's') - numpy uses them as values by default.
source = numpy.array(tuple(source))
target = numpy.array(tuple(target))
# We use a dynamic programming algorithm, but with the
# added optimization that we only need the last two rows
# of the matrix.
previous_row = numpy.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1
# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = numpy.minimum(
current_row[1:],
numpy.add(previous_row[:-1], target != s))
# Deletion (target grows shorter than source):
current_row[1:] = numpy.minimum(
current_row[1:],
current_row[0:-1] + 1)
previous_row = current_row
return previous_row[-1]
def levenshtein2(s1, s2):
if len(s1) < len(s2):
return levenshtein2(s2, s1)
# len(s1) >= len(s2)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
deletions = current_row[j] + 1 # than s2
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def levenshtein3(s, t):
''' From Wikipedia article; Iterative with two matrix rows. '''
if s == t: return 0
elif len(s) == 0: return len(t)
elif len(t) == 0: return len(s)
v0 = [None] * (len(t) + 1)
v1 = [None] * (len(t) + 1)
for i in range(len(v0)):
v0[i] = i
for i in range(len(s)):
v1[0] = i + 1
for j in range(len(t)):
cost = 0 if s[i] == t[j] else 1
v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
for j in range(len(v0)):
v0[j] = v1[j]
return v1[len(t)]
@fn_timer
def calllevenshtein1(s,t,n):
for i in range(n):
levenshtein3(s,t)
@fn_timer
def calllevenshtein2(s,t,n):
for i in range(n):
levenshtein3(s,t)
@fn_timer
def calllevenshtein3(s,t,n):
for i in range(n):
levenshtein3(s,t)
@fn_timer
def calllevenshtein4(s,t,n):
for i in range(n):
Levenshtein.distance(s,t)
if __name__ == "__main__":
n = 50000
a = 'abddcdefdgbd22svb'
b = 'bcdefg34rdyvdfsd'
calllevenshtein1(a, b, n)
calllevenshtein2(a, b, n)
calllevenshtein3(a, b, n)
calllevenshtein4(a, b, n)
结果:
Total time running calllevenshtein1: 16.0750000477 seconds
Total time running calllevenshtein2: 16.4990000725 seconds
Total time running calllevenshtein3: 16.2939999104 seconds
Total time running calllevenshtein4: 0.0629999637604 seconds
从结果来看,调用python第三方包效率最高,原因是其内部调用c库,优化了算法结构
免责声明:
① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。
② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341