src/Utilities/StringComparators/EditDistanceStringComparator.php
<?php
declare( strict_types = 1 );
namespace MediaWiki\Extension\Translate\Utilities\StringComparators;
/**
* Smart string comparator that uses simple string comparison, and then
* the levenshtein algorithm to compare two strings.
* @author Abijeet Patro
* @since 2023.11
* @license GPL-2.0-or-later
*/
class EditDistanceStringComparator implements StringComparator {
private SimpleStringComparator $simpleStringComparator;
public function __construct() {
$this->simpleStringComparator = new SimpleStringComparator();
}
public function getSimilarity( $a, $b ) {
$similarity = $this->simpleStringComparator->getSimilarity( $a, $b );
if ( $similarity > 0.9 ) {
return $similarity;
}
$similarity = $this->levenshtein( $a, $b, mb_strlen( $a ), mb_strlen( $b ) );
if ( $similarity === -1 ) {
return 0;
}
// See: https://stackoverflow.com/a/59585447
$maxLength = max( strlen( $a ), strlen( $b ) );
return ( $maxLength - $similarity ) / $maxLength;
}
/**
* PHP implementation of Levenshtein edit distance algorithm. Uses the native PHP implementation
* when possible for speed. The native levenshtein is limited to 255 bytes.
*/
public function levenshtein( string $str1, string $str2, int $length1, int $length2 ): int {
if ( $length1 === 0 ) {
return $length2;
}
if ( $length2 === 0 ) {
return $length1;
}
if ( $str1 === $str2 ) {
return 0;
}
$byteLength1 = strlen( $str1 );
$byteLength2 = strlen( $str2 );
if ( $byteLength1 === $length1 && $byteLength1 <= 255
&& $byteLength2 === $length2 && $byteLength2 <= 255
) {
return levenshtein( $str1, $str2 );
}
$prevRow = range( 0, $length2 );
for ( $i = 0; $i < $length1; $i++ ) {
$currentRow = [];
$currentRow[0] = $i + 1;
$c1 = mb_substr( $str1, $i, 1 );
for ( $j = 0; $j < $length2; $j++ ) {
$c2 = mb_substr( $str2, $j, 1 );
$insertions = $prevRow[$j + 1] + 1;
$deletions = $currentRow[$j] + 1;
$substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
$currentRow[] = min( $insertions, $deletions, $substitutions );
}
$prevRow = $currentRow;
}
return $prevRow[$length2];
}
}