How to Align Items on Two Lists Despite Spelling Variations Using Python
Sometimes you may encounter two backup folders where most files appear identical. But, to your annoyance, some are missing in one folder or the other, and some are named slightly differently, and to create a complete backup folder, you don't know which ones to keep and which ones to throw away. The following Python code uses Levenshtein distance to align the items on two lists. First, you need to install a library for Levenshtein, as well as Pandas, by opening a terminal and entering "pip install python-Levenshtein
" or "pip install levenshtein
."
from collections.abc import Sequence
import numpy as np
import pandas as pd
import Levenshtein
def get_ratios(ser1: Sequence, ser2: Sequence):
ratios = []
for t1 in ser1:
row = []
for t2 in ser2:
row.append(Levenshtein.ratio(t1, t2))
ratios.append(row)
return pd.DataFrame(ratios, index=ser1, columns=ser2)
def make_series(arr, name=None):
if type(arr) == pd.core.series.Series:
return arr
else:
return pd.Series(arr, name=name)
def sort_key(arr):
lst = list(pd.concat([ser1, ser2]))
return [lst.index(t) if t in lst else None for t in arr]
def align_lists(ser1, ser2, threshold=0):
ser1 = make_series(ser1, name="series1")
ser2 = make_series(ser2, name="series2")
if len(ser1) < len(ser2):
ser_s, ser_l = ser1, ser2
else:
ser_s, ser_l = ser2, ser1
ratios = get_ratios(ser_l, ser_s)
common = pd.DataFrame(
np.vstack([ser_l.iloc[np.argmax(ratios, axis=0)], ser_s]),
index=[ser_l.name, ser_s.name],
).T
ser_s_in_pairs = ratios.max(axis=0)[ratios.max(axis=0) > threshold]
pairs = (
pd.DataFrame(ser_s_in_pairs)
.assign(
**{
ser_l.name: ser_l.iloc[
np.argmax(ratios.loc[:, ser_s_in_pairs.index], axis=0)
].values
}
)
.reset_index()
.rename(columns={0: "score"})
)
table = (
pd.concat(
[
pairs,
pd.DataFrame(ser_l[~ser_l.isin(pairs[ser_l.name])]).assign(score=0),
pd.DataFrame(ser_s[~ser_s.isin(pairs[ser_s.name])]).assign(score=0),
],
ignore_index=True,
)
.sort_values([ser1.name, ser2.name], key=sort_key)
.fillna("")
.reset_index(drop=True)[[ser1.name, ser2.name, "score"]]
)
return table
Example
Lists 1 and 2 show recent box-office movies, but they are not identical and have spelling variations.
List 1
- Black Panther: Wakanda Forever
- Barbie
- The Super Mario Brothers Movie
- Spider-Man: Across the Spider-Verse
- Guardians of the Galaxy Vol. 3
- Oppenheimer
- The Little Mermaid
- Ant-Man and the Wasp: Quantumania
- John Wick: Chapter 4
- Sound of Freedom
- Taylor Swift: The Eras Tour
List 2
- The Little Mermaid
- Taylor Swift's Eras Tour
- Ant-Man and the Wasp: Quantumania
- The Super Mario Bros. Movie
- Top Gun: Maverick
- Oppenheimer
- Barbecue
- Guardians of the Galaxy Volume Three
- John Wick 4.0
- sound of freedom
- Avatar: The Way of Water
Let's align items on these lists.
Script
list1 = pd.Series(
[
"Black Panther: Wakanda Forever",
"Barbie",
"The Super Mario Brothers Movie",
"Spider-Man: Across the Spider-Verse",
"Guardians of the Galaxy Vol. 3",
"Oppenheimer",
"The Little Mermaid",
"Ant-Man and the Wasp: Quantumania",
"John Wick: Chapter 4",
"Sound of Freedom",
"Taylor Swift: The Eras Tour",
],
name="List 1",
)
list2 = pd.Series(
[
"The Little Mermaid",
"Taylor Swift's Eras Tour",
"Ant-Man and the Wasp: Quantumania",
"The Super Mario Bros. Movie",
"Top Gun: Maverick",
"Oppenheimer",
"Barbecue",
"Guardians of the Galaxy Volume Three",
"John Wick 4.0",
"sound of freedom",
"Avatar: The Way of Water",
],
name="List 2",
)
align_lists(list1,list2,threshold=0.6)
Results
List 1 | List 2 | score | |
---|---|---|---|
0 | The Little Mermaid | The Little Mermaid | 1.0 |
1 | Taylor Swift: The Eras Tour | Taylor Swift's Eras Tour | 0.86 |
2 | Ant-Man and the Wasp: Quantumania | Ant-Man and the Wasp: Quantumania | 1.0 |
3 | The Super Mario Brothers Movie | The Super Mario Bros. Movie | 0.91 |
4 | Oppenheimer | Oppenheimer | 1.0 |
5 | Barbie | Barbecue | 0.71 |
6 | Guardians of the Galaxy Vol. 3 | Guardians of the Galaxy Volume Three | 0.85 |
7 | John Wick: Chapter 4 | John Wick 4.0 | 0.67 |
8 | Sound of Freedom | sound of freedom | 0.88 |
9 | Black Panther: Wakanda Forever | 0.0 | |
10 | Spider-Man: Across the Spider-Verse | 0.0 | |
11 | Top Gun: Maverick | 0.0 | |
12 | Avatar: The Way of Water | 0.0 |