How to Align Items on Two Lists Despite Spelling Variations Using Python

Sometimes you end up with two backup folders in which most files appear identical. To your annoyance, however, some files are missing from one folder or the other, and some are named slightly differently, so when you try to build one complete backup folder you can't tell which files to keep and which to throw away. The following Python code uses the Levenshtein distance to align the items of two lists despite such spelling variations. First, install a Levenshtein library and pandas by opening a terminal and entering "pip install python-Levenshtein" (or "pip install levenshtein") and "pip install pandas".

from collections.abc import Sequence
import numpy as np
import pandas as pd
import Levenshtein


def get_ratios(ser1: Sequence, ser2: Sequence):
    """Build the pairwise similarity table of two sequences of strings.

    Args:
        ser1: Items that label the rows of the result.
        ser2: Items that label the columns of the result.

    Returns:
        A DataFrame in which the cell at (row item, column item) holds
        ``Levenshtein.ratio(row item, column item)``.
    """
    matrix = [[Levenshtein.ratio(left, right) for right in ser2] for left in ser1]
    return pd.DataFrame(matrix, index=ser1, columns=ser2)


def make_series(arr, name=None):
    """Return ``arr`` as a pandas Series, naming it ``name`` when it has no name.

    Args:
        arr: A pandas Series (or subclass instance) or any array-like
            accepted by ``pd.Series``.
        name: Name for the resulting Series when ``arr`` does not already
            carry one.

    Returns:
        ``arr`` itself when it is already a named Series (or when no
        fallback ``name`` is given); a renamed copy when it is an unnamed
        Series and ``name`` is given; otherwise a new Series built from
        ``arr``.
    """
    # isinstance (rather than an exact type comparison) also accepts Series
    # subclasses instead of needlessly re-wrapping them.
    if isinstance(arr, pd.Series):
        # An unnamed Series would otherwise leave callers that use ``.name``
        # as a column label with ``None``.
        if arr.name is None and name is not None:
            return arr.rename(name)
        return arr
    return pd.Series(arr, name=name)


def sort_key(arr, reference=None):
    """Map each item of ``arr`` to its first position in a reference ordering.

    Intended as the ``key`` callable of ``DataFrame.sort_values``, which
    calls it once per sort column.

    Args:
        arr: Iterable of items to rank (for example one DataFrame column).
        reference: Iterable defining the ordering. When omitted, the
            module-level ``ser1`` and ``ser2`` are concatenated and used,
            as the original implementation did.

    Returns:
        A list with one entry per item of ``arr``: the index of the item's
        first occurrence in the reference ordering, or ``None`` when the
        item does not occur there (e.g. a missing value).
    """
    if reference is None:
        # Legacy call style kept for backward compatibility; this raises
        # NameError unless ``ser1`` and ``ser2`` exist at module level.
        reference = pd.concat([ser1, ser2])

    # First-occurrence lookup table: one pass instead of a linear
    # ``list.index`` scan per item.
    first_position = {}
    for position, item in enumerate(reference):
        first_position.setdefault(item, position)
    return [first_position.get(item) for item in arr]


def align_lists(ser1, ser2, threshold=0):
    """Align the items of two lists by their Levenshtein similarity.

    Each item of the shorter input (``ser2`` when the lengths are equal) is
    paired with the item of the longer input that has the highest
    ``Levenshtein.ratio``; the pair is kept only when that score is strictly
    greater than ``threshold``. Items of either input that appear in no kept
    pair are listed on their own row with a score of 0.

    Args:
        ser1: First list of strings (Series or array-like). Non-Series
            input is wrapped in a Series named "series1".
        ser2: Second list of strings (Series or array-like). Non-Series
            input is wrapped in a Series named "series2".
        threshold: Minimum similarity (exclusive) for two items to be
            reported as a pair.

    Returns:
        A DataFrame with one column per input, labelled with the names of
        the two Series, plus a "score" column. Rows are ordered by the
        position of their items in ``ser1`` and then ``ser2``; missing
        partners are shown as empty strings.
    """
    ser1 = make_series(ser1, name="series1")
    ser2 = make_series(ser2, name="series2")

    # Match from the shorter list into the longer one.
    if len(ser1) < len(ser2):
        ser_s, ser_l = ser1, ser2
    else:
        ser_s, ser_l = ser2, ser1

    # Rows are items of the longer list, columns items of the shorter list.
    ratios = get_ratios(ser_l, ser_s)
    best_scores = ratios.max(axis=0)
    ser_s_in_pairs = best_scores[best_scores > threshold]

    # Row position of the best partner for every short-list item that passed
    # the threshold; argmax runs on a plain ndarray so the result is always
    # an integer array usable with ``iloc``.
    best_rows = np.argmax(
        ratios.loc[:, ser_s_in_pairs.index].to_numpy(), axis=0
    )

    pairs = (
        pd.DataFrame(ser_s_in_pairs)
        .assign(**{ser_l.name: ser_l.iloc[best_rows].values})
        .reset_index()
        .rename(columns={0: "score"})
    )

    unmatched_l = pd.DataFrame(ser_l[~ser_l.isin(pairs[ser_l.name])]).assign(score=0)
    unmatched_s = pd.DataFrame(ser_s[~ser_s.isin(pairs[ser_s.name])]).assign(score=0)

    # Order rows by where their items sit in the caller's own inputs. The
    # ordering is passed explicitly because ``ser1``/``ser2`` are local to
    # this function and invisible to the module-level ``sort_key``.
    input_order = list(pd.concat([ser1, ser2]))

    def _by_input_position(column):
        return sort_key(column, reference=input_order)

    table = (
        pd.concat([pairs, unmatched_l, unmatched_s], ignore_index=True)
        .sort_values([ser1.name, ser2.name], key=_by_input_position)
        .fillna("")
        .reset_index(drop=True)[[ser1.name, ser2.name, "score"]]
    )
    return table

Example

Lists 1 and 2 show recent box-office movies, but they are not identical and have spelling variations.

List 1

  • Black Panther: Wakanda Forever
  • Barbie
  • The Super Mario Brothers Movie
  • Spider-Man: Across the Spider-Verse
  • Guardians of the Galaxy Vol. 3
  • Oppenheimer
  • The Little Mermaid
  • Ant-Man and the Wasp: Quantumania
  • John Wick: Chapter 4
  • Sound of Freedom
  • Taylor Swift: The Eras Tour

List 2

  • The Little Mermaid
  • Taylor Swift's Eras Tour
  • Ant-Man and the Wasp: Quantumania
  • The Super Mario Bros. Movie
  • Top Gun: Maverick
  • Oppenheimer
  • Barbecue
  • Guardians of the Galaxy Volume Three
  • John Wick 4.0
  • sound of freedom
  • Avatar: The Way of Water

Let's align items on these lists.

Script

# First list of titles; the Series name is used as its column label in the
# aligned table.
list1 = pd.Series(
    name="List 1",
    data=[
        "Black Panther: Wakanda Forever",
        "Barbie",
        "The Super Mario Brothers Movie",
        "Spider-Man: Across the Spider-Verse",
        "Guardians of the Galaxy Vol. 3",
        "Oppenheimer",
        "The Little Mermaid",
        "Ant-Man and the Wasp: Quantumania",
        "John Wick: Chapter 4",
        "Sound of Freedom",
        "Taylor Swift: The Eras Tour",
    ],
)

# Second list: overlaps with the first, but with spelling variations and a
# few titles that appear in only one of the two lists.
list2 = pd.Series(
    name="List 2",
    data=[
        "The Little Mermaid",
        "Taylor Swift's Eras Tour",
        "Ant-Man and the Wasp: Quantumania",
        "The Super Mario Bros. Movie",
        "Top Gun: Maverick",
        "Oppenheimer",
        "Barbecue",
        "Guardians of the Galaxy Volume Three",
        "John Wick 4.0",
        "sound of freedom",
        "Avatar: The Way of Water",
    ],
)

# Pair up titles whose best similarity score exceeds 0.6; everything else is
# reported unmatched with a score of 0.
align_lists(list1, list2, threshold=0.6)

Results

|    | List 1                              | List 2                               | score |
|----|-------------------------------------|--------------------------------------|-------|
| 0  | The Little Mermaid                  | The Little Mermaid                   | 1.0   |
| 1  | Taylor Swift: The Eras Tour         | Taylor Swift's Eras Tour             | 0.86  |
| 2  | Ant-Man and the Wasp: Quantumania   | Ant-Man and the Wasp: Quantumania    | 1.0   |
| 3  | The Super Mario Brothers Movie      | The Super Mario Bros. Movie          | 0.91  |
| 4  | Oppenheimer                         | Oppenheimer                          | 1.0   |
| 5  | Barbie                              | Barbecue                             | 0.71  |
| 6  | Guardians of the Galaxy Vol. 3      | Guardians of the Galaxy Volume Three | 0.85  |
| 7  | John Wick: Chapter 4                | John Wick 4.0                        | 0.67  |
| 8  | Sound of Freedom                    | sound of freedom                     | 0.88  |
| 9  | Black Panther: Wakanda Forever      |                                      | 0.0   |
| 10 | Spider-Man: Across the Spider-Verse |                                      | 0.0   |
| 11 |                                     | Top Gun: Maverick                    | 0.0   |
| 12 |                                     | Avatar: The Way of Water             | 0.0   |
Results

Leave a Reply

Your email address will not be published. Required fields are marked *