summaryrefslogtreecommitdiff
path: root/data/dedupe.py
blob: 1ba21ac7004d43c144d7ad6179e40277b2464889 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import re

_LYRICS_SUFFIX = ".txt"
_STRIP_CHARS = ",.?!'\"():"


def _track_names(lyrics_dir):
    """Return the track names found in *lyrics_dir*, in ``os.listdir`` order.

    A track name is a file name with its trailing ``.txt`` removed. Entries
    that do not end in ``.txt`` are skipped, so stray files or directories
    are never mistaken for lyric files.
    """
    return [
        fn[: -len(_LYRICS_SUFFIX)]
        for fn in os.listdir(lyrics_dir)
        if fn.endswith(_LYRICS_SUFFIX)
    ]


def _normalize(token):
    """Return *token* stripped of edge punctuation, lower-cased, minus 's / 'd.

    Lower-casing happens before the contraction removal so that upper-case
    text such as ``IT'S`` normalizes the same way as ``it's``.
    """
    return token.strip(_STRIP_CHARS).lower().replace("'s", "").replace("'d", "")


def _load_words(path):
    """Return the set of normalized, non-empty words in the file at *path*."""
    with open(path) as lyric_file:
        words = {_normalize(token) for token in lyric_file.read().split()}
    # Tokens made only of punctuation (e.g. "...") normalize to "", which is
    # not a word and would otherwise match an empty variation.
    words.discard("")
    return words


def _variations(word):
    """Return the spellings of *word* to look for, in priority order.

    The list holds *word* with each of the suffixes s/es/d/ed/ing appended,
    followed by *word* with each of those suffixes removed (which yields
    *word* itself whenever the suffix is absent). Empty strings and repeats
    are dropped; the order of first occurrence is kept.
    """
    # there might be false positives, e.g. even <-> evening, to <-> toes
    candidates = [
        word + "s",
        word + "es",
        word + "d",
        word + "ed",
        word + "ing",
        re.sub(r"s$", "", word),
        re.sub(r"es$", "", word),
        re.sub(r"d$", "", word),
        re.sub(r"ed$", "", word),
        re.sub(r"ing$", "", word),
    ]
    return list(dict.fromkeys(c for c in candidates if c))


def _find_dupe(track, word, tracks, lyrics):
    """Return ``(variation, other_track)`` for the first match, else ``None``.

    Tracks are scanned in the order given by *tracks*, skipping *track*
    itself; within a track the variations are tried in priority order.
    """
    candidates = _variations(word)
    for other_track in tracks:
        if other_track == track:
            continue
        for var in candidates:
            if var in lyrics[other_track]:
                return var, other_track
    return None


def main(lyrics_dir="all", list_path="list"):
    """Print every listed word that also occurs in another track's lyrics.

    Args:
        lyrics_dir: Directory holding one ``<track>.txt`` lyric file per track.
        list_path: File with one ``track<TAB>word`` pair per line. Blank
            lines are ignored.

    Raises:
        ValueError: If a non-blank line of *list_path* does not consist of
            exactly two tab-separated fields.
        OSError: If *lyrics_dir*, a lyric file or *list_path* cannot be read.
    """
    tracks = _track_names(lyrics_dir)
    lyrics = {
        track: _load_words(os.path.join(lyrics_dir, track + _LYRICS_SUFFIX))
        for track in tracks
    }

    with open(list_path) as list_file:
        lines = list_file.read().splitlines()

    for line in lines:
        if not line.strip():
            continue
        track, word = line.split("\t")
        match = _find_dupe(track, word, tracks, lyrics)
        if match is not None:
            var, other_track = match
            print(f"{word} ({track}), {var} ({other_track})")


if __name__ == "__main__":
    main()