summaryrefslogtreecommitdiff
path: root/data/one_song_words.py
blob: 3b740650146dacdcf3bbec8274ca30a0e7584c9c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import os
from collections import Counter

# List all tracks: one per "<name>.txt" lyric file in the "all" directory.
# Only the trailing extension is cut off (str.replace would also delete a
# ".txt" occurring inside the name, producing a path that does not exist),
# and the names are sorted because os.listdir returns entries in arbitrary
# order.
tracks = sorted(
    fn[: -len(".txt")] for fn in os.listdir("all") if fn.endswith(".txt")
)
# Maps each track name to the set of normalized words found in its lyrics.
lyrics = {}

for track in tracks:
    # `with` closes the file even if reading it raises.
    with open(f"all/{track}.txt") as f:
        text = f.read()
    # find every discrete word, deduped and normalized with best effort:
    # surrounding punctuation is stripped, the word is lower-cased, and
    # "'s" / "'d" are removed. Lower-casing happens before the removal so
    # that upper-case forms such as "HE'S" normalize like "he's".
    words = {
        word.strip(",.?!'\"():").lower().replace("'s", "").replace("'d", "")
        for word in text.split()
    }
    # A token consisting only of punctuation normalizes to "", which is not
    # a word and must not be reported later.
    words.discard("")
    lyrics[track] = words

# Count, for every word, how many tracks contain it. Each track's words are
# stored as a set, so a word contributes at most once per track.
track_count = Counter(word for words in lyrics.values() for word in words)

# Write one "<track>\t<word>" line for each word found in exactly one track.
# `with` guarantees the results file is flushed and closed even on error.
with open("results", "w") as rf:
    for track in tracks:
        # Sorted so the output does not depend on set iteration order.
        for word in sorted(lyrics[track]):
            # if word does not appear in any other track
            if track_count[word] == 1:
                rf.write(f"{track}\t{word}\n")