presets/scripts/parse_lexicon.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

#!/usr/bin/env python3
import re
import json

# An entry opens with its name wrapped in colons, e.g. ":glider:".
_NAME_RE = re.compile(r":([\w\-\(\),/ ]+):")
# Credit phrase of the form "Found by <name> in ...", matched case-insensitively.
_DISCOVERER_RE = re.compile(r"Found by ([\w\s]+) in", re.IGNORECASE)


def _extract_tab_block(section):
    """Return the first run of tab-indented lines in *section*, or None.

    Each collected line is stripped of surrounding whitespace and the lines
    are joined with "\\n". The first line of the section is never considered,
    because it carries the entry name.
    """
    rows = []
    # Split on "\n" only (not splitlines) so no other character is treated
    # as a line break.
    for line in section.split("\n")[1:]:
        if line.startswith("\t"):
            rows.append(line.strip())
        elif rows:
            # The first tab-indented run has ended; ignore anything after it.
            break
    return "\n".join(rows) if rows else None


def parse_patterns(file_content: str) -> list:
    """Parse lexicon text into a list of entry dictionaries.

    The text is split into sections on blank lines ("\\n\\n"). A section is
    kept only if it starts with a ``:name:`` header; every other section is
    skipped silently.

    Args:
        file_content: Full text of the lexicon file.

    Returns:
        A list with one dict per recognised section, in input order. Each
        dict has the keys:

        * ``"name"``: header text between the colons, stripped.
        * ``"pattern"``: the first run of tab-indented lines, stripped and
          joined with newlines, or ``None`` if the section has none.
          (Presumably these are the rows of the cell diagram.)
        * ``"discoverer"``: text captured from ``Found by <name> in``, or
          ``None`` if that phrase does not occur.
    """
    patterns = []
    for section in file_content.split("\n\n"):
        name_match = _NAME_RE.match(section)
        if not name_match:
            continue

        discoverer_match = _DISCOVERER_RE.search(section)
        patterns.append({
            "name": name_match.group(1).strip(),
            # Collect every row of the block, including a final row that has
            # no trailing newline because of the "\n\n" split.
            "pattern": _extract_tab_block(section),
            "discoverer": (
                discoverer_match.group(1).strip() if discoverer_match else None
            ),
        })

    return patterns

def main():
    """Convert ``../lexicon_stripped.txt`` into ``patterns.json``.

    Reads the lexicon text, parses it with ``parse_patterns`` and writes the
    resulting list as indented JSON. Both paths are relative to the current
    working directory.
    """
    # Load the input file; a fixed encoding keeps the result independent of
    # the platform's locale default.
    with open("../lexicon_stripped.txt", "r", encoding="utf-8") as file:
        content = file.read()

    # Parse the patterns
    parsed_patterns = parse_patterns(content)

    # Preview the first entry as a sanity check. The list can be empty when
    # no section carries a ":name:" header, so guard the index.
    if parsed_patterns:
        print(parsed_patterns[0])

    # Output the result as JSON
    output_file = "patterns.json"
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(parsed_patterns, json_file, indent=4)

    print(f"Patterns saved to {output_file}")


if __name__ == "__main__":
    main()