about | summary | refs | log | tree | commit | diff
path: root/presets/scripts/parse_lexicon.py
diff options
context:
space:
mode:
Diffstat (limited to 'presets/scripts/parse_lexicon.py')
-rwxr-xr-x presets/scripts/parse_lexicon.py 54
1 files changed, 54 insertions, 0 deletions
diff --git a/presets/scripts/parse_lexicon.py b/presets/scripts/parse_lexicon.py
new file mode 100755
index 0000000..253b2dd
--- /dev/null
+++ b/presets/scripts/parse_lexicon.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import re
+import json
+
def parse_patterns(file_content):
    """Parse lexicon text into a list of pattern records.

    The text is split into sections on blank lines ("\\n\\n"). A section is
    kept only when it starts with a ``:name:`` header; every other section
    is skipped.

    Args:
        file_content: The full lexicon text as a single string.

    Returns:
        A list of dicts, one per kept section, with the keys:

        * ``"name"`` -- the text between the leading colons, stripped.
        * ``"pattern"`` -- the first run of consecutive tab-indented lines
          after the header line, each row stripped and joined with "\\n";
          ``None`` when the section has no tab-indented line.
        * ``"discoverer"`` -- the name found between "Found by" and the
          first following word "in", with internal whitespace collapsed to
          single spaces; ``None`` when no such phrase is present.
    """
    name_re = re.compile(r":([\w\-\(\),/ ]+):")
    # Non-greedy so the name stops at the FIRST "in" ("Found by X in 1970 in
    # a search" must give "X", not "X in 1970"); the trailing \b keeps " in"
    # from matching the start of a longer word; \s+ tolerates line wraps.
    discoverer_re = re.compile(
        r"Found\s+by\s+([\w\s]+?)\s+in\b", re.IGNORECASE
    )

    patterns = []
    for section in file_content.split("\n\n"):
        name_match = name_re.match(section)
        if not name_match:
            continue
        name = name_match.group(1).strip()

        # Collect every row of the first tab-indented run, not just the
        # first row. The header line itself is never treated as a row.
        rows = []
        for line in section.split("\n")[1:]:
            if line.startswith("\t"):
                rows.append(line.strip())
            elif rows:
                break
        pattern = "\n".join(rows).strip() if rows else None

        discoverer_match = discoverer_re.search(section)
        if discoverer_match:
            # Collapse wrapped-line whitespace inside the name.
            discoverer = " ".join(discoverer_match.group(1).split())
        else:
            discoverer = None

        patterns.append({
            "name": name,
            "pattern": pattern,
            "discoverer": discoverer,
        })

    return patterns
+
# Load the input file. The path is resolved against the current working
# directory, so the script is expected to be run from its own folder.
# The encoding is explicit so decoding does not depend on the locale.
with open("../lexicon_stripped.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Parse the patterns
parsed_patterns = parse_patterns(content)

# Show the first record as a quick sanity check; skipped when nothing was
# parsed, because indexing an empty list would raise IndexError.
if parsed_patterns:
    print(parsed_patterns[0])

# Output the result as JSON
output_file = "patterns.json"
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(parsed_patterns, json_file, indent=4)

print(f"Patterns saved to {output_file}")
+