aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xREADME.md4
-rwxr-xr-xsrc/main.rs240
2 files changed, 117 insertions, 127 deletions
diff --git a/README.md b/README.md
index 0e2e49a..eb01564 100755
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
# A simple regex engine library written in rust
## Resources
-- Wikipedia: https://en.wikipedia.org/wiki/Finite-state_machine , https://en.wikipedia.org/wiki/Regular_expression
+- FSM: https://en.wikipedia.org/wiki/Finite-state_machine
+- Regex: https://en.wikipedia.org/wiki/Regular_expression
+- Thompson's construction: https://en.wikipedia.org//wiki/Thompson's_construction
diff --git a/src/main.rs b/src/main.rs
index cfcfcbf..01a4105 100755
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,156 +1,144 @@
#![allow(dead_code, unused_variables, unused_mut)]
-#[derive(Debug)]
-struct State {
- transitions: Vec<(Trans, usize)>, // (input symbol, destination state index)
+type ReToken = Box<RegexToken>;
+
+#[derive(Debug, PartialEq, Clone)]
+enum RegexToken {
+ Token(ReToken),
+ Symbol(char),
+ Number(usize),
+ Concat((ReToken, ReToken)),
+ Union((ReToken, ReToken)),
+ Star(ReToken),
+ Dot,
+ None,
}
-#[derive(Debug)]
-struct NFA {
- graph: Vec<State>,
- initial_state: usize,
- accepting_states: Vec<usize>,
+macro_rules! Sym {
+ ($c:expr) => {
+ RegexToken::Symbol($c)
+ };
}
-impl NFA {
- fn new(graph: Vec<State>, initial_state: usize, accepting_states: Vec<usize>) -> Self {
- NFA {
- graph,
- initial_state,
- accepting_states,
- }
- }
+macro_rules! Star {
+ ($c:expr) => {
+ RegexToken::Star(Box::new($c))
+ };
+}
- fn simulate(&self, input: &str) -> bool {
- let mut current_states = vec![self.initial_state];
-
- for c in input.chars() {
- let mut next_states = vec![];
-
- for state in &current_states {
- for (trans, dest_state) in &self.graph[*state].transitions {
- // println!("char: {c}, trans: {trans:?}, dest_state: {dest_state:?}");
- match trans {
- Trans::Symbol(symbol) if *symbol == c => {
- next_states.push(*dest_state);
- }
- Trans::Epsilon => {
- next_states.push(*dest_state);
- }
- _ => {}
- }
- }
- }
+macro_rules! Concat {
+ ($a:expr, $b:expr) => {
+ RegexToken::Concat((Box::new($a), Box::new($b)))
+ };
+}
- current_states = next_states;
- }
+macro_rules! Union {
+ ($a:expr, $b:expr) => {
+ RegexToken::Union((Box::new($a), Box::new($b)))
+ };
+}
- // println!("states: {:?}", current_states);
+#[cfg(test)]
+mod tests {
+ use super::*;
- current_states
- .iter()
- .any(|&state| self.accepting_states.contains(&state))
+ #[test]
+ fn test_concat() {
+ assert_eq!(
+ Regex::new(String::from("ab")),
+ Concat!(Sym!('a'), Sym!('b'))
+ )
}
-}
-#[derive(PartialEq, Clone, Debug)]
-enum Trans {
- Symbol(char),
- Epsilon,
-}
+ #[test]
+ fn test_union() {
+ assert_eq!(
+ Regex::new(String::from("(a|b)")),
+ Concat!(Union!(Sym!('a'), Sym!('b')), RegexToken::None)
+ )
+ }
-struct Regex {
- regex: &'static str,
+ #[test]
+ fn test_none() {
+ assert_eq!(Regex::new(String::from("")), RegexToken::None)
+ }
+
+ #[test]
+ fn test_star() {
+ assert_eq!(
+ Regex::new(String::from("a*b")),
+ Concat!(Star!(Sym!('a')), Sym!('b'))
+ )
+ }
}
+#[derive(Debug, PartialEq)]
+struct Regex {}
+
impl Regex {
- fn new(input: &'static str) -> Self {
- Regex { regex: input }
+ fn new(input: String) -> RegexToken {
+ Regex::parse(input)
}
- fn compile_nfa(&self) -> NFA {
- let mut graph: Vec<State> = vec![State {
- transitions: vec![],
- }];
- let initial_state = 0;
- let mut accepting_states = vec![];
-
- let mut current_state = initial_state;
- let mut prev_state: Option<usize> = None;
-
- let input: Vec<char> = self.regex.chars().collect();
-
- for (i, &c) in input.iter().enumerate() {
- match c {
- '*' => {}
- '+' => todo!(),
- '.' => {
- let next_state = graph.len();
- println!("{next_state}");
- graph.push(State {
- transitions: vec![],
- });
-
- graph[next_state]
- .transitions
- .push((Trans::Epsilon, next_state));
-
- current_state = next_state;
- }
+ fn parse(input: String) -> RegexToken {
+ if input.is_empty() {
+ return RegexToken::None;
+ }
+
+ let mut chars = input.chars().peekable();
+ let mut parsed_token = Self::parse_token(&mut RegexToken::None, &mut chars);
+
+ Self::parse_expression(&mut parsed_token, &mut chars)
+ }
+
+ fn parse_expression(
+ left: &mut RegexToken,
+ chars: &mut std::iter::Peekable<std::str::Chars>,
+ ) -> RegexToken {
+ while let Some(&next) = chars.peek() {
+ match next {
'|' => {
- let before = input[i - 1];
- let after = input[i + 1];
- let next_state = graph.len();
- graph.push(State {
- transitions: vec![],
- });
- graph[current_state]
- .transitions
- .push((Trans::Symbol(c), next_state));
-
- current_state = next_state;
+ chars.next(); // Consume '|'
+ let right = Self::parse_token(left, chars);
+ *left = RegexToken::Union((Box::new(left.clone()), Box::new(right)));
+ }
+ '*' => {
+ chars.next(); // Consume '|'
+ let right = Self::parse_token(left, chars);
+ *left = RegexToken::Star(Box::new(left.clone()));
}
_ => {
- // let after = input[i + 1];
- // if after != '|' {
- let next_state = graph.len();
- graph.push(State {
- transitions: vec![],
- });
- graph.push(State {
- transitions: vec![],
- });
-
- graph[current_state]
- .transitions
- .push((Trans::Symbol(c), next_state + 1));
- graph[next_state]
- .transitions
- .push((Trans::Epsilon, next_state + 1));
-
- current_state = next_state + 1;
- // }
+ let right = Self::parse_token(left, chars);
+ *left = RegexToken::Concat((Box::new(left.clone()), Box::new(right)));
}
}
}
+ left.clone()
+ }
- accepting_states.push(current_state);
-
- NFA::new(graph, initial_state, accepting_states)
+ fn parse_token(
+ left: &mut RegexToken,
+ chars: &mut std::iter::Peekable<std::str::Chars>,
+ ) -> RegexToken {
+ match chars.next() {
+ Some('(') => {
+ let token = Self::parse(chars.collect());
+ chars.next(); // Skip ')'
+ token
+ }
+ Some('.') => RegexToken::Dot,
+ Some(c) if c.is_ascii_alphanumeric() => Sym!(c),
+ Some('*') => {
+ let token = Self::parse_token(left, chars);
+ Star!(left.clone())
+ }
+ _ => RegexToken::None, // Handle other cases accordingly
+ }
}
}
fn main() {
- let regex = Regex::new("a.bc");
- let nfa = regex.compile_nfa();
-
- println!("NFA: {:#?}", nfa);
-
- let input = vec!["a", "abcc", "abc", "abbc", "abbbc", "abbbbc", "ac", "ab"];
-
- println!("Regex: {:#?}", regex.regex);
-
- for inp in input {
- println!("{} -> {:?}", inp, nfa.simulate(inp)); // Should match
- }
+ let input = "a*b";
+ let token = Regex::new(String::from(input));
+ println!("{input}\n{:#?}", token)
}