-- Haskell implementation of simple regular expressions, inspired by -- Torben Mogensens textbook "Introduction to Compiler Design". -- -- Character ranges are supported through translation to alternations. -- Negative ranges are not supported. A backslash causes the -- following character to be read literally, even if it is normally -- syntactically significant. The full path from regular expressions -- over NFAs to DFAs has been implemented, and several utility -- functions for printing the state machines as graphs (in GraphViz -- format) are also provided. -- -- If compiled as an executable, the program acts as a very simple -- `grep` clone. module Main where import Control.Applicative import Control.Monad.State import Control.Monad.Writer import Data.Char import Data.Maybe import Data.List import qualified Data.Map as M import qualified Data.Set as S import Text.Parsec hiding ((<|>), many) import Text.Parsec.String import System.Environment -- | Simple regular expressions. Note that there is no concept of -- matching a string, only matching a single character, which can then -- be combined by concatenation. data Regexp = ReChar Char -- ^ A literal character. | ReEpsilon -- ^ Nothing. | ReConcat Regexp Regexp -- ^ Two expressions following each other. | ReChoice Regexp Regexp -- ^ Either of the given expressions. | ReKleene Regexp -- ^ Zero or more instances of the expression. deriving (Show) -- | This parser is somewhat nasty, but not very interesting. Much of -- the complexity stems from handling escaping. regexp :: Parser Regexp regexp = chainl simple (char '|' >> pure ReChoice) ReEpsilon <* eof where simple = chainl term (pure ReConcat) ReEpsilon term = kleene (between (char '(') (char ')') regexp) <|> kleene (between (char '[') (char ']') clss) <|> text clss = chainl1 (comp <|> chr) (pure ReChoice) comp = do (c1, c2) <- try $ pure (,) <*> clchr <*> (char '-' *> clchr) case map ReChar [c1..c2] of [] -> fail $ "Empty range " ++ [c1] ++ "-" ++ [c2] (c:cs) -> return $ foldl ReChoice c cs text = chainl1 (kleene chr) (pure ReConcat) clchr = noneOf "[]" <|> char '\\' *> anyChar chr = pure ReChar <*> (char '\\' *> anyChar <|> noneOf "()|*[]+") kleene p = do s <- p (char '*' >> return (ReKleene s)) <|> (char '+' >> return (ReConcat s $ ReKleene s)) <|> return s parseRegexp :: SourceName -> String -> Either ParseError Regexp parseRegexp = parse regexp -- | An NFA state is uniquely identified by an integer. type NFAState = Int -- | A transition in an NFA is either by a concrete character or an epsilon. data Symbol = Symbol Char | Epsilon deriving (Eq, Ord, Show) -- | The transitions in an NFA are a mapping from symbols to sets of -- states (as a single symbol may lead to one of several possible -- states, hence the nondeterminism). type NFATransitions = M.Map Symbol (S.Set NFAState) data NFA = NFA { nfaStates :: S.Set NFAState -- ^ The set of states in the NFA. , nfaStartState :: NFAState -- ^ The unique starting state. , nfaTransitions :: M.Map NFAState NFATransitions -- ^ A mapping from a state to the transitions going out of it. , nfaAccepting :: S.Set NFAState -- ^ The set of accepting states (can be empty, -- although such an NFA would not be very useful, and -- annot be constructed via regular expressions.) } deriving (Show) -- | We represent DFA states by sets of NFA states. This is really -- unnecessary, they could be unique integers instead, but it makes -- the subset construction algorithm slightly more convenient. type DFAState = S.Set NFAState -- | DFA transitions are maps from concrete characters to a single -- state. type DFATransitions = M.Map Char DFAState data DFA = DFA { dfaStates :: S.Set DFAState , dfaStartState :: DFAState , dfaTransitions :: M.Map DFAState DFATransitions , dfaAccepting :: S.Set DFAState } deriving (Show) runDFA :: DFA -> String -> Maybe (String, String) runDFA dfa str' = runDFA' str' (dfaStartState dfa) where runDFA' str s | s `S.member` dfaAccepting dfa = consume str s <|> Just ("", str) | otherwise = consume str s consume [] _ = Nothing consume (c:cs) s = case M.lookup s $ dfaTransitions dfa of Nothing -> error "Invalid DFA" Just ts -> prepend c <$> (runDFA' cs =<< M.lookup c ts) prepend c (a,b) = (c:a, b) -- | Compute the complete epsilon closure for the given set of NFA -- states in the given NFA. This is done by repeatedly calling -- 'eTransitions' until we reach a fixed point. eClosure :: NFA -> S.Set NFAState -> DFAState eClosure nfa t = if t' `S.isSubsetOf` t then t else eClosure nfa $ t' `S.union` t where t' = eTransitions nfa t -- | @eTransitions' nfa ss@ is the states reachable by the states in @ss@ -- via epsilon-transitions, which does not necessarily include @ss@ -- itself. eTransitions :: NFA -> S.Set NFAState -> S.Set NFAState eTransitions nfa ss = S.unions $ map f (S.toList ss) where f s = S.unions (map snd eTransitions) where eTransitions = filter ((==Epsilon) . fst) transitions transitions = maybe [] M.toList $ M.lookup s $ nfaTransitions nfa -- | Convert an NFA to a DFA using subset construction. nfaToDFA :: NFA -> DFA nfaToDFA nfa = dfa' { dfaAccepting = S.filter accepting $ dfaStates dfa' } -- We start by constructing a DFA consisting of nothing but the -- starting state, then using nfaToDFA' to perform the actual work. where dfa = DFA { dfaStates = S.singleton start , dfaStartState = start , dfaTransitions = M.empty , dfaAccepting = S.empty } dfa' = nfaToDFA' nfa dfa [] [start] start = eClosure nfa $ S.singleton $ nfaStartState nfa accepting = not . S.null . S.intersection (nfaAccepting nfa) nfaToDFA' :: NFA -> DFA -> [DFAState] -> [DFAState] -> DFA nfaToDFA' _ dfa _ [] = dfa nfaToDFA' nfa dfa seen (s:ss) = nfaToDFA' nfa dfa' (s:seen) ss' where dfa' = dfa { dfaStates = foldr S.insert (dfaStates dfa) (M.elems ts) , dfaTransitions = M.insert s ts (dfaTransitions dfa) } ss' = union ss (M.elems ts) \\ seen ts = M.map (eClosure nfa) $ foldl (M.unionWith S.union) M.empty $ map transitions $ S.toList s transitions = maybe M.empty (M.foldlWithKey noepsilons M.empty) . flip M.lookup (nfaTransitions nfa) noepsilons m Epsilon _ = m noepsilons m (Symbol c) v = M.insert c v m -- | Convert a regular expression to an NFA. Runs in a state monad -- solely so we can generate unique numbers for representing the NFA -- states. regexpToNfa :: Regexp -> NFA regexpToNfa regex = flip evalState 0 $ do start <- newstate end <- newstate (states, trs) <- regexpToNFA' regex start end return NFA { nfaStates = S.fromList [start,end] `S.union` states , nfaStartState = start , nfaTransitions = trs , nfaAccepting = S.singleton end } where newstate = do v <- get put (v+1) return v connect from c to = M.singleton from (M.singleton c (S.singleton to)) combine = M.unionWith (M.unionWith S.union) regexpToNFA' (ReChar c) from to = return (S.empty, connect from (Symbol c) to) regexpToNFA' (ReEpsilon) from to = return (S.empty, connect from Epsilon to) regexpToNFA' (ReConcat t1 t2) from to = do s <- newstate (t1', trs1) <- regexpToNFA' t1 from s (t2', trs2) <- regexpToNFA' t2 s to return ( s `S.insert` t1' `S.union` t2' , trs1 `combine` trs2) regexpToNFA' (ReChoice t1 t2) from to = do (t1', trs1) <- regexpToNFA' t1 from to (t2', trs2) <- regexpToNFA' t2 from to return ( t1' `S.union` t2', trs1 `combine` trs2 ) regexpToNFA' (ReKleene t) from to = do s <- newstate (t', trs) <- regexpToNFA' t s s return ( s `S.insert` t' , trs `combine` connect from Epsilon s `combine` connect s Epsilon to ) initialMatch :: Regexp -> String -> Maybe (String, String) initialMatch regex str = runDFA dfa str where dfa = nfaToDFA $ regexpToNfa regex -- The real work is done. The functions below deal with printing NFAs -- and DFAs in Graphviz format. match :: Regexp -> String -> Bool match r str = isJust (initialMatch r str) || case str of [] -> False (_:cs) -> match r cs class Ord a => Next a where next :: a -> a instance Next Symbol where next Epsilon = Epsilon next (Symbol c) = Symbol $ chr $ ord c + 1 instance Next Char where next c = chr (ord c + 1) ranges :: (Next a, Ord b) => [(a, b)] -> [([(a,a)], b)] ranges = map ranges' . M.toList . foldl arrange M.empty where arrange m (k, v) = M.insertWith (++) v [k] m ranges' (v, ks) = (foldl ranges'' [] $ sort ks, v) ranges'' [] x = [(x,x)] ranges'' ((y1,y2):ys) x | next y2 == x = (y1, x) : ys | otherwise = (x,x) : (y1,y2) : ys -- | Escape the double-quotes in a string so it can be put in a -- Graphviz string. escape :: String -> String escape [] = [] escape ('"':s) = "\\\"" ++ escape s escape (c:s) = c : escape s translabel :: Next a => (a -> String) -> [(a,a)] -> String translabel f [(x,y)] | x == y = f x | otherwise = "[" ++ f x ++ "-" ++ f y ++ "]" translabel f xs = "[" ++ concatMap charclass xs ++ "]" where charclass (x,y) | x == y = f x | otherwise = f x ++ "-" ++ f y statelabel :: String -> Int -> Bool -> String statelabel k i a = k ++ "[label=\"" ++ show i ++ "\"" ++ (if a then ", shape=doublecircle" else "shape=circle") ++ "]\n" printNFA :: NFA -> String printNFA nfa = evalState (execWriterT printNFA') (M.empty,0::Int) where node k = do (s, v) <- get case M.lookup k s of Just v' -> return v' Nothing -> do let v' = v+1 v'' = 'S' : show v' tell $ statelabel v'' v' $ k `S.member` nfaAccepting nfa put (M.insert k v'' s, v') return v'' symbol Epsilon = "epsilon" symbol (Symbol c) = [c] trans (from, ts) = forM_ (ranges $ M.toList ts) $ \(clss, tos) -> forM_ (S.toList tos) $ \to -> do from' <- node from to' <- node to tell $ from' ++ "->" ++ to' ++ " [label=\"" ++ escape (translabel symbol clss) ++ "\"]\n" printNFA' = do tell "digraph nfa {\nrankdir=TD\nEmp [style=invisible]\n" mapM_ trans $ M.toList $ nfaTransitions nfa tell "}\n" printDFA :: DFA -> String printDFA dfa = evalState (execWriterT printDFA') (M.empty,0::Int) where node k = do (s, v) <- get case M.lookup k s of Just v' -> return v' Nothing -> do let v' = v+1 v'' = 'S' : show v' tell $ statelabel v'' v $ k `S.member` dfaAccepting dfa put (M.insert k v'' s, v') return v'' trans (from, ts) = forM_ (ranges $ M.toList ts) $ \(clss, to) -> do from' <- node from to' <- node to tell $ from' ++ "->" ++ to' ++ " [label=\"" ++ escape (translabel (:[]) clss) ++ "\"]\n" printDFA' = do tell "digraph dfa {\nrankdir=LR\nEmp [style=invisible]\n" mapM_ trans $ M.toList $ dfaTransitions dfa tell "}\n" compileRegexp :: SourceName -> String -> Either ParseError NFA compileRegexp s r = regexpToNfa <$> parseRegexp s r printRegexpDFA :: String -> String printRegexpDFA regex = printDFA $ nfaToDFA nfa where nfa = case compileRegexp "command-line" regex of Left e -> error $ show e Right v -> v printRegexpNFA :: String -> String printRegexpNFA regex = printNFA nfa where nfa = case compileRegexp "command-line" regex of Left e -> error $ show e Right v -> v -- | Finally, a simple program entry point. main :: IO () main = do args <- getArgs prog <- getProgName case args of ["grep", r] -> grep r ["nfa", r] -> putStr $ printRegexpNFA r ["dfa", r] -> putStr $ printRegexpDFA r _ -> error $ "Usage: " ++ prog ++ " regexp" where grep r = case parseRegexp "command-line" r of Left e -> error $ show e Right r' -> interact (unlines . filter (match r') . lines)