{-| Module : TSL Description : Tier-based Strictly Local Grammars and Applications Copyright : (c) Greg Kobele, 2019 Maintainer : gkobele@uni-leipzig.de This module implements Tier-based Strictly local string grammars, and parsing, generation, and learning algorithms for them: [Parsing]: How could we use a grammar to understand (recognize) language data? [Generation]: How could we use a grammar to construct a sentence? [Learning]: How could we learn a grammar from language data? -} module TSL (Sentence , Text , TSL , acceptTSL , generateTSL , learnTSL) where import Data.List (tails,nub,partition,isPrefixOf) import Data.Function ((&)) {- We will make use of set operations on lists, in particular, 'subset' -} subset :: Eq a => [a] -> [a] -> Bool s `subset` t = all (`elem` t) s -- * Delimited Strings {- A string is delimited iff it begins with 'Start', ends with 'End', and the symbols in between are of the form 'Mid' a. -} data Delimited a = Start | Mid a | End deriving (Eq,Ord) instance Show a => Show (Delimited a) where show Start = ">" show End = "<" show (Mid b) = show b {- We compute a value given a 'Delimited' b as input. -} delTest :: a -- ^ The value to return if given 'Start' -> (b -> a) -- ^ How to compute the value if given a 'Mid' b -> a -- ^ The value to return if given 'End' -> Delimited b -> a delTest s m e Start = s delTest s m e (Mid a) = m a delTest s m e End = e -- The following type abbreviations make the types of our functions -- easier to read -- | a sentence is a list of symbols type Sentence a = [a] -- | a text is a list of sentences type Text a = [Sentence a] {- Sentences can be delimited by adding a 'Start' to their front, and 'End' to their end, and prefixing each symbol with a 'Mid' -} delim :: Sentence a -> Sentence (Delimited a) delim word = Start : foldr ((:) . Mid) [End] word {- A delimited sentence can be undelimited by erasing the 'Start' and 'End' symbols, and removing the prefix 'Mid' from the symbols in between -} undelim :: Sentence (Delimited a) -> Sentence a undelim = foldr (delTest id (:) id) [] -- * Grammars and Factors -- -- We introduce some more type abbreviations, this time for the -- objects we use for describing 'Sentences'. type Factor a = [Delimited a] -- ^ a factor is just a sequence of segments type Grammar a = [Factor a] -- ^ a grammar is just a list of factors {- The function 'fac' breaks a sentence up into its factors (of length given by its 'Int' argument). If its input sentence is too short, it just returns the input as a single (but possibly shorter than desired) factor. -} fac :: Int -> Sentence a -> Grammar a fac k sentence = if length sentence < k - 2 then [delim sentence] else getFac sentence where getFac = foldr (zipWith (:)) (repeat []) . take k . tails . delim -- * TSL Grammars type Tier a = [a] -- ^ a tier is defined by specifying which items appear on it type TSL a = (Tier a, Grammar a) -- ^ a TSL grammar consists of -- -- 1. a tier, and -- -- 2. an SL grammar {- Projecting a string onto a Tier can be implemented by erasing all non-tier relevant segments. -} project :: Eq a => Tier a -> Sentence a -> Sentence a project tier = filter (`elem` tier) -- ** Parsing -- | determining whether a sentence is generated by the grammar is a -- matter of checking that its factors are permitted by the grammar acceptTSL :: Eq a => TSL a -> Int -> Sentence a -> Bool acceptTSL (tier,grammar) k word = fac k (project tier word) `subset` grammar -- ** Generation -- -- We can generate all grammatical sentences from a grammar, by -- enumerating them one by one (i.e. by outputting a list of the -- grammatical sentences). We could do this in a number of ways, but -- an important desideratum is that every sentence of the grammar be -- at some finite point in the enumeration. This could be ensured by, -- for example, enumerating the shorter sentences before the longer -- ones. Using natural numbers as an easy example, if we enumerate the -- even numbers before any odd numbers, the first odd number will not -- appear at any finite position in the enumerated list! -- | The generation algorithm first makes a tree of all the -- grammatical ways of adding a next letter to what we had before, -- then enumerates the nodes of this tree in a breadth-first fashion. -- The unfinished sentences are filtered out of this list of nodes, -- and then the remaining sentences are made pretty. generateTSL :: Eq a => [a] -> Int -> TSL a -> [Sentence a] generateTSL alphabet k (tier,grammar) = mkTree (filter filterFunc . nexts) [Start] -- make a tree of -- all the grammatical -- ways of adding a next -- letter to what we had -- before -- & breadthFirst -- enumerate the nodes of -- this tree -- & filter isFinished -- keep only those -- sentences which are -- finished -- & fmap makePretty -- make them pretty -- (reverse them, and -- remove the delimiter -- signs) where makePretty = reverse . undelim isFinished (End:_) = True isFinished _ = False nexts (End:_) = [] nexts l = fmap (: l) $ tail (delim alphabet) filterFunc xs = any (isPrefixOf initialSegment) grammar where initialSegment = reverse $ take k $ filter (delTest True (flip elem tier) True) xs {- The generation algorithm is based on a Tree data structure, which we define here. A tree consists of a node, and its daughters, which are themselves trees. -} data Tree a = Tree a [Tree a] {- We can create a tree by specifying two things: 1. how to create the daughter root nodes, given the current node value, and 2. what the value of the root is -} mkTree :: (a -> [a]) -> a -> Tree a mkTree next seed = Tree seed $ fmap (mkTree next) $ next seed {- We can enumerate the nodes of a tree one level at a time (a so called 'breadth-first' enumeration). -} breadthFirst :: Tree a -> [a] breadthFirst tree = bf [tree] where bf [] = [] bf (Tree b bs : ts) = b : bf (ts ++ bs) -- ** Learning -- | The TSL learner works by first identifying the elements on a -- tier, and then by running the usual SL learner on the -- tier-projection of the input text. This implements the algorithm -- described by [Jardine and -- McMullin](https://adamjardine.net/files/jardinemcmullin2016tslk.pdf). learnTSL :: Eq a => Int -> Text a -> TSL a learnTSL k text = (tier, grammar) where grammar = nub $ concat $ fmap (fac k . project tier) text tier = filter (testTier kFacMinus kFac kFacPlus) $ nub $ concat text kFacMinus = text >>= nub . fac (pred k) kFac = text >>= nub . fac k kFacPlus = text >>= nub . fac (succ k) {- To test whether a segment should be projected we see whether it can be added to all k-1 factors, and removed from all k+1 factors, each time giving us some already seen k factor. If so, then the segment is *NOT* tier relevant. -} testTier :: Eq a => Grammar a -> Grammar a -> Grammar a -> a -> Bool testTier kFacMinus kFac kFacPlus c = not $ (added ++ removed) `subset` kFac where added = [added | factor <- kFacMinus, added <- addLetter c factor] removed = [removed | factor <- kFacPlus, removed <- removeLetter c factor] {- We can add a letter to a factor in a number of ways. -} addLetter :: a -> Factor a -> [Factor a] addLetter c [] = [] addLetter c (Start : ls) = fmap (Start :) (addLetter c ls) addLetter c (End : _) = [[Mid c,End]] addLetter c (Mid b : ls) = (Mid c : Mid b : ls) : fmap (Mid b :) (addLetter c ls) {- Similarly, a letter can be removed from a factor whenever it occurs -} removeLetter :: Eq a => a -> Factor a -> [Factor a] removeLetter c [] = [] removeLetter c (Mid b:bs) = let removedList = -- The list of ways of removing something from bs, -- and then putting b in front fmap (Mid b :) $ removeLetter c bs in if c == b then bs : removedList else removedList removeLetter c (Start:bs) = fmap (Start :) $ removeLetter c bs -- the beginning of a string cannot be removed removeLetter c (End:_) = [] -- once you come to the end of a string, there are no more letters to remove