{-|
Module      : TSL
Description : Tier-based Strictly Local Grammars and Applications
Copyright   : (c) Greg Kobele, 2019
Maintainer  : gkobele@uni-leipzig.de

This module implements Tier-based Strictly local string grammars, and
parsing, generation, and learning algorithms for them:

[Parsing]: How could we use a grammar to understand (recognize) language data?

[Generation]: How could we use a grammar to construct a sentence?

[Learning]: How could we learn a grammar from language data?
-}

module TSL (Sentence
           , Text
           , TSL
           , acceptTSL
           , generateTSL
           , learnTSL)
where


import Data.List (tails,nub,partition,isPrefixOf)
import Data.Function ((&))

{- We will make use of set operations on lists, in particular, 'subset' -}
subset :: Eq a => [a] -> [a] -> Bool
s `subset` t = all (`elem` t) s


-- * Delimited Strings

{- A string is delimited iff it begins with 'Start', ends with 'End',
 and the symbols in between are of the form 'Mid' a. -}
data Delimited a = Start | Mid a | End deriving (Eq,Ord)

instance Show a => Show (Delimited a) where
  show Start = ">"
  show End = "<"
  show (Mid b) = show b

{- We compute a value given a 'Delimited' b as input. -}
delTest :: a        -- ^ The value to return if given 'Start'
        -> (b -> a) -- ^ How to compute the value if given a 'Mid' b
        -> a        -- ^ The value to return if given 'End'
        -> Delimited b -> a
delTest s m e Start = s
delTest s m e (Mid a) = m a
delTest s m e End = e

-- The following type abbreviations make the types of our functions
-- easier to read

-- | a sentence is a list of symbols
type Sentence a = [a]
-- | a text is a list of sentences
type Text a = [Sentence a]

{- Sentences can be delimited by adding a 'Start' to their front, and
 'End' to their end, and prefixing each symbol with a 'Mid' -}
delim :: Sentence a -> Sentence (Delimited a)
delim word = Start : foldr ((:) . Mid) [End] word

{- A delimited sentence can be undelimited by erasing the 'Start' and
 'End' symbols, and removing the prefix 'Mid' from the symbols in
 between -}
undelim :: Sentence (Delimited a) -> Sentence a
undelim = foldr (delTest id (:) id) []

-- * Grammars and Factors
--
-- We introduce some more type abbreviations, this time for the
-- objects we use for describing 'Sentences'.

type Factor a = [Delimited a] -- ^ a factor is just a sequence of segments
type Grammar a = [Factor a] -- ^ a grammar is just a list of factors

{- The function 'fac' breaks a sentence up into its factors (of length
 given by its 'Int' argument).  If its input sentence is too short, it
 just returns the input as a single (but possibly shorter than
 desired) factor. -}
fac :: Int -> Sentence a -> Grammar a
fac k sentence = if length sentence < k - 2
                 then [delim sentence]
                 else getFac sentence
  where
    getFac = foldr (zipWith (:)) (repeat []) . take k . tails . delim


-- * TSL Grammars

type Tier a = [a] -- ^ a tier is defined by specifying which items appear on it

type TSL a = (Tier a, Grammar a)
-- ^ a TSL grammar consists of
--
--    1. a tier, and
--
--    2. an SL grammar

{-
  Projecting a string onto a Tier can be implemented by erasing all
  non-tier relevant segments.
-}
project :: Eq a => Tier a -> Sentence a -> Sentence a
project tier = filter (`elem` tier)

-- ** Parsing

-- | determining whether a sentence is generated by the grammar is a
--  matter of checking that its factors are permitted by the grammar
acceptTSL :: Eq a => TSL a -> Int -> Sentence a -> Bool
acceptTSL (tier,grammar) k word =
  fac k (project tier word) `subset` grammar


-- ** Generation
--
-- We can generate all grammatical sentences from a grammar, by
-- enumerating them one by one (i.e. by outputting a list of the
-- grammatical sentences).  We could do this in a number of ways, but
-- an important desideratum is that every sentence of the grammar be
-- at some finite point in the enumeration.  This could be ensured by,
-- for example, enumerating the shorter sentences before the longer
-- ones. Using natural numbers as an easy example, if we enumerate the
-- even numbers before any odd numbers, the first odd number will not
-- appear at any finite position in the enumerated list!

-- | The generation algorithm first makes a tree of all the
-- grammatical ways of adding a next letter to what we had before,
-- then enumerates the nodes of this tree in a breadth-first fashion.
-- The unfinished sentences are filtered out of this list of nodes,
-- and then the remaining sentences are made pretty.
generateTSL :: Eq a => [a] -> Int -> TSL a -> [Sentence a]
generateTSL alphabet k (tier,grammar) =
  mkTree (filter filterFunc . nexts) [Start] -- make a tree of
                                             -- all the grammatical
                                             -- ways of adding a next
                                             -- letter to what we had
                                             -- before
                                             --
  & breadthFirst                             -- enumerate the nodes of
                                             -- this tree
                                             --
  & filter isFinished                        -- keep only those
                                             -- sentences which are
                                             -- finished
                                             --  
  & fmap makePretty                          -- make them pretty
                                             -- (reverse them, and
                                             -- remove the delimiter
                                             -- signs)
  where
    makePretty = reverse . undelim
    isFinished (End:_) = True
    isFinished _ = False
    nexts (End:_) = []
    nexts l = fmap (: l) $ tail (delim alphabet)
    filterFunc xs = any (isPrefixOf initialSegment) grammar
      where
        initialSegment = reverse $ take k $ filter (delTest True (flip elem tier) True) xs


{- The generation algorithm is based on a Tree data structure, which we
  define here.  A tree consists of a node, and its daughters, which
  are themselves trees. -}
data Tree a = Tree a [Tree a]

{- We can create a tree by specifying two things:
  1. how to create the daughter root nodes, given the current node
   value, and
  2. what the value of the root is -}
mkTree :: (a -> [a]) -> a -> Tree a
mkTree next seed = Tree seed $ fmap (mkTree next) $ next seed

{- We can enumerate the nodes of a tree one level at a time (a so called
 'breadth-first' enumeration). -}
breadthFirst :: Tree a -> [a]
breadthFirst tree = bf [tree]
  where
    bf [] = []
    bf (Tree b bs : ts) = b : bf (ts ++ bs)

-- ** Learning

-- | The TSL learner works by first identifying the elements on a
-- tier, and then by running the usual SL learner on the
-- tier-projection of the input text.  This implements the algorithm
-- described by [Jardine and
-- McMullin](https://adamjardine.net/files/jardinemcmullin2016tslk.pdf).
learnTSL :: Eq a => Int -> Text a -> TSL a
learnTSL k text = (tier, grammar)
  where
    grammar = nub $ concat $ fmap (fac k . project tier) text
    tier = filter (testTier kFacMinus kFac kFacPlus) $ nub $ concat text
    kFacMinus = text >>= nub . fac (pred k)
    kFac = text >>= nub . fac k
    kFacPlus = text >>= nub . fac (succ k)

{-
  To test whether a segment should be projected we see whether it can
  be added to all k-1 factors, and removed from all k+1 factors, each
  time giving us some already seen k factor.  If so, then the segment
  is *NOT* tier relevant.
-}
testTier :: Eq a => Grammar a -> Grammar a -> Grammar a -> a -> Bool
testTier kFacMinus kFac kFacPlus c = 
  not $ (added ++ removed) `subset` kFac
  where
    added = [added | factor <- kFacMinus, added <- addLetter c factor]
    removed = [removed | factor <- kFacPlus, removed <- removeLetter c factor]

{-
We can add a letter to a factor in a number of ways.
-}
addLetter :: a -> Factor a -> [Factor a]
addLetter c [] = []
addLetter c (Start : ls) = fmap (Start :) (addLetter c ls)
addLetter c (End : _) = [[Mid c,End]]
addLetter c (Mid b : ls) = (Mid c : Mid b : ls) : fmap (Mid b :) (addLetter c ls)

{-
Similarly, a letter can be removed from a factor whenever it occurs
-}
removeLetter :: Eq a => a -> Factor a -> [Factor a]
removeLetter c [] = []
removeLetter c (Mid b:bs) =
  let removedList = -- The list of ways of removing something from bs,
                    -- and then putting b in front
        fmap (Mid b :) $ removeLetter c bs
  in
    if c == b then bs : removedList else removedList
removeLetter c (Start:bs) =
  fmap (Start :) $ removeLetter c bs -- the beginning of a string cannot be removed
removeLetter c (End:_) =
  [] -- once you come to the end of a string, there are no more letters to remove