|
| 1 | +module Text.Parsing.StringParser.Examples where |
| 2 | + |
| 3 | +import Prelude hiding (between) |
| 4 | + |
| 5 | +import Control.Alt ((<|>)) |
| 6 | +import Data.Either (Either(..)) |
| 7 | +import Data.Foldable (fold, foldl, sum) |
| 8 | +import Data.List.Types (NonEmptyList) |
| 9 | +import Effect (Effect) |
| 10 | +import Effect.Console (log, logShow) |
| 11 | +import Text.Parsing.StringParser (Parser, fail, runParser, unParser) |
| 12 | +import Text.Parsing.StringParser.CodePoints (anyChar, char, eof, regex, skipSpaces, string) |
| 13 | +import Text.Parsing.StringParser.Combinators (between, endBy1, lookAhead, many, many1, sepBy1, (<?>)) |
| 14 | + |
| 15 | +-- Serves only to make this file runnable |
| 16 | +main :: Effect Unit |
| 17 | +main = printResults |
| 18 | + |
| 19 | +printResults :: Effect Unit |
| 20 | +printResults = do |
| 21 | + log "" -- empty blank line to separate output from function call |
| 22 | + |
| 23 | + log "### Example Content 1 ###" |
| 24 | + doBoth "fail" ((fail "example failure message") :: Parser Unit) exampleContent1 |
| 25 | + doBoth "numberOfAs" numberOfAs exampleContent1 |
| 26 | + doBoth "removePunctuation" removePunctuation exampleContent1 |
| 27 | + doBoth "replaceVowelsWithUnderscore" replaceVowelsWithUnderscore exampleContent1 |
| 28 | + doBoth "tokenizeContentBySpaceChars" tokenizeContentBySpaceChars exampleContent1 |
| 29 | + doBoth "extractWords" extractWords exampleContent1 |
| 30 | + doBoth "badExtractWords" badExtractWords exampleContent1 |
| 31 | + doBoth "quotedLetterExists" quotedLetterExists exampleContent1 |
| 32 | + |
| 33 | + log "\n\ |
| 34 | + \### Example Content 2 ###" |
| 35 | + doBoth "parseCSV" parseCSV exampleContent2 |
| 36 | + |
| 37 | +-- Example Content 1 |
| 38 | + |
| 39 | +exampleContent1 :: String |
| 40 | +exampleContent1 = |
| 41 | + "How many 'a's are in this sentence, you ask? Not that many." |
| 42 | + |
| 43 | +numberOfAs :: Parser Int |
| 44 | +numberOfAs = do |
| 45 | + let |
| 46 | + oneIfA = 1 <$ string "a" <?> "Letter was 'a'" |
| 47 | + zeroIfNotA = 0 <$ regex "[^a]" <?> "Letter was not 'a'" |
| 48 | + letterIsOneOrZero = oneIfA <|> zeroIfNotA <?> |
| 49 | + "The impossible happened: \ |
| 50 | + \a letter was not 'a', and was not not-'a'." |
| 51 | + convertLettersToList = many1 letterIsOneOrZero |
| 52 | + {- |
| 53 | + list <- convertLettersToList -} |
| 54 | + list <- many1 |
| 55 | + ( (1 <$ string "a") |
| 56 | + <|> (0 <$ regex "[^a]") |
| 57 | + ) |
| 58 | + -- calculate total number by adding Ints in list together |
| 59 | + pure $ sum list |
| 60 | + |
| 61 | +removePunctuation :: Parser String |
| 62 | +removePunctuation = do {- |
| 63 | + let |
| 64 | + charsAndSpaces = regex "[a-zA-Z ]+" |
| 65 | + everythingElse = regex "[^a-zA-Z ]+" |
| 66 | + ignoreEverythingElse = "" <$ everythingElse |
| 67 | + zeroOrMoreFragments = many1 $ charsAndSpaces <|> ignoreEverythingElse -} |
| 68 | + list <- many1 |
| 69 | + ( regex "[a-zA-Z ]+" |
| 70 | + <|> ("" <$ regex "[^a-zA-Z ]+" ) |
| 71 | + ) |
| 72 | + |
| 73 | + -- combine the list's contents together via '<>' |
| 74 | + pure $ foldl (<>) "" list |
| 75 | + |
| 76 | +replaceVowelsWithUnderscore :: Parser String |
| 77 | +replaceVowelsWithUnderscore = do |
| 78 | + list <- many1 $ ( ( "_" <$ regex "[aeiou]") |
| 79 | + <|> regex "[^aeiou]+" |
| 80 | + ) |
| 81 | + |
| 82 | + pure $ foldl (<>) "" list |
| 83 | + |
| 84 | +tokenizeContentBySpaceChars :: Parser (NonEmptyList String) |
| 85 | +tokenizeContentBySpaceChars = do |
| 86 | + (regex "[^ ]+") `sepBy1` (string " ") |
| 87 | + |
| 88 | +extractWords :: Parser (NonEmptyList String) |
| 89 | +extractWords = do |
| 90 | + endBy1 (regex "[a-zA-Z]+") |
| 91 | + -- try commenting out one of the "<|> string ..." lines and see what happens |
| 92 | + (many1 ( string " " <?> "Failed to match space as a separator" |
| 93 | + <|> string "'" <?> "Failed to match single-quote char as a separator" |
| 94 | + <|> string "," <?> "Failed to match comma as a separator" |
| 95 | + <|> string "?" <?> "Failed to match question mark as a separator" |
| 96 | + <|> string "." <?> "Failed to match period as a separator" |
| 97 | + <?> "Could not find a character that separated the content..." |
| 98 | + ) |
| 99 | + ) |
| 100 | + |
| 101 | +badExtractWords :: Parser (NonEmptyList String) |
| 102 | +badExtractWords = do |
| 103 | + list <- endBy1 (regex "[a-zA-Z]+") |
| 104 | + -- try commenting out the below "<|> string ..." lines |
| 105 | + (many1 ( string " " <?> "Failed to match space as a separator" |
| 106 | + <|> string "'" <?> "Failed to match single-quote char as a separator" |
| 107 | + <|> string "," <?> "Failed to match comma as a separator" |
| 108 | + -- <|> string "?" <?> "Failed to match question mark as a separator" |
| 109 | + -- <|> string "." <?> "Failed to match period as a separator" |
| 110 | + <?> "Could not find a character that separated the content..." |
| 111 | + ) |
| 112 | + ) |
| 113 | + -- short for 'end of file' or 'end of content' |
| 114 | + eof <?> "Entire content should have been parsed but wasn't." |
| 115 | + pure list |
| 116 | + |
| 117 | +-- there are better ways of doing this using `whileM`, but this explains |
| 118 | +-- the basic idea: |
| 119 | +quotedLetterExists :: Parser Boolean |
| 120 | +quotedLetterExists = do |
| 121 | + let |
| 122 | + singleQuoteChar = string "'" |
| 123 | + betweenSingleQuotes parser = |
| 124 | + between singleQuoteChar singleQuoteChar parser |
| 125 | + |
| 126 | + list <- many ( true <$ (betweenSingleQuotes (char 'a') <?> "No 'a' found.") |
| 127 | + <|> false <$ anyChar |
| 128 | + ) |
| 129 | + pure $ foldl (||) false list |
| 130 | + |
| 131 | +-- Example Content 2 |
| 132 | + |
| 133 | +-- CSV sample with some inconsistent spacing |
| 134 | +exampleContent2 :: String |
| 135 | +exampleContent2 = |
| 136 | + "ID, FirstName, LastName, Age, Email\n\ |
| 137 | + \523, Mark, Kenderson, 24, [email protected]\n" |
| 138 | + |
| 139 | +type CsvContent = |
| 140 | + { idNumber :: String |
| 141 | + , firstName :: String |
| 142 | + , lastName :: String |
| 143 | + , age :: String |
| 144 | + , originalEmail :: String |
| 145 | + , modifiedEmail :: String |
| 146 | + } |
| 147 | + |
| 148 | +parseCSV :: Parser CsvContent |
| 149 | +parseCSV = do |
| 150 | + let |
| 151 | + commaThenSpaces = string "," *> skipSpaces |
| 152 | + idNumber_ = string "ID" |
| 153 | + firstName_ = string "FirstName" |
| 154 | + lastName_ = string "LastName" |
| 155 | + age_ = string "Age" |
| 156 | + email_ = string "Email" |
| 157 | + newline = string "\n" |
| 158 | + csvColumn = regex "[^,]+" |
| 159 | + |
| 160 | + -- parse headers but don't produce output |
| 161 | + void $ idNumber_ *> commaThenSpaces *> |
| 162 | + firstName_ *> commaThenSpaces *> |
| 163 | + lastName_ *> commaThenSpaces *> |
| 164 | + age_ *> commaThenSpaces *> |
| 165 | + email_ |
| 166 | + |
| 167 | + void newline |
| 168 | + |
| 169 | + -- now we're on line 2 |
| 170 | + idNumber <- csvColumn <* commaThenSpaces |
| 171 | + firstName <- csvColumn <* commaThenSpaces |
| 172 | + lastName <- csvColumn <* commaThenSpaces |
| 173 | + age <- csvColumn <* commaThenSpaces |
| 174 | + |
| 175 | + -- lookAhead will parse the content ahead of us, |
| 176 | + -- then reset the position of the string |
| 177 | + -- to what it was before it. |
| 178 | + originalEmail <- lookAhead $ regex "[^\n]+" |
| 179 | + |
| 180 | + let |
| 181 | + parseAlphanumericChars = regex "[a-zA-Z0-9]+" |
| 182 | + parsePeriodsAndPlusesAsEmptyStrings = |
| 183 | + "" <$ ((string ".") <|> (string "+")) |
| 184 | + parseListOfParts = |
| 185 | + many1 ( parseAlphanumericChars |
| 186 | + <|> parsePeriodsAndPlusesAsEmptyStrings |
| 187 | + ) |
| 188 | + |
| 189 | + usernameWithoutPeriodsOrPluses <- fold <$> parseListOfParts |
| 190 | + void $ string "@" |
| 191 | + domainName <- fold <$> (many1 ((regex "[a-zA-Z0-9]+") <|> (string "."))) |
| 192 | + void $ string "\n" |
| 193 | + |
| 194 | + -- Ensure we hit the end of the string content via 'end-of-file' |
| 195 | + void eof |
| 196 | + |
| 197 | + -- now return the parsed content |
| 198 | + pure { idNumber, firstName, lastName, age, originalEmail |
| 199 | + , modifiedEmail: usernameWithoutPeriodsOrPluses <> "@" <> domainName |
| 200 | + } |
| 201 | + |
| 202 | +-- Helper functions |
| 203 | + |
| 204 | +doBoth :: forall a. Show a => String -> Parser a -> String -> Effect Unit |
| 205 | +doBoth parserName parser content = do |
| 206 | + doRunParser parserName parser content |
| 207 | + doUnParser parserName parser content |
| 208 | + |
| 209 | +-- | Shows the results of calling `unParser`. You typically want to use |
| 210 | +-- | this function when writing a parser because it includes other info |
| 211 | +-- | to help you debug your code. |
| 212 | +doUnParser :: forall a. Show a => String -> Parser a -> String -> Effect Unit |
| 213 | +doUnParser parserName parser content = do |
| 214 | + log $ "(unParser) Parsing content with '" <> parserName <> "'" |
| 215 | + case unParser parser { str: content, pos: 0 } of |
| 216 | + Left rec -> log $ "Position: " <> show rec.pos <> "\n\ |
| 217 | + \Error: " <> show rec.error |
| 218 | + Right rec -> log $ "Result was: " <> show rec.result <> "\n\ |
| 219 | + \Suffix was: " <> show rec.suffix |
| 220 | + log "-----" |
| 221 | + |
| 222 | + |
| 223 | +-- | Shows the results of calling `runParser`. You typically don't want to use |
| 224 | +-- | this function when writing a parser because it doesn't help you debug |
| 225 | +-- | your code when you write it incorrectly. |
| 226 | +doRunParser :: forall a. Show a => String -> Parser a -> String -> Effect Unit |
| 227 | +doRunParser parserName parser content = do |
| 228 | + log $ "(runParser) Parsing content with '" <> parserName <> "'" |
| 229 | + case runParser parser content of |
| 230 | + Left error -> logShow error |
| 231 | + Right result -> log $ "Result was: " <> show result |
| 232 | + log "-----" |
0 commit comments