oglaf/Parser.hs

55 lines
1.5 KiB
Haskell
Raw Normal View History

2015-10-09 15:36:53 +02:00
{-# LANGUAGE OverloadedStrings #-}
module Parser where
import Data.Monoid
import Data.ByteString.Lazy (ByteString)
import qualified Data.Text as T
import Data.Text (Text, unpack)
import Text.XML.Cursor
import Text.HTML.DOM (parseLBS)
import Text.HTML.TagSoup.Entity (lookupEntity)
data Comic = Comic
{ title :: String
, descr :: String
, imgUrl :: String
, page :: Int
}
parseArchive :: ByteString -> [Text]
parseArchive str = mkCursor str $// selectLinks &| extractUrl
where
selectLinks = element "a" &/ element "img" >=> parent
extractUrl = T.concat . attribute "href"
parseComic :: ByteString -> Int -> Comic
parseComic str n = Comic
{ title = decodeHtml $ unpack $ head $ cur $// selectTitle &| extractContent
, descr = decodeHtml $ unpack $ head $ cur $// selectImg &| extractDescr
, imgUrl = unpack $ head $ cur $// selectImg &| extractUrl
, page = n
} where
selectTitle = element "title" >=> child
selectImg = element "img" >=> attributeIs "id" "strip"
extractUrl = T.concat . attribute "src"
extractDescr = T.concat . attribute "title"
extractContent = T.concat . content
cur = mkCursor str
mkCursor :: ByteString -> Cursor
mkCursor = fromDocument . parseLBS
decodeHtml :: String -> String
decodeHtml [] = []
decodeHtml ('&' : xs) =
let (b, a) = break (== ';') xs in
case (lookupEntity b, a) of
(Just c, ';' : as) -> c <> decodeHtml as
_ -> '&' : decodeHtml xs
decodeHtml (x : xs) = x : decodeHtml xs