{-# LANGUAGE OverloadedStrings #-} module Parser where import Data.Monoid import Data.ByteString.Lazy (ByteString) import qualified Data.Text as T import Data.Text (Text, unpack) import Text.XML.Cursor import Text.HTML.DOM (parseLBS) import Text.HTML.TagSoup.Entity (lookupEntity) data Comic = Comic { title :: String , descr :: String , imgUrl :: String , page :: Int } parseArchive :: ByteString -> [Text] parseArchive str = mkCursor str $// selectLinks &| extractUrl where selectLinks = element "a" &/ element "img" >=> parent extractUrl = T.concat . attribute "href" parseComic :: ByteString -> Int -> Comic parseComic str n = Comic { title = decodeHtml $ unpack $ head $ cur $// selectTitle &| extractContent , descr = decodeHtml $ unpack $ head $ cur $// selectImg &| extractDescr , imgUrl = unpack $ head $ cur $// selectImg &| extractUrl , page = n } where selectTitle = element "title" >=> child selectImg = element "img" >=> attributeIs "id" "strip" extractUrl = T.concat . attribute "src" extractDescr = T.concat . attribute "title" extractContent = T.concat . content cur = mkCursor str isLastPage :: ByteString -> Bool isLastPage str = (cur $// page &| link) == (cur $// story &| link) where cur = mkCursor str page = element "div" >=> attributeIs "id" "nx" >=> parent story = element "div" >=> attributeIs "id" "ns" >=> parent link = T.concat . attribute "href" mkCursor :: ByteString -> Cursor mkCursor = fromDocument . parseLBS decodeHtml :: String -> String decodeHtml [] = [] decodeHtml ('&' : xs) = let (b, a) = break (== ';') xs in case (lookupEntity b, a) of (Just c, ';' : as) -> c <> decodeHtml as _ -> '&' : decodeHtml xs decodeHtml (x : xs) = x : decodeHtml xs