{-# LANGUAGE OverloadedStrings #-} module Parser where import Data.Monoid import Data.ByteString.Lazy (ByteString) import qualified Data.Text as T import Data.Text (Text, unpack) import Text.XML.Cursor import Text.HTML.DOM (parseLBS) import Text.HTML.TagSoup.Entity (lookupEntity) data Comic = Comic { title :: String , descr :: String , imgUrl :: String , page :: Int } parseArchive :: ByteString -> [Text] parseArchive str = mkCursor str $// selectLinks &| extractUrl where selectLinks = element "a" &/ element "img" >=> parent extractUrl = T.concat . attribute "href" parseComic :: ByteString -> Int -> Comic parseComic str n = Comic { title = decodeHtml $ unpack $ head $ cur $// selectTitle &| extractContent , descr = decodeHtml $ unpack $ head $ cur $// selectImg &| extractDescr , imgUrl = unpack $ head $ cur $// selectImg &| extractUrl , page = n } where selectTitle = element "title" >=> child selectImg = element "img" >=> attributeIs "id" "strip" extractUrl = T.concat . attribute "src" extractDescr = T.concat . attribute "title" extractContent = T.concat . content cur = mkCursor str mkCursor :: ByteString -> Cursor mkCursor = fromDocument . parseLBS decodeHtml :: String -> String decodeHtml [] = [] decodeHtml ('&' : xs) = let (b, a) = break (== ';') xs in case (lookupEntity b, a) of (Just c, ';' : as) -> c <> decodeHtml as _ -> '&' : decodeHtml xs decodeHtml (x : xs) = x : decodeHtml xs