55 lines
1.5 KiB
Haskell
55 lines
1.5 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Parser where
|
|
|
|
import Data.Monoid
|
|
import Data.ByteString.Lazy (ByteString)
|
|
import qualified Data.Text as T
|
|
import Data.Text (Text, unpack)
|
|
import Text.XML.Cursor
|
|
import Text.HTML.DOM (parseLBS)
|
|
import Text.HTML.TagSoup.Entity (lookupEntity)
|
|
|
|
data Comic = Comic
|
|
{ title :: String
|
|
, descr :: String
|
|
, imgUrl :: String
|
|
, page :: Int
|
|
}
|
|
|
|
|
|
parseArchive :: ByteString -> [Text]
|
|
parseArchive str = mkCursor str $// selectLinks &| extractUrl
|
|
where
|
|
selectLinks = element "a" &/ element "img" >=> parent
|
|
extractUrl = T.concat . attribute "href"
|
|
|
|
|
|
parseComic :: ByteString -> Int -> Comic
|
|
parseComic str n = Comic
|
|
{ title = decodeHtml $ unpack $ head $ cur $// selectTitle &| extractContent
|
|
, descr = decodeHtml $ unpack $ head $ cur $// selectImg &| extractDescr
|
|
, imgUrl = unpack $ head $ cur $// selectImg &| extractUrl
|
|
, page = n
|
|
} where
|
|
selectTitle = element "title" >=> child
|
|
selectImg = element "img" >=> attributeIs "id" "strip"
|
|
extractUrl = T.concat . attribute "src"
|
|
extractDescr = T.concat . attribute "title"
|
|
extractContent = T.concat . content
|
|
cur = mkCursor str
|
|
|
|
|
|
mkCursor :: ByteString -> Cursor
|
|
mkCursor = fromDocument . parseLBS
|
|
|
|
|
|
decodeHtml :: String -> String
|
|
decodeHtml [] = []
|
|
decodeHtml ('&' : xs) =
|
|
let (b, a) = break (== ';') xs in
|
|
case (lookupEntity b, a) of
|
|
(Just c, ';' : as) -> c <> decodeHtml as
|
|
_ -> '&' : decodeHtml xs
|
|
decodeHtml (x : xs) = x : decodeHtml xs
|