oglaf/Parser.hs

64 lines
1.8 KiB
Haskell
Raw Permalink Normal View History

2015-10-09 15:36:53 +02:00
{-# LANGUAGE OverloadedStrings #-}
module Parser where
import Data.Monoid
import Data.ByteString.Lazy (ByteString)
import qualified Data.Text as T
import Data.Text (Text, unpack)
import Text.XML.Cursor
import Text.HTML.DOM (parseLBS)
import Text.HTML.TagSoup.Entity (lookupEntity)
data Comic = Comic
{ title :: String
, descr :: String
, imgUrl :: String
, page :: Int
}
parseArchive :: ByteString -> [Text]
parseArchive str = mkCursor str $// selectLinks &| extractUrl
where
selectLinks = element "a" &/ element "img" >=> parent
extractUrl = T.concat . attribute "href"
parseComic :: ByteString -> Int -> Comic
parseComic str n = Comic
{ title = decodeHtml $ unpack $ head $ cur $// selectTitle &| extractContent
, descr = decodeHtml $ unpack $ head $ cur $// selectImg &| extractDescr
, imgUrl = unpack $ head $ cur $// selectImg &| extractUrl
, page = n
} where
selectTitle = element "title" >=> child
selectImg = element "img" >=> attributeIs "id" "strip"
extractUrl = T.concat . attribute "src"
extractDescr = T.concat . attribute "title"
extractContent = T.concat . content
cur = mkCursor str
2016-12-17 15:15:03 +01:00
isLastPage :: ByteString -> Bool
isLastPage str = (cur $// page &| link) == (cur $// story &| link)
where
cur = mkCursor str
page = element "div" >=> attributeIs "id" "nx" >=> parent
story = element "div" >=> attributeIs "id" "ns" >=> parent
link = T.concat . attribute "href"
2015-10-09 15:36:53 +02:00
mkCursor :: ByteString -> Cursor
mkCursor = fromDocument . parseLBS
decodeHtml :: String -> String
decodeHtml [] = []
decodeHtml ('&' : xs) =
let (b, a) = break (== ';') xs in
case (lookupEntity b, a) of
(Just c, ';' : as) -> c <> decodeHtml as
_ -> '&' : decodeHtml xs
decodeHtml (x : xs) = x : decodeHtml xs