refactor tag extraction and fix string shadowing

This commit is contained in:
Felix Van der Jeugt 2015-12-18 23:06:26 +01:00
parent 38803375f5
commit 4814abe286

View File

@ -25,7 +25,7 @@ import functools
import math import math
import os import os
import re import re
import string from string import ascii_lowercase
from PyQt5.QtCore import (pyqtSignal, pyqtSlot, QObject, QEvent, Qt, QUrl, from PyQt5.QtCore import (pyqtSignal, pyqtSlot, QObject, QEvent, Qt, QUrl,
QTimer) QTimer)
@ -161,7 +161,7 @@ class HintManager(QObject):
def _initialize_word_hints(self): def _initialize_word_hints(self):
if not self._words: if not self._words:
with open(config.get("hints", "dictionary")) as wordfile: with open(config.get("hints", "dictionary")) as wordfile:
alphabet = set(string.ascii_lowercase) alphabet = set(ascii_lowercase)
hints = set() hints = set()
lines = (line.rstrip().lower() for line in wordfile) lines = (line.rstrip().lower() for line in wordfile)
for word in lines: for word in lines:
@ -254,39 +254,53 @@ class HintManager(QObject):
Return: Return:
A list of hint strings, in the same order as the elements. A list of hint strings, in the same order as the elements.
""" """
def html_elem_to_hints(elem): just_get_it = lambda tag: lambda elem: elem[tag]
candidates = [] take_last_part = lambda tag: lambda elem: elem[tag].split('/')[-1]
if elem.tagName() == "IMG": tag_extractors = {
"alt" in elem and candidates.append(elem["alt"]) "alt": just_get_it("alt"),
"title" in elem and candidates.append(elem["title"]) "title": just_get_it("title"),
"src" in elem and candidates.append(elem["src"].split('/')[-1]) "src": take_last_part("src"),
elif elem.tagName() == "A": "href": take_last_part("href"),
candidates.append(str(elem)) "name": just_get_it("name"),
"title" in elem and candidates.append(elem["title"]) }
"href" in elem and candidates.append(elem["href"].split('/')[-1])
elif elem.tagName() == "INPUT": tags_for = collections.defaultdict(list, {
"name" in elem and candidates.append(elem["name"]) "IMG": ["alt", "title", "src"],
for candidate in candidates: "A": ["title", "href"],
if not candidate: "INPUT": ["name"],
continue })
def extract_tag_words(elem):
if elem.tagName() == "A":
# The link text is a special case: it comes from str(elem), not from a tag_extractors lookup.
yield str(elem)
yield from (tag_extractors[tag](elem)
for tag in tags_for[elem.tagName()]
if tag in elem)
def tag_words_to_hints(words):
for candidate in filter(bool, words):
match = self.FIRST_ALPHABETIC.search(candidate) match = self.FIRST_ALPHABETIC.search(candidate)
if not match: if not match:
continue continue
if match.end() - match.start() < 4:
continue
yield candidate[match.start():match.end()].lower() yield candidate[match.start():match.end()].lower()
def any_prefix(hint, existing): def any_prefix(hint, existing):
return any(hint.startswith(e) or e.startswith(hint) for e in existing) return any(hint.startswith(e) or e.startswith(hint)
for e in existing)
def first_good_hint(new, existing): def new_hint_for(elem, existing):
new = filter(bool, new) new = tag_words_to_hints(extract_tag_words(elem))
new = filter(lambda h: len(h) > 4, new)
new = filter(lambda h: not any_prefix(h, existing), new) new = filter(lambda h: not any_prefix(h, existing), new)
return next(new, None) # either the first good hint, or None
return next(new, None)
hints = [] hints = []
used_hints = set() used_hints = set()
for elem in elems: for elem in elems:
hint = first_good_hint(html_elem_to_hints(elem), used_hints) or next(words) hint = new_hint_for(elem, used_hints) or next(words)
used_hints.add(hint) used_hints.add(hint)
hints.append(hint) hints.append(hint)
return hints return hints