Refactor tag extraction and fix shadowing of the `string` module name
This commit is contained in:
parent
38803375f5
commit
4814abe286
@ -25,7 +25,7 @@ import functools
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import string
|
from string import ascii_lowercase
|
||||||
|
|
||||||
from PyQt5.QtCore import (pyqtSignal, pyqtSlot, QObject, QEvent, Qt, QUrl,
|
from PyQt5.QtCore import (pyqtSignal, pyqtSlot, QObject, QEvent, Qt, QUrl,
|
||||||
QTimer)
|
QTimer)
|
||||||
@ -161,7 +161,7 @@ class HintManager(QObject):
|
|||||||
def _initialize_word_hints(self):
|
def _initialize_word_hints(self):
|
||||||
if not self._words:
|
if not self._words:
|
||||||
with open(config.get("hints", "dictionary")) as wordfile:
|
with open(config.get("hints", "dictionary")) as wordfile:
|
||||||
alphabet = set(string.ascii_lowercase)
|
alphabet = set(ascii_lowercase)
|
||||||
hints = set()
|
hints = set()
|
||||||
lines = (line.rstrip().lower() for line in wordfile)
|
lines = (line.rstrip().lower() for line in wordfile)
|
||||||
for word in lines:
|
for word in lines:
|
||||||
@ -254,39 +254,53 @@ class HintManager(QObject):
|
|||||||
Return:
|
Return:
|
||||||
A list of hint strings, in the same order as the elements.
|
A list of hint strings, in the same order as the elements.
|
||||||
"""
|
"""
|
||||||
def html_elem_to_hints(elem):
|
just_get_it = lambda tag: lambda elem: elem[tag]
|
||||||
candidates = []
|
take_last_part = lambda tag: lambda elem: elem[tag].split('/')[-1]
|
||||||
if elem.tagName() == "IMG":
|
tag_extractors = {
|
||||||
"alt" in elem and candidates.append(elem["alt"])
|
"alt": just_get_it("alt"),
|
||||||
"title" in elem and candidates.append(elem["title"])
|
"title": just_get_it("title"),
|
||||||
"src" in elem and candidates.append(elem["src"].split('/')[-1])
|
"src": take_last_part("src"),
|
||||||
elif elem.tagName() == "A":
|
"href": take_last_part("href"),
|
||||||
candidates.append(str(elem))
|
"name": just_get_it("name"),
|
||||||
"title" in elem and candidates.append(elem["title"])
|
}
|
||||||
"href" in elem and candidates.append(elem["href"].split('/')[-1])
|
|
||||||
elif elem.tagName() == "INPUT":
|
tags_for = collections.defaultdict(list, {
|
||||||
"name" in elem and candidates.append(elem["name"])
|
"IMG": ["alt", "title", "src"],
|
||||||
for candidate in candidates:
|
"A": ["title", "href"],
|
||||||
if not candidate:
|
"INPUT": ["name"],
|
||||||
continue
|
})
|
||||||
|
|
||||||
|
def extract_tag_words(elem):
|
||||||
|
if elem.tagName() == "A":
|
||||||
|
# link text is a special case, alas.
|
||||||
|
yield str(elem)
|
||||||
|
yield from (tag_extractors[tag](elem)
|
||||||
|
for tag in tags_for[elem.tagName()]
|
||||||
|
if tag in elem)
|
||||||
|
|
||||||
|
def tag_words_to_hints(words):
|
||||||
|
for candidate in filter(bool, words):
|
||||||
match = self.FIRST_ALPHABETIC.search(candidate)
|
match = self.FIRST_ALPHABETIC.search(candidate)
|
||||||
if not match:
|
if not match:
|
||||||
continue
|
continue
|
||||||
|
if match.end() - match.start() < 4:
|
||||||
|
continue
|
||||||
yield candidate[match.start():match.end()].lower()
|
yield candidate[match.start():match.end()].lower()
|
||||||
|
|
||||||
def any_prefix(hint, existing):
|
def any_prefix(hint, existing):
|
||||||
return any(hint.startswith(e) or e.startswith(hint) for e in existing)
|
return any(hint.startswith(e) or e.startswith(hint)
|
||||||
|
for e in existing)
|
||||||
|
|
||||||
def first_good_hint(new, existing):
|
def new_hint_for(elem, existing):
|
||||||
new = filter(bool, new)
|
new = tag_words_to_hints(extract_tag_words(elem))
|
||||||
new = filter(lambda h: len(h) > 4, new)
|
|
||||||
new = filter(lambda h: not any_prefix(h, existing), new)
|
new = filter(lambda h: not any_prefix(h, existing), new)
|
||||||
return next(new, None) # either the first good, or None
|
# either the first good, or None
|
||||||
|
return next(new, None)
|
||||||
|
|
||||||
hints = []
|
hints = []
|
||||||
used_hints = set()
|
used_hints = set()
|
||||||
for elem in elems:
|
for elem in elems:
|
||||||
hint = first_good_hint(html_elem_to_hints(elem), used_hints) or next(words)
|
hint = new_hint_for(elem, used_hints) or next(words)
|
||||||
used_hints.add(hint)
|
used_hints.add(hint)
|
||||||
hints.append(hint)
|
hints.append(hint)
|
||||||
return hints
|
return hints
|
||||||
|
Loading…
Reference in New Issue
Block a user