Improve script to print entries grouped by browser

The ua_fetch.py script has been rewritten entirely to select entries for
each browser based on diversity rather than popularity alone. The output
is now formatted so that the entries for each browser are printed together.
This commit is contained in:
Lamar Pavel 2015-11-14 15:57:24 +01:00
parent 45f9e61815
commit d745819715
2 changed files with 97 additions and 46 deletions

View File

@ -1564,6 +1564,22 @@ class UserAgent(BaseType):
def complete(self):
"""Complete a list of common user agents."""
out = [
('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 '
'Firefox/41.0',
"Firefox 41.0 Win7 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 '
'Firefox/40.0',
"Firefox 40.0 Win7 64-bit"),
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
'AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 '
'Safari/600.8.9',
"Safari 8.0 MacOSX"),
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) '
'AppleWebKit/601.1.56 (KHTML, like Gecko) Version/9.0 '
'Safari/601.1.56',
"Safari Generic MacOSX"),
('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/45.0.2454.101 Safari/537.36',
"Chrome 45.0 Win7 64-bit"),
@ -1571,29 +1587,12 @@ class UserAgent(BaseType):
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 '
'Safari/537.36',
"Chrome 45.0 MacOSX"),
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
'AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 '
'Safari/600.8.9',
"Safari 8.0 MacOSX"),
('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/45.0.2454.101 Safari/537.36',
"Chrome 45.0 Win10 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 '
'Firefox/41.0',
"Firefox 41.0 Win7 64-bit"),
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) '
'AppleWebKit/601.1.56 (KHTML, like Gecko) Version/9.0 '
'Safari/601.1.56',
"Safari Generic MacOSX"),
('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/45.0.2454.101 Safari/537.36',
"Chrome 45.0 Win8.1 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 '
'Firefox/40.0',
"Firefox 40.0 Win7 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like '
'Gecko',
"IE 11.0 for Desktop Win7 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/45.0.2454.93 Safari/537.36',
"Chrome 45.0 Win7 64-bit"),
@ -1605,6 +1604,10 @@ class UserAgent(BaseType):
'like Gecko) Chrome/45.0.2454.99 Safari/537.36',
"Chrome 45.0 Win7 64-bit"),
('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like '
'Gecko',
"IE 11.0 for Desktop Win7 64-bit"),
('Mozilla/5.0 (compatible; Googlebot/2.1; '
'+http://www.google.com/bot.html',
"Google Bot"),

View File

@ -29,33 +29,81 @@ script is formatted to be pasted into configtypes.py.
import requests
from lxml import html # pylint: disable=import-error
# Fetch the page listing popular user agents and keep the first element that
# matches the XPath of the table body.
url = 'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/'
page = requests.get(url)
page = html.fromstring(page.text)
path = '//*[@id="post-2229"]/div[2]/table/tbody'
# Raises IndexError if the page contains no element matching the XPath.
table = page.xpath(path)[0]
indent = " "
# Print the function definition followed by an automatically fetched list of
# popular user agents and a few additional entries for diversity.
print("%sdef complete(self):" % indent)
print("%s\"\"\"Complete a list of common user agents.\"\"\"" % (2 * indent))
print("%sout = [" % (2 * indent))
# Only the first 12 children of the table body are used.
for row in table[:12]:
# Presumably child 1 of a row holds the user-agent string and child 2 the
# browser description -- verify against the fetched page.
ua = row[1].text_content()
browser = row[2].text_content()
print("%s(\'%s\',\n%s \"%s\")," % (3 * indent, ua, 3 * indent, browser))
# Hard-coded mobile entries appended after the fetched ones.
print("""
('Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_2 like Mac OS X) '
'AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 '
'Mobile/12B440 Safari/600.1.4',
"Mobile Safari 8.0 iOS"),
('Mozilla/5.0 (Android; Mobile; rv:35.0) Gecko/35.0 Firefox/35.0',
"Firefox 35, Android"),
('Mozilla/5.0 (Linux; Android 5.0.2; One Build/KTU84L.H4) '
'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 '
'Chrome/37.0.0.0 Mobile Safari/537.36',
"Android Browser")
""")
# Close the list literal and emit the return statement of complete().
print("%s]\n%sreturn out\n" % (2 * indent, 2 * indent))
def fetch():
    """Fetch the table of popular user agents.

    Downloads the "most common user agents" page and returns the first
    element matching the XPath of the table body; callers index into its
    children to read the individual rows.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
        IndexError: if the page contains no element matching the XPath.
    """
    url = 'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/'
    # Without an explicit timeout, requests waits indefinitely on a server
    # that stops responding.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    path = '//*[@id="post-2229"]/div[2]/table/tbody'
    return tree.xpath(path)[0]
def filter_list(complete_list, browsers):
    """Filter the fetched entries based on a look-up table.

    Args:
        complete_list: Iterable of rows. For each row, ``row[1]`` and
            ``row[2]`` must offer ``text_content()``; they are read as the
            user-agent string and the browser description respectively.
        browsers: Look-up table of the format ``{browser: versions}``, where
            ``browser`` is the name of a browser (e.g. "Firefox") and
            ``versions`` is a set of keywords (e.g. {"Linux", "MacOSX"})
            that should each be included once when found in a description.
            The table is not modified.

    Returns:
        A dict mapping browser names to lists of
        ``(user_agent, browser_description)`` tuples. Only browsers with at
        least one match get a key. Matching is case-insensitive and every
        keyword is consumed by the first entry that matches it.
    """
    # Work on private copies so that consuming a keyword does not empty the
    # caller's look-up table.
    remaining = {name: set(versions) for name, versions in browsers.items()}
    table = {}
    for entry in complete_list:
        # Tuple of (user_agent, browser_description)
        candidate = (entry[1].text_content(), entry[2].text_content())
        description = candidate[1].lower()
        for name, versions in remaining.items():
            if name.lower() not in description:
                continue
            matched = next(
                (ver for ver in versions if ver.lower() in description), None)
            if matched is None:
                # Browser name fits but no keyword is left for it; the entry
                # may still belong to another browser.
                continue
            table.setdefault(name, []).append(candidate)
            versions.remove(matched)
            # An entry is stored for at most one browser.
            break
    return table
def add_diversity(table):
    """Insert a few additional entries for diversity.

    Args:
        table: Dict as returned by filter_list(). It is modified in place:
            the key "Obscure" is set (replacing any existing value) to a
            list of hand-picked ``(user_agent, description)`` tuples.

    Returns:
        The same dict object that was passed in.
    """
    table["Obscure"] = [
        # The closing parenthesis belongs to the user-agent string itself.
        ('Mozilla/5.0 (compatible; Googlebot/2.1; '
         '+http://www.google.com/bot.html)',
         "Google Bot"),
        ('Wget/1.16.1 (linux-gnu)',
         "wget 1.16.1"),
        ('curl/7.40.0',
         "curl 7.40.0")
    ]
    return table
if __name__ == '__main__':
    # Fetch the table, pick entries per browser according to the look-up
    # table and print them as the body of a complete() method.
    fetched = fetch()
    # Browser name -> keywords of which each should be represented once.
    lut = {
        "Firefox": {"Win", "MacOSX", "Linux", "Android"},
        "Chrome": {"Win", "MacOSX", "Linux"},
        "Safari": {"MacOSX", "iOS"}
    }
    filtered = filter_list(fetched, lut)
    filtered = add_diversity(filtered)

    # NOTE(review): the indent unit and the literal block below are
    # reproduced exactly as they appear in this view -- confirm the intended
    # widths before pasting the output into configtypes.py.
    tab = " "
    print("%sdef complete(self):" % tab)
    print("%s\"\"\"Complete a list of common user agents.\"\"\"" % (2 * tab))
    print("%sout = [" % (2 * tab))

    for browser in ["Firefox", "Safari", "Chrome", "Obscure"]:
        # filter_list() only creates a key once it has found a match, so a
        # browser missing from the fetched table must not abort the script.
        for it in filtered.get(browser, []):
            print("%s(\'%s\',\n%s \"%s\")," % (3 * tab, it[0], 3 * tab, it[1]))
        # Blank line separating the groups.
        print("")

    # Hard-coded final entry; it carries no trailing comma because it closes
    # the list.
    print("""\
('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like '
'Gecko',
"IE 11.0 for Desktop Win7 64-bit")""")
    print("%s]\n%sreturn out\n" % (2 * tab, 2 * tab))