qutebrowser/scripts/dev/ua_fetch.py
2018-02-05 12:19:50 +01:00

123 lines
4.1 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
# Copyright 2015-2018 lamarpavel
# Copyright 2015-2018 Alexey Nabrodov (Averrin)
# Copyright 2015-2018 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
"""Fetch list of popular user-agents.
The script is based on a gist posted by github.com/averrin. Its output is
formatted to be pasted into configdata.yml.
"""
import requests
from lxml import html # pylint: disable=import-error
def fetch():
    """Fetch the table of popular user-agents from the web.

    Downloads the "most common user agents" blog post and extracts the body of
    the table found at a fixed XPath inside it.

    Return:
        The first element matching the table-body XPath. filter_list()
        iterates over it and reads the text of cells 1 and 2 of every row.

    Raise:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has no element at the expected XPath (e.g.
            because the page layout changed).
    """
    url = 'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/'
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page below.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    path = '//*[@id="post-2229"]/div[2]/table/tbody'
    return tree.xpath(path)[0]
def filter_list(complete_list, browsers):
    """Filter the received list based on a look up table.

    The LUT should be a dictionary of the format {browser: versions}, where
    'browser' is the name of the browser (eg. "Firefox") as string and
    'versions' is a collection of different versions of this browser that
    should be included when found (eg. {"Linux", "MacOSX"}). Matching is a
    case-insensitive substring test against the browser description.

    Every (browser, version) pair is used at most once: only the first entry
    of complete_list matching it is kept, later ones are skipped. The LUT
    passed in is left unmodified.

    Args:
        complete_list: Iterable of rows; row[1] and row[2] must provide a
                       text_content() method returning the user agent and the
                       browser description respectively.
        browsers: The look up table described above.

    Return:
        A dictionary mapping each browser name for which at least one entry
        matched to a list of (user_agent, browser_description) tuples.
        Browsers without any match have no key in the result.
    """
    # Work on a private copy so consuming versions below does not empty the
    # caller's look up table as a side effect.
    remaining = {name: set(versions) for name, versions in browsers.items()}
    table = {}

    for entry in complete_list:
        user_agent = entry[1].text_content()
        description = entry[2].text_content()
        lowered = description.lower()

        for name, versions in remaining.items():
            if name.lower() not in lowered:
                continue
            matched = next(
                (version for version in versions
                 if version.lower() in lowered),
                None)
            if matched is None:
                # Right browser but no wanted version left; another browser
                # name may still match this description.
                continue
            # Consume the version so only its first occurrence is kept.
            versions.remove(matched)
            table.setdefault(name, []).append((user_agent, description))
            break

    return table
def add_diversity(table):
    """Insert a few additional entries for diversity into the dict.

    Adds (or overwrites) the "Obscure" key of the dict returned by
    filter_list() with a fixed list of (user_agent, browser_description)
    tuples for crawlers, command line clients and some other browsers.

    Args:
        table: The dictionary to extend; it is modified in place.

    Return:
        The same dictionary object that was passed in.
    """
    table["Obscure"] = [
        # The closing parenthesis after bot.html is part of the user agent
        # Google documents for Googlebot.
        ('Mozilla/5.0 (compatible; Googlebot/2.1; '
         '+http://www.google.com/bot.html)',
         "Google Bot"),
        ('Wget/1.16.1 (linux-gnu)',
         "wget 1.16.1"),
        ('curl/7.40.0',
         "curl 7.40.0"),
        ('Mozilla/5.0 (Linux; U; Android 7.1.2) AppleWebKit/534.30 '
         '(KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
         "Mobile Generic Android"),
        ('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like '
         'Gecko',
         "IE 11.0 for Desktop Win7 64-bit"),
    ]
    return table
def main():
    """Generate user agent code.

    Fetches the user agent table, keeps one entry per wanted browser/platform
    combination, appends the fixed "Obscure" entries and prints everything to
    stdout as nested list items, with an empty line after each browser group.
    """
    fetched = fetch()
    lut = {
        "Firefox": {"Win", "MacOSX", "Linux", "Android"},
        "Chrome": {"Win", "MacOSX", "Linux"},
        "Safari": {"MacOSX", "iOS"},
    }
    filtered = filter_list(fetched, lut)
    filtered = add_diversity(filtered)

    tab = " "
    for browser in ["Firefox", "Safari", "Chrome", "Obscure"]:
        # filter_list() only creates a key once an entry matched, so a
        # browser missing from the fetched page must not raise KeyError.
        for user_agent, description in filtered.get(browser, []):
            print('{}- - "{}"'.format(3 * tab, user_agent))
            # Collapse every run of whitespace (including the non-breaking
            # spaces the scraped descriptions may contain) into one space.
            desc = ' '.join(description.split())
            print("{}- {}".format(4 * tab, desc))
        print("")
# Only generate the output when executed as a script, not when imported.
if __name__ == '__main__':
    main()