qutebrowser/scripts/importer.py

140 lines
5.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2014-10-15 20:43:47 +02:00
# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
2017-05-09 21:37:03 +02:00
# Copyright 2014-2017 Claude (longneck) <longneck@scratchbook.ch>
# Copyright 2014-2017 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <http://www.gnu.org/licenses/>.
2014-10-15 06:20:54 +02:00
"""Tool to import data from other browsers.

Currently only importing bookmarks from Netscape Bookmark files is supported.
"""
2014-10-15 06:16:12 +02:00
import argparse
def main():
    """Translate the command line flags into an import job and run it.

    The parsed arguments decide two things: which kinds of entries are
    read from the bookmarks file (``bookmark_types``) and in which format
    they are written (``output_format``).
    """
    args = get_args()

    if args.search_query or args.search_output:
        # Search engines are handled exclusively: asking for them either
        # as input or as output selects both the type and the format.
        bookmark_types = ['search']
        output_format = 'ncsearch' if args.newconfig else 'search'
    else:
        if args.bookmark_output:
            output_format = 'bookmark'
        elif args.quickmark_output:
            output_format = 'quickmark'
        else:
            output_format = ''
        requested = (('bookmark', args.bookmark_query),
                     ('keyword', args.keyword_query))
        bookmark_types = [typ for typ, wanted in requested if wanted]

    # Fall back to the defaults when nothing was selected explicitly.
    bookmark_types = bookmark_types or ['bookmark', 'keyword']
    output_format = output_format or 'quickmark'

    # All currently supported browsers export the same Netscape format.
    if args.browser in ['chromium', 'firefox', 'ie']:
        import_netscape_bookmarks(
            args.bookmarks, bookmark_types, output_format)
2014-10-15 06:16:12 +02:00
def get_args():
    """Build the command line parser and parse the program arguments.

    Returns:
        The argparse.Namespace produced by parsing ``sys.argv``, with the
        attributes ``browser``, ``bookmarks``, ``bookmark_output``,
        ``quickmark_output``, ``search_output``, ``newconfig``,
        ``search_query``, ``bookmark_query`` and ``keyword_query``.
    """
    browsers = ['chromium', 'firefox', 'ie']
    parser = argparse.ArgumentParser(
        epilog="To import bookmarks from Chromium, Firefox or IE, "
               "export them to HTML in your browser's bookmark manager. "
               "By default, this script will output in a quickmarks format.")
    # The help text is derived from the choices so the two cannot drift
    # apart (the metavar hides the choices list in the usage line).
    parser.add_argument(
        'browser',
        help="Which browser? ({})".format(', '.join(browsers)),
        choices=browsers, metavar='browser')

    # Output format selection. 'store_true' already implies an optional
    # flag defaulting to False.
    parser.add_argument('-b', help="Output in bookmark format.",
                        dest='bookmark_output', action='store_true')
    parser.add_argument('-q', help="Output in quickmark format (default).",
                        dest='quickmark_output', action='store_true')
    parser.add_argument('-s', help="Output search engine format",
                        dest='search_output', action='store_true')
    parser.add_argument(
        '--newconfig',
        help="Output search engine format for new config.py format",
        action='store_true')

    # Selection of the entries to read from the bookmarks file.
    parser.add_argument('-S', help="Import search engines",
                        dest='search_query', action='store_true')
    parser.add_argument('-B', help="Import plain bookmarks (no keywords)",
                        dest='bookmark_query', action='store_true')
    parser.add_argument('-K', help="Import keywords (no search)",
                        dest='keyword_query', action='store_true')

    parser.add_argument('bookmarks', help="Bookmarks file (html format)")
    return parser.parse_args()
2016-03-20 13:02:04 +01:00
def import_netscape_bookmarks(bookmarks_file, bookmark_types, output_format):
    """Import bookmarks from a NETSCAPE-Bookmark-file v1.

    Generated by Chromium, Firefox, IE and possibly more browsers.
    The converted entries are printed to stdout, one per line.

    Args:
        bookmarks_file: Path of the exported HTML file (read as UTF-8).
        bookmark_types: Iterable of the entry kinds to extract; each one
                        of 'search', 'keyword' or 'bookmark'.
        output_format: One of 'ncsearch', 'search', 'bookmark' or
                       'quickmark'.

    Raises:
        KeyError: If an entry kind is unknown, or if a matching entry is
                  found for a kind the output format has no template for.
    """
    import bs4
    with open(bookmarks_file, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')

    def is_link(tag):
        # Anchors without a target cannot be converted, so they are
        # skipped instead of raising KeyError further down.
        return tag.name == 'a' and 'href' in tag.attrs

    def is_search(tag):
        return (is_link(tag) and 'shortcuturl' in tag.attrs and
                '%s' in tag['href'])

    def is_keyword(tag):
        return (is_link(tag) and 'shortcuturl' in tag.attrs and
                '%s' not in tag['href'])

    def is_bookmark(tag):
        return (is_link(tag) and 'shortcuturl' not in tag.attrs and
                bool(tag.string))

    bookmark_query = {
        'search': is_search,
        'keyword': is_keyword,
        'bookmark': is_bookmark,
    }
    output_template = {
        'ncsearch': {
            'search': "c.url.searchengines['{tag[shortcuturl]}'] = "
                      "'{tag[href]}' #{tag.string}",
        },
        'search': {
            'search': '{tag[shortcuturl]} = {tag[href]} #{tag.string}',
        },
        'bookmark': {
            'bookmark': '{tag[href]} {tag.string}',
            'keyword': '{tag[href]} {tag.string}',
        },
        'quickmark': {
            'bookmark': '{tag.string} {tag[href]}',
            'keyword': '{tag[shortcuturl]} {tag[href]}',
        },
    }

    bookmarks = []
    # Rendered lines already emitted. Comparing the raw href against the
    # list of rendered lines never matched, so duplicates slipped through;
    # de-duplicating on the rendered line drops exact repeats only.
    seen = set()
    for typ in bookmark_types:
        for tag in soup.find_all(bookmark_query[typ]):
            if typ == 'search':
                # Search engine URLs use '{}' as the query placeholder.
                tag['href'] = tag['href'].replace('%s', '{}')
            line = output_template[output_format][typ].format(tag=tag)
            if line not in seen:
                seen.add(line)
                bookmarks.append(line)

    for bookmark in bookmarks:
        print(bookmark)
2014-10-15 06:16:12 +02:00
# Run the importer only when executed as a script, not when imported.
if __name__ == '__main__':
    main()