From 837d8a17172f226a6a09dd4ddd8914bfec2cb1aa Mon Sep 17 00:00:00 2001
From: rnhmjoj
Date: Sat, 4 Apr 2020 10:49:40 +0200
Subject: [PATCH] python: add elearning video downloader

---
Review note: the script below was edited after the patch was generated,
so the blob id on the "index" line no longer matches its content.

 python/elearning.py | 375 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 375 insertions(+)
 create mode 100755 python/elearning.py

diff --git a/python/elearning.py b/python/elearning.py
new file mode 100755
index 0000000..638c4c1
--- /dev/null
+++ b/python/elearning.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env nix-script
+#!>python3
+#! python3 | requests beautifulsoup4
+#! shell | ffmpeg
+#! env | EUSER EPASS
+
+import requests
+import subprocess
+import argparse
+import pathlib
+import getpass
+import json
+import sys
+import os
+
+from datetime import datetime
+from requests.utils import unquote, urlparse
+from bs4 import BeautifulSoup
+
+
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description='''
+    Download all video lessons from an elearning course.
+
+    The videos are taken at the original quality and encoded
+    using h.265 slow profile, 96kb/s opus for audio, via ffmpeg.
+
+    If authentication is required the EUSER,EPASS variables
+    are tried for logging in, otherwise they will be prompted.
+    Only Kaltura videos are supported (dual screen and captions
+    work, though).''',
+    epilog='''
+    Copyright (C) 2020 Michele Guerini Rocco (rnhmjoj)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+    ''')
+parser.add_argument('course_id', metavar='course-id', type=str,
+                    help='the id of the course to download. it can be found'
+                         ' at the end of the course homepage url')
+parser.add_argument('--skip', '-s', metavar='N', type=int,
+                    default=0, help='skip the first N links')
+parser.add_argument('--link-only', '-l', action='store_true',
+                    help='only print the links without downloading')
+parser.add_argument('--json', '-j', action='store_true',
+                    help='print the video metadata in JSON')
+parser.add_argument('--directory', '-d', metavar='DIR',
+                    type=str, default=pathlib.Path(),
+                    help='directory where to save the videos. defaults to'
+                         ' the current directory if not given')
+parser.add_argument('--ffmpeg', '-f', metavar='ARG',
+                    type=str, default=[], nargs='+',
+                    help='extra arguments to pass to ffmpeg')
+parser.add_argument('--base-url', metavar='URL', type=str,
+                    default='https://elearning.unimib.it',
+                    help='the base elearning website url')
+parser.add_argument('--auth-url', metavar='URL', type=str,
+                    default='https://idp-idm.unimib.it/idp/'
+                            'profile/SAML2/Redirect/SSO',
+                    help='the url of Shibboleth identity provider.'
+                         ' if you have no idea what it is, leave it')
+
+
+def printr(*args, **kwargs):
+    '''
+    Shorthand for print to the stderr.
+    '''
+    print(*args, **kwargs, file=sys.stderr)
+
+
+def inputr(prompt):
+    '''
+    Like input, but the prompt goes to the stderr.
+    '''
+    printr(prompt, end='')
+    return input()
+
+
+def getenv(var, fallback):
+    '''
+    Read an environment variable or
+    call a function for a default value.
+    '''
+    val = os.getenv(var)
+    if val is None:
+        return fallback()
+    return val
+
+
+def open_course(url, args):
+    '''
+    GET and parse the course page.
+    Also tames the legendary black beast of Shibboleth.
+
+    Returns the session used for the requests
+    and the parsed course page.
+    '''
+    session = requests.Session()
+
+    res = session.get(url)
+    page = BeautifulSoup(res.content, 'html.parser')
+    printr('done')
+
+    # do the authentication
+    if 'enrol' in res.url:
+        printr('\n# authentication needed.. sigh')
+        username = getenv('EUSER', lambda: inputr('username: '))
+        password = getenv('EPASS', lambda: getpass.getpass('password: '))
+
+        # elearning login request
+        key = page.find('input', attrs={'name': 'sesskey'})['value']
+        session.get(args.base_url + '/auth/unimibsaml/login.php',
+                    params=dict(wantsurl=url, sesskey=key))
+
+        # shibboleth auth request
+        res = session.post(args.auth_url, params=dict(execution='e1s1'),
+                           data=dict(_eventId_proceed=''))
+
+        # submit shibboleth login form
+        login_form = dict(j_username=username,
+                          j_password=password,
+                          _eventId_proceed='')
+        res = session.post(res.url, data=login_form)
+
+        if 'e1s3' in res.url:
+            printr('# authentication failed :(')
+            sys.exit(1)
+
+        # finally get the auth token
+        page = BeautifulSoup(res.content, 'html.parser')
+        form = page.find('form')
+        token = None
+        if form is not None:
+            token = form.find('input', attrs={'name': 'SAMLResponse'})
+        if token is None:
+            # no SAML form in the reply: the login did not go through
+            printr('# authentication failed :(')
+            sys.exit(1)
+        res = session.post(unquote(form['action']),
+                           data=dict(SAMLResponse=token['value']))
+        page = BeautifulSoup(res.content, 'html.parser')
+
+        printr('# done!\n')
+
+    return session, page
+
+
+def parse(url, session=requests):
+    '''
+    GET a url and parse the html response.
+    '''
+    res = session.get(url)
+    return BeautifulSoup(res.content, 'html.parser')
+
+
+def get_info(partner_id, entry_id):
+    '''
+    Downloads metadata information of the video
+    with 'entry_id' from the 'partner_id'.
+
+    Returns a list of entries: the requested one
+    first, then its child entries, if any.
+    '''
+    url = 'https://cdnapisec.kaltura.com/api_v3/index.php'
+
+    init = dict(
+        action='null',
+        apiVersion='3.1',
+        clientTag='kwidget:v2.80',
+        format=1,
+        service='multirequest')
+
+    # this returns a session key "ks"
+    # which is used in subsequent reqs.
+    session = dict(
+        expiry=86400,
+        service='session',
+        action='startWidgetSession',
+        widgetId=f'_{partner_id}')
+
+    # video metadata
+    info_parent = {
+        'action': 'get',
+        'entryId': entry_id,
+        'service': 'baseentry',
+        'ks': '{1:result:ks}'}
+
+    # child contains a secondary stream:
+    # it could be screen+webcam
+    info_child = {
+        'ks': '{1:result:ks}',
+        'service': 'baseEntry',
+        'action': 'list',
+        'filter:objectType': 'KalturaBaseEntryFilter',
+        'filter:typeEqual': 1,
+        'filter:parentEntryIdEqual': entry_id}
+
+    # join requests: the keys of the n-th one get an 'n:' prefix.
+    # work on a copy to leave the 'init' dict untouched
+    query = dict(init)
+    for n, part in enumerate([session, info_parent, info_child], start=1):
+        for key, value in part.items():
+            query[f'{n}:{key}'] = value
+
+    info_parent, info_child = requests.get(url, params=query).json()[1:]
+
+    info = [info_parent]
+    if info_child.get('totalCount', 0) > 0:
+        info += info_child['objects']
+
+    # strip html from description (which may be missing or null)
+    for entry in info:
+        html = entry.get('description') or ''
+        entry['description'] = BeautifulSoup(html, 'html.parser').get_text()
+    return info
+
+
+def extract_ids(page, partner_id=None):
+    '''
+    Given the player iframe page extracts the
+    'partner_id' and 'entry_id' of the video.
+    The partner id is only fetched if 'partner_id' is
+    None, this saves one http request per video.
+    '''
+    url = page.find(id='contentframe')['src']
+    query = urlparse(url).query
+    # split each pair at the first '=' only,
+    # the value itself may contain more of them
+    params = dict(p.partition('=')[::2] for p in query.split('&'))
+    source = unquote(params['source'])
+
+    settings = urlparse(source).path.split('/')
+    entry_id = settings[settings.index('entryid') + 1]
+
+    if partner_id is None:
+        iframe = parse(url)
+        partner_id = iframe.find(
+            'input', attrs=dict(name='oauth_consumer_key'))['value']
+
+    return partner_id, entry_id
+
+
+def save_video(infos, args):
+    '''
+    Download and convert the video
+    using ffmpeg and x265.
+
+    Every entry of 'infos' is an ffmpeg input; the
+    metadata and the filename come from the first one.
+    '''
+    urls = (i['downloadUrl'] for i in infos)
+    info = infos[0]
+
+    # use the description as a filename: slashes are replaced
+    # since they would be taken as directory separators
+    words = info['description'].replace('/', '-').split()
+    filename = '-'.join(w.lower() for w in words if w != '-')
+    if not filename:
+        # empty description, fall back to the entry id
+        filename = info['id']
+
+    # parse creation date
+    created = datetime.fromtimestamp(info['createdAt']).isoformat()
+
+    # create directory if necessary
+    directory = pathlib.Path(args.directory)
+    directory.mkdir(parents=True, exist_ok=True)
+
+    # the extension is appended by hand: with_suffix()
+    # would cut a title at its last dot
+    output = directory / (filename + '.mkv')
+
+    # create ffmpeg input args: all the streams of the first
+    # input are kept, only the video of the following ones
+    inputs, maps = [], []
+    for n, url in enumerate(urls):
+        inputs.extend(['-i', url])
+        maps.extend(['-map', str(n) + (':v' if n > 0 else '')])
+
+    ffmpeg = [
+        'ffmpeg', '-hide_banner',
+        '-loglevel', 'error',
+        '-stats', '-n',
+    ] + inputs + maps + args.ffmpeg + [
+        # video
+        '-c:v', 'libx265', '-preset', 'slow', '-crf', '23',
+        '-x265-params', 'log-level=error',
+        # audio
+        '-c:a', 'libopus', '-b:a', '96k',
+
+        # metadata
+        '-metadata', 'title=' + info['description'],
+        '-metadata', 'AUTHOR=' + info['userId'],
+        '-metadata', 'DATE=' + created,
+        '-metadata', 'IDS=' + ','.join(i['id'] for i in infos),
+
+        # output
+        str(output)
+    ]
+
+    minutes = int(info['duration'])/60
+    printr('# downloading "{}" - {:.1f}min'
+           .format(info['description'], minutes))
+    printr('# by {userId}, {views} views'.format_map(info))
+    res = subprocess.run(ffmpeg)
+    if res.returncode != 0:
+        printr('# ffmpeg exited with code', res.returncode)
+    printr()
+
+
+def main(args):
+    '''
+    Find the videos of the course and, depending on the
+    options, print their links/metadata or download them.
+    '''
+    course = ('{base_url}/course'
+              '/view.php?id={course_id}'.format_map(vars(args)))
+
+    printr('* opening course...', end='', flush=True)
+    session, page = open_course(course, args)
+
+    links = [li.find('a')['href']
+             for li in page.find_all('li', class_='kalvidres')]
+    printr('* {} videos found!\n'.format(len(links) or 'no'))
+
+    # a negative count would make the slice start from the end
+    skip = max(args.skip, 0)
+
+    partner = None
+    output = []
+    for i, link in enumerate(links[skip:], start=skip):
+        page = parse(link, session)
+
+        printr(f'{i+1}. fetching video metadata...', end='', flush=True)
+        partner, entry = extract_ids(page, partner)
+        info = get_info(partner, entry)
+        printr('done')
+
+        if args.link_only:
+            print('desc: {description}\n'
+                  'author: {userId}\n'
+                  'views: {views}'.format_map(info[0]))
+            if len(info) > 1:
+                print('dual video')
+                print('camera url:', info[0]['downloadUrl'])
+                print('screen url:', info[1]['downloadUrl'])
+            else:
+                print('url:', info[0]['downloadUrl'])
+            printr()
+        if args.json:
+            output.append(info)
+        # download only when no listing was asked for
+        if not (args.link_only or args.json):
+            save_video(info, args)
+
+    if args.json:
+        print(json.dumps(output))
+
+
+if __name__ == '__main__':
+    try:
+        main(parser.parse_args())
+    except KeyboardInterrupt:
+        printr('\nbye!')