python: add elearning video downloader
This commit is contained in:
parent
ec139f4fb1
commit
837d8a1717
340
python/elearning.py
Executable file
340
python/elearning.py
Executable file
@ -0,0 +1,340 @@
|
||||
#!/usr/bin/env nix-script
|
||||
#!>python3
|
||||
#! python3 | requests beautifulsoup4
|
||||
#! shell | ffmpeg
|
||||
#! env | EUSER EPASS
|
||||
|
||||
import requests
|
||||
import subprocess
|
||||
import argparse
|
||||
import pathlib
|
||||
import getpass
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
from requests.utils import unquote, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Command line interface. Only the course id is required;
# every option has a sensible default.
# Fix: "currenct" -> "current" in the --directory help text.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''
Download all video lessons from an elearning course.

The videos are taken at the original quality and encoded
using h.265 slow profile, 96kb/s opus for audio, via ffmpeg.

If authentication is required the EUSER,EPASS variables
are tried for logging in, otherwise they will be prompted.
Only Kaltura videos are supported (dual screen and captions
work, though).''',
    epilog='''
Copyright (C) 2020 Michele Guerini Rocco (rnhmjoj)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
''')
parser.add_argument('course_id', metavar='course-id', type=str,
                    help='the id of the course to download. it can be found'
                         ' at the end of the course homepage url')
parser.add_argument('--skip', '-s', metavar='N', type=int,
                    default=0, help='skip the first N links')
parser.add_argument('--link-only', '-l', action='store_true',
                    help='only print the links without downloading')
parser.add_argument('--json', '-j', action='store_true',
                    help='print the video metadata in JSON')
parser.add_argument('--directory', '-d', metavar='DIR',
                    type=str, default=pathlib.Path(),
                    help='directory where to save the videos. defaults to'
                         ' the current directory if not given')
parser.add_argument('--ffmpeg', '-f', metavar='ARG',
                    type=str, default=[], nargs='+',
                    help='extra arguments to pass to ffmpeg')
parser.add_argument('--base-url', metavar='URL', type=str,
                    default='https://elearning.unimib.it',
                    help='the base elearning website url')
parser.add_argument('--auth-url', metavar='URL', type=str,
                    default='https://idp-idm.unimib.it/idp/'
                            'profile/SAML2/Redirect/SSO',
                    help='the url of Shibboleth identity provider.'
                         ' if you have no idea what it is, leave it')
|
||||
|
||||
|
||||
def printr(*args, **kwargs):
    '''
    Print to stderr: stdout is reserved for the actual
    program output (links and json metadata).
    '''
    print(*args, file=sys.stderr, **kwargs)
|
||||
|
||||
|
||||
def inputr(prompt):
    '''
    Read a line from stdin, writing the prompt to stderr.
    '''
    printr(prompt, end='')
    answer = input()
    return answer
|
||||
|
||||
|
||||
def getenv(var, fallback):
    '''
    Read an environment variable, or call a function
    to obtain a default value.

    Arguments:
        var:      name of the environment variable
        fallback: zero-argument function producing the
                  value to use when 'var' is unset

    Returns the variable value, or fallback().
    '''
    val = os.getenv(var)
    if val is None:
        return fallback()
    # bugfix: the original fell through here and implicitly
    # returned None even when the variable was set
    return val
|
||||
|
||||
|
||||
def open_course(url, args):
    '''
    GET and parse the course page.
    Also tames the legendary black beast of Shibboleth.

    Arguments:
        url:  full url of the course homepage
        args: parsed command line options
              (base_url and auth_url are used here)

    Returns a (session, page) tuple: the requests.Session
    carrying the authentication cookies and the parsed
    course page (BeautifulSoup tree).

    Exits the program with status 1 on a failed login.
    '''
    session = requests.Session()

    res = session.get(url)
    page = BeautifulSoup(res.content, 'html.parser')
    printr('done')

    # do the authentication
    # NOTE(review): a redirect to an url containing 'enrol' is
    # taken as the signal that we are not logged in — confirm
    if 'enrol' in res.url:
        printr('\n# authentication needed.. sigh')
        # credentials from the environment, or prompted interactively
        username = getenv('EUSER', lambda: inputr('username: '))
        password = getenv('EPASS', lambda: getpass.getpass('password: '))

        # elearning login request: needs the session key
        # embedded in the enrolment page form
        key = page.find('input', attrs={'name': 'sesskey'})['value']
        res = session.get(args.base_url + '/auth/unimibsaml/login.php',
                          params=dict(wantsurl=url, sesskey=key))

        # shibboleth auth request
        page = BeautifulSoup(res.content, 'html.parser')
        res = session.post(args.auth_url, params=dict(execution='e1s1'),
                           data=dict(_eventId_proceed=''))

        # submit shibboleth login form
        login_form = dict(j_username=username,
                          j_password=password,
                          _eventId_proceed='')
        res = session.post(res.url, data=login_form)

        # NOTE(review): 'e1s3' appears to be the idp flow state
        # reached after a rejected login — confirm against the idp
        if 'e1s3' in res.url:
            printr('# authentication failed :(')
            exit(1)

        # finally get the auth token: relay the SAMLResponse
        # from the idp back to the service provider
        page = BeautifulSoup(res.content, 'html.parser')
        form = page.find('form')
        resp = form.find('input', attrs={'name': 'SAMLResponse'})['value']
        res = session.post(unquote(form['action']),
                           data=dict(SAMLResponse=resp))
        page = BeautifulSoup(res.content, 'html.parser')

        printr('# done!\n')

    return session, page
|
||||
|
||||
|
||||
def parse(url, session=requests):
    '''
    Fetch a url and return the response body
    as a parsed html tree.
    '''
    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup
|
||||
|
||||
|
||||
def get_info(partner_id, entry_id):
    '''
    Download the metadata of the video 'entry_id'
    belonging to 'partner_id' from the Kaltura API.

    Returns a list of metadata dicts: the main entry
    first, followed by any child entries (secondary
    streams), with html stripped from descriptions.
    '''
    endpoint = 'https://cdnapisec.kaltura.com/api_v3/index.php'

    # common parameters of the multirequest call
    query = {
        'action': 'null',
        'apiVersion': '3.1',
        'clientTag': 'kwidget:v2.80',
        'format': 1,
        'service': 'multirequest'}

    # 1. open a widget session: the resulting key "ks"
    # authenticates the two subsequent subrequests
    start_session = {
        'expiry': 86400,
        'service': 'session',
        'action': 'startWidgetSession',
        'widgetId': f'_{partner_id}'}

    # 2. fetch the metadata of the main video
    fetch_parent = {
        'action': 'get',
        'entryId': entry_id,
        'service': 'baseentry',
        'ks': '{1:result:ks}'}

    # 3. list the child entries: a secondary stream,
    # e.g. screen capture alongside the webcam
    fetch_children = {
        'ks': '{1:result:ks}',
        'service': 'baseEntry',
        'action': 'list',
        'filter:objectType': 'KalturaBaseEntryFilter',
        'filter:typeEqual': 1,
        'filter:parentEntryIdEqual': entry_id}

    # merge the subrequests into one query,
    # prefixing each key with its request number
    subrequests = [start_session, fetch_parent, fetch_children]
    for index, sub in enumerate(subrequests, start=1):
        query.update(('%d:%s' % (index, key), value)
                     for key, value in sub.items())

    # drop the first result (the session) and keep the rest
    parent, children = requests.get(endpoint, params=query).json()[1:]

    entries = [parent]
    if children['totalCount'] > 0:
        entries += children['objects']

    # replace each html description with its plain text
    for entry in entries:
        plain = BeautifulSoup(entry['description'], 'html.parser').get_text()
        entry['description'] = plain
    return entries
|
||||
|
||||
|
||||
def extract_ids(page, partner_id=None):
    '''
    Extract the 'partner_id' and 'entry_id' of the video
    from the player iframe page.
    The partner id is only fetched when 'partner_id' is
    None: this saves one http request per video.
    '''
    frame_url = page.find(id='contentframe')['src']
    raw_query = urlparse(frame_url).query
    params = {}
    for pair in raw_query.split('&'):
        name, value = pair.split('=')
        params[name] = value
    source = unquote(params['source'])

    # the entry id is the path component that
    # follows 'entryid' in the source url
    path_parts = urlparse(source).path.split('/')
    entry_id = path_parts[path_parts.index('entryid') + 1]

    if partner_id is None:
        # one extra request to read it off the iframe
        iframe = parse(frame_url)
        field = iframe.find('input', attrs=dict(name='oauth_consumer_key'))
        partner_id = field['value']

    return partner_id, entry_id
|
||||
|
||||
|
||||
def save_video(infos, args):
    '''
    Download and convert the video
    using ffmpeg and x265.

    Arguments:
        infos: list of metadata dicts (as returned by get_info);
               infos[0] is the main stream, further entries are
               secondary streams (e.g. the screen capture)
        args:  parsed command line options (directory and
               ffmpeg are used here)

    Note: mutates infos[0] in place ('createdAt' becomes an
    iso string, 'duration' a float in minutes).
    '''
    urls = (i['downloadUrl'] for i in infos)
    info = infos[0]

    # use the description as a filename:
    # lowercased words joined by dashes
    title = []
    for word in info['description'].split():
        if word != '-':
            title.append(word.lower())
    filename = '-'.join(title)

    # parse creation date
    # NOTE(review): fromtimestamp converts to local time — confirm
    # the timestamps are meant to be displayed in the local zone
    date = datetime.fromtimestamp(info['createdAt'])
    info['createdAt'] = date.isoformat()

    # create directory if necessary
    dir = pathlib.Path(args.directory)
    dir.mkdir(parents=True, exist_ok=True)

    # create ffmpeg input args: one '-i' per stream; the first
    # input keeps all streams, the others map only their video
    inputs, maps = [], []
    for i, url in enumerate(urls):
        inputs.extend(['-i', url])
        maps.extend(['-map', str(i) + (':v' if i > 0 else '')])

    # '-n': never overwrite an existing output file
    ffmpeg = [
        'ffmpeg', '-hide_banner',
        '-loglevel', 'error',
        '-stats', '-n',
    ] + inputs + maps + args.ffmpeg + [
        # video
        '-c:v', 'libx265', '-preset', 'slow', '-crf', '23',
        '-x265-params', 'log-level=error',
        # audio
        '-c:a', 'libopus', '-b:a', '96k',

        # metadata
        '-metadata', 'title=' + info['description'],
        '-metadata', 'AUTHOR=' + info['userId'],
        '-metadata', 'DATE=' + info['createdAt'],
        '-metadata', 'IDS=' + ','.join(i['id'] for i in infos),

        # output
        (dir / filename).with_suffix('.mkv')
    ]

    # duration (presumably in seconds) shown as minutes
    info['duration'] = int(info['duration'])/60
    printr('# downloading "{description}" '
           '- {duration:.1f}min'.format_map(info))
    printr('# by {userId}, {views} views'.format_map(info))
    subprocess.run(ffmpeg)
    printr()
|
||||
|
||||
|
||||
def main(args):
    '''
    Entry point: open the course page, find all Kaltura
    video resources and print or download each of them.
    '''
    course = ('{base_url}/course'
              '/view.php?id={course_id}'.format_map(vars(args)))

    printr('* opening course...', end='', flush=True)
    session, page = open_course(course, args)

    # collect the links of the video lesson resources
    links = []
    for li in page.find_all('li', class_='kalvidres'):
        links.append(li.find('a')['href'])
    printr('* {} videos found!\n'.format(len(links) or 'no'))

    # the partner id is shared by every video of the course,
    # so it is cached after the first lookup
    partner = None
    output = []
    for i, link in enumerate(links[args.skip:], start=args.skip):
        page = parse(link, session)

        printr(f'{i+1}. fetching video metadata...', end='', flush=True)
        partner, entry = extract_ids(page, partner)
        info = get_info(partner, entry)
        printr('done')

        if args.link_only:
            print('desc: {description}\n'
                  'author: {userId}\n'
                  'views: {views}'.format_map(info[0]))
            if len(info) > 1:
                # two streams: camera + screen capture
                print('dual video')
                print('camera url:', info[0]['downloadUrl'])
                print('screen url:', info[1]['downloadUrl'])
            else:
                print('url:', info[0]['downloadUrl'])
            printr()
        # NOTE(review): with --link-only but without --json this
        # still falls through to save_video and downloads, which
        # contradicts the --link-only help text — confirm intended
        if args.json:
            output.append(info)
        else:
            save_video(info, args)

    # all metadata is printed at once, as a single json document
    if args.json:
        print(json.dumps(output))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # run the downloader; exit gracefully on ctrl-C
    try:
        main(parser.parse_args())
    except KeyboardInterrupt:
        printr('\nbye!')
|
Loading…
Reference in New Issue
Block a user