#!/usr/bin/env nix-script
#!>python3
#! python3 | requests beautifulsoup4
#! shell | ffmpeg
#! env | EUSER EPASS
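
# nix-script header: the directives above (presumably) select the
# interpreter and declare the python dependencies (requests,
# beautifulsoup4), runtime programs (ffmpeg) and environment
# variables (EUSER, EPASS) used by this script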
import requests
import subprocess
import argparse
import pathlib
import getpass
import json
import sys
import os

from datetime import datetime
from urllib.parse import unquote, urlparse
from bs4 import BeautifulSoup


parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''
Download all video lessons from an elearning course.

The videos are taken at the original quality and encoded
using h.265 slow profile, 96kb/s opus for audio, via ffmpeg.

If authentication is required the EUSER,EPASS variables
are tried for logging in, otherwise they will be prompted.
Only Kaltura videos are supported (dual screen and captions
work, though).''',
    epilog='''
Copyright (C) 2020 Michele Guerini Rocco (rnhmjoj)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
''')
parser.add_argument('course_id', metavar='course-id', type=str,
                    help='the id of the course to download. it can be found'
                         ' at the end of the course homepage url')
parser.add_argument('--skip', '-s', metavar='N', type=int,
                    default=0, help='skip the first N links')
parser.add_argument('--link-only', '-l', action='store_true',
                    help='only print the links without downloading')
parser.add_argument('--json', '-j', action='store_true',
                    help='print the video metadata in JSON')
parser.add_argument('--directory', '-d', metavar='DIR',
                    type=str, default=pathlib.Path(),
                    help='directory where to save the videos. defaults to'
                         ' the current directory if not given')
parser.add_argument('--ffmpeg', '-f', metavar='ARG',
                    type=str, default=[], nargs='+',
                    help='extra arguments to pass to ffmpeg')
parser.add_argument('--base-url', metavar='URL', type=str,
                    default='https://elearning.unimib.it',
                    help='the base elearning website url')
parser.add_argument('--auth-url', metavar='URL', type=str,
                    default='https://idp-idm.unimib.it/idp/'
                            'profile/SAML2/Redirect/SSO',
                    help='the url of the Shibboleth identity provider.'
                         ' if you have no idea what it is, leave it')


def printr(*args, **kwargs):
    '''
    Shorthand for print to stderr.
    '''
    print(*args, **kwargs, file=sys.stderr)


def inputr(prompt):
    '''
    Print a prompt to stderr and read a line from stdin.
    '''
    printr(prompt, end='')
    return input()


def getenv(var, fallback):
    '''
    Read an environment variable, or call a
    function to compute a default value; e.g.
    getenv('EUSER', lambda: inputr('username: ')).
    '''
    val = os.getenv(var)
    if val is None:
        return fallback()
    return val


def open_course(url, args):
    '''
    GET and parse the course page.
    Also tames the legendary black beast of Shibboleth.
    '''
    session = requests.Session()

    res = session.get(url)
    page = BeautifulSoup(res.content, 'html.parser')
    printr('done')

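    # the login below follows the SAML web SSO flow: moodle redirects
    # to the Shibboleth idp, we submit the login form, then post the
    # signed SAMLResponse back to moodle. note that the execution ids
    # (e1s1, e1s3) are specific to this idp and may change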
    # do the authentication
    if 'enrol' in res.url:
        printr('\n# authentication needed.. sigh')
        username = getenv('EUSER', lambda: inputr('username: '))
        password = getenv('EPASS', lambda: getpass.getpass('password: '))

        # elearning login request
        key = page.find('input', attrs={'name': 'sesskey'})['value']
        res = session.get(args.base_url + '/auth/unimibsaml/login.php',
                          params=dict(wantsurl=url, sesskey=key))

        # shibboleth auth request
        page = BeautifulSoup(res.content, 'html.parser')
        res = session.post(args.auth_url, params=dict(execution='e1s1'),
                           data=dict(_eventId_proceed=''))

        # submit shibboleth login form
        login_form = dict(j_username=username,
                          j_password=password,
                          _eventId_proceed='')
        res = session.post(res.url, data=login_form)

        if 'e1s3' in res.url:
            printr('# authentication failed :(')
            exit(1)

        # finally get the auth token
        page = BeautifulSoup(res.content, 'html.parser')
        form = page.find('form')
        resp = form.find('input', attrs={'name': 'SAMLResponse'})['value']
        res = session.post(unquote(form['action']),
                           data=dict(SAMLResponse=resp))
        page = BeautifulSoup(res.content, 'html.parser')

        printr('# done!\n')

    return session, page


def parse(url, session=requests):
    '''
    GET a url and parse the html response.
    '''
    res = session.get(url)
    return BeautifulSoup(res.content, 'html.parser')


def get_info(partner_id, entry_id):
    '''
    Download the metadata of the video
    with 'entry_id' from the 'partner_id'.
    '''
    url = 'https://cdnapisec.kaltura.com/api_v3/index.php'

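    # the Kaltura api can bundle several calls in one 'multirequest':
    # parameters prefixed with '1:', '2:', ... belong to the n-th call
    # and '{1:result:ks}' references a field of the first call's result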
    init = dict(
        action='null',
        apiVersion='3.1',
        clientTag='kwidget:v2.80',
        format=1,
        service='multirequest')

    # this returns a session key "ks"
    # which is used in subsequent requests
    session = dict(
        expiry=86400,
        service='session',
        action='startWidgetSession',
        widgetId=f'_{partner_id}')

    # video metadata
    info_parent = {
        'action': 'get',
        'entryId': entry_id,
        'service': 'baseentry',
        'ks': '{1:result:ks}'}

    # child contains a secondary stream:
    # it could be screen+webcam
    info_child = {
        'ks': '{1:result:ks}',
        'service': 'baseEntry',
        'action': 'list',
        'filter:objectType': 'KalturaBaseEntryFilter',
        'filter:typeEqual': 1,
        'filter:parentEntryIdEqual': entry_id}

    # join requests
    query = init
    for i, a in enumerate([session, info_parent, info_child], start=1):
        for k, v in a.items():
            query['%d:%s' % (i, k)] = v

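    # e.g. the flattened query now contains '1:service=session',
    # '2:entryId=<entry_id>' and '3:filter:parentEntryIdEqual=<entry_id>'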
    info_parent, info_child = requests.get(url, params=query).json()[1:]

    info = [info_parent]
    if info_child['totalCount'] > 0:
        info += info_child['objects']

    # strip html from description
    for i in info:
        text = BeautifulSoup(i['description'], 'html.parser').get_text()
        i['description'] = text
    return info


def extract_ids(page, partner_id=None):
    '''
    Given the player iframe page, extract the
    'partner_id' and 'entry_id' of the video.
    The partner id is only fetched if 'partner_id' is
    None; this saves one http request per video.
    '''
    url = page.find(id='contentframe')['src']
    query = urlparse(url).query
    params = dict(i.split('=') for i in query.split('&'))
    source = unquote(params['source'])

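    # 'source' embeds the entry id in its path; a made-up example:
    #   https://<host>/browseandembed/index/media/entryid/0_abc123/...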
    settings = urlparse(source).path.split('/')
    entry_id = settings[settings.index('entryid') + 1]

    if partner_id is None:
        iframe = parse(url)
        partner_id = iframe.find(
            'input', attrs=dict(name='oauth_consumer_key'))['value']

    return partner_id, entry_id


def save_video(infos, args):
    '''
    Download and convert the video
    using ffmpeg and x265.
    '''
    urls = (i['downloadUrl'] for i in infos)
    info = infos[0]

    # use the description as a filename
    title = []
    for word in info['description'].split():
        if word != '-':
            title.append(word.lower())
    filename = '-'.join(title)

    # parse creation date
    date = datetime.fromtimestamp(info['createdAt'])
    info['createdAt'] = date.isoformat()

    # create directory if necessary
    dir = pathlib.Path(args.directory)
    dir.mkdir(parents=True, exist_ok=True)

    # create ffmpeg input args
    inputs, maps = [], []
    for i, url in enumerate(urls):
        inputs.extend(['-i', url])
        maps.extend(['-map', str(i) + (':v' if i > 0 else '')])

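    # for a single video the final command looks roughly like:
    #   ffmpeg -hide_banner -loglevel error -stats -n -i <url> -map 0
    #   -c:v libx265 ... -metadata title=<description> ... <name>.mkv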
    ffmpeg = [
        'ffmpeg', '-hide_banner',
        '-loglevel', 'error',
        '-stats', '-n',
    ] + inputs + maps + args.ffmpeg + [
        # video
        '-c:v', 'libx265', '-preset', 'slow', '-crf', '23',
        '-x265-params', 'log-level=error',
        # audio
        '-c:a', 'libopus', '-b:a', '96k',

        # metadata
        '-metadata', 'title=' + info['description'],
        '-metadata', 'AUTHOR=' + info['userId'],
        '-metadata', 'DATE=' + info['createdAt'],
        '-metadata', 'IDS=' + ','.join(i['id'] for i in infos),

        # output
        (dir / filename).with_suffix('.mkv')
    ]

    info['duration'] = int(info['duration'])/60
    printr('# downloading "{description}" '
           '- {duration:.1f}min'.format_map(info))
    printr('# by {userId}, {views} views'.format_map(info))
    subprocess.run(ffmpeg)
    printr()


def main(args):
    course = ('{base_url}/course'
              '/view.php?id={course_id}'.format_map(vars(args)))

    printr('* opening course...', end='', flush=True)
    session, page = open_course(course, args)

    links = []
    for li in page.find_all('li', class_='kalvidres'):
        links.append(li.find('a')['href'])
    printr('* {} videos found!\n'.format(len(links) or 'no'))

    partner = None
    output = []
    for i, link in enumerate(links[args.skip:], start=args.skip):
        page = parse(link, session)

        printr(f'{i+1}. fetching video metadata...', end='', flush=True)
        partner, entry = extract_ids(page, partner)
        info = get_info(partner, entry)
        printr('done')

        if args.link_only:
            print('desc: {description}\n'
                  'author: {userId}\n'
                  'views: {views}'.format_map(info[0]))
            if len(info) > 1:
                print('dual video')
                print('camera url:', info[0]['downloadUrl'])
                print('screen url:', info[1]['downloadUrl'])
            else:
                print('url:', info[0]['downloadUrl'])
            printr()
        if args.json:
            output.append(info)
        elif not args.link_only:
            # with --link-only we never download
            save_video(info, args)

    if args.json:
        print(json.dumps(output))


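# example invocations (the script name and course id are made up):
#   ./elearning-dl.py 12345 --link-only
#   EUSER=alice EPASS=secret ./elearning-dl.py 12345 --skip 3 -d lessons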
if __name__ == '__main__':
    try:
        main(parser.parse_args())
    except KeyboardInterrupt:
        printr('\nbye!')