python: add elearning video downloader
This commit is contained in:
parent
ec139f4fb1
commit
837d8a1717
340
python/elearning.py
Executable file
340
python/elearning.py
Executable file
@ -0,0 +1,340 @@
|
||||
#!/usr/bin/env nix-script
|
||||
#!>python3
|
||||
#! python3 | requests beautifulsoup4
|
||||
#! shell | ffmpeg
|
||||
#! env | EUSER EPASS
|
||||
|
||||
import requests
|
||||
import subprocess
|
||||
import argparse
|
||||
import pathlib
|
||||
import getpass
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
from requests.utils import unquote, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Command line interface. Only the course id is required;
# every option has a sensible default.
# Fix: "currenct" -> "current" in the --directory help text.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='''
Download all video lessons from an elearning course.

The videos are taken at the original quality and encoded
using h.265 slow profile, 96kb/s opus for audio, via ffmpeg.

If authentication is required the EUSER,EPASS variables
are tried for logging in, otherwise they will be prompted.
Only Kaltura videos are supported (dual screen and captions
work, though).''',
    epilog='''
Copyright (C) 2020 Michele Guerini Rocco (rnhmjoj)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
''')
parser.add_argument('course_id', metavar='course-id', type=str,
                    help='the id of the course to download. it can be found'
                         ' at the end of the course homepage url')
parser.add_argument('--skip', '-s', metavar='N', type=int,
                    default=0, help='skip the first N links')
parser.add_argument('--link-only', '-l', action='store_true',
                    help='only print the links without downloading')
parser.add_argument('--json', '-j', action='store_true',
                    help='print the video metadata in JSON')
parser.add_argument('--directory', '-d', metavar='DIR',
                    type=str, default=pathlib.Path(),
                    help='directory where to save the videos. defaults to'
                         ' the current directory if not given')
parser.add_argument('--ffmpeg', '-f', metavar='ARG',
                    type=str, default=[], nargs='+',
                    help='extra arguments to pass to ffmpeg')
parser.add_argument('--base-url', metavar='URL', type=str,
                    default='https://elearning.unimib.it',
                    help='the base elearning website url')
parser.add_argument('--auth-url', metavar='URL', type=str,
                    default='https://idp-idm.unimib.it/idp/'
                            'profile/SAML2/Redirect/SSO',
                    help='the url of Shibboleth identity provider.'
                         ' if you have no idea what it is, leave it')
|
||||
|
||||
|
||||
def printr(*args, **kwargs):
    '''
    Print to stderr: stdout is reserved for the actual
    program output (links and json metadata).
    '''
    print(*args, file=sys.stderr, **kwargs)
|
||||
|
||||
|
||||
def inputr(prompt):
    '''
    Read a line from stdin, writing the prompt to stderr.
    '''
    printr(prompt, end='')
    answer = input()
    return answer
|
||||
|
||||
|
||||
def getenv(var, fallback):
    '''
    Read an environment variable, or call a function
    to obtain a default value.

    Arguments:
        var:      name of the environment variable
        fallback: zero-argument function producing the
                  value to use when 'var' is unset

    Returns the variable value, or fallback().
    '''
    val = os.getenv(var)
    if val is None:
        return fallback()
    # bugfix: the original fell through here and implicitly
    # returned None even when the variable was set
    return val
|
||||
|
||||
|
||||
def open_course(url, args):
    '''
    GET and parse the course page.
    Also tames the legendary black beast of Shibboleth.

    Arguments:
        url:  full url of the course homepage
        args: parsed command line options
              (base_url and auth_url are used here)

    Returns a (session, page) tuple: the requests.Session
    carrying the authentication cookies and the parsed
    course page (BeautifulSoup tree).

    Exits the program with status 1 on a failed login.
    '''
    session = requests.Session()

    res = session.get(url)
    page = BeautifulSoup(res.content, 'html.parser')
    printr('done')

    # do the authentication
    # NOTE(review): a redirect to an url containing 'enrol' is
    # taken as the signal that we are not logged in — confirm
    if 'enrol' in res.url:
        printr('\n# authentication needed.. sigh')
        # credentials from the environment, or prompted interactively
        username = getenv('EUSER', lambda: inputr('username: '))
        password = getenv('EPASS', lambda: getpass.getpass('password: '))

        # elearning login request: needs the session key
        # embedded in the enrolment page form
        key = page.find('input', attrs={'name': 'sesskey'})['value']
        res = session.get(args.base_url + '/auth/unimibsaml/login.php',
                          params=dict(wantsurl=url, sesskey=key))

        # shibboleth auth request
        page = BeautifulSoup(res.content, 'html.parser')
        res = session.post(args.auth_url, params=dict(execution='e1s1'),
                           data=dict(_eventId_proceed=''))

        # submit shibboleth login form
        login_form = dict(j_username=username,
                          j_password=password,
                          _eventId_proceed='')
        res = session.post(res.url, data=login_form)

        # NOTE(review): 'e1s3' appears to be the idp flow state
        # reached after a rejected login — confirm against the idp
        if 'e1s3' in res.url:
            printr('# authentication failed :(')
            exit(1)

        # finally get the auth token: relay the SAMLResponse
        # from the idp back to the service provider
        page = BeautifulSoup(res.content, 'html.parser')
        form = page.find('form')
        resp = form.find('input', attrs={'name': 'SAMLResponse'})['value']
        res = session.post(unquote(form['action']),
                           data=dict(SAMLResponse=resp))
        page = BeautifulSoup(res.content, 'html.parser')

        printr('# done!\n')

    return session, page
|
||||
|
||||
|
||||
def parse(url, session=requests):
    '''
    Fetch a url and return the response body
    as a parsed html tree.
    '''
    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup
|
||||
|
||||
|
||||
def get_info(partner_id, entry_id):
    '''
    Download the metadata of the video 'entry_id'
    belonging to 'partner_id' from the Kaltura API.

    Returns a list of metadata dicts: the main entry
    first, followed by any child entries (secondary
    streams), with html stripped from descriptions.
    '''
    endpoint = 'https://cdnapisec.kaltura.com/api_v3/index.php'

    # common parameters of the multirequest call
    query = {
        'action': 'null',
        'apiVersion': '3.1',
        'clientTag': 'kwidget:v2.80',
        'format': 1,
        'service': 'multirequest'}

    # 1. open a widget session: the resulting key "ks"
    # authenticates the two subsequent subrequests
    start_session = {
        'expiry': 86400,
        'service': 'session',
        'action': 'startWidgetSession',
        'widgetId': f'_{partner_id}'}

    # 2. fetch the metadata of the main video
    fetch_parent = {
        'action': 'get',
        'entryId': entry_id,
        'service': 'baseentry',
        'ks': '{1:result:ks}'}

    # 3. list the child entries: a secondary stream,
    # e.g. screen capture alongside the webcam
    fetch_children = {
        'ks': '{1:result:ks}',
        'service': 'baseEntry',
        'action': 'list',
        'filter:objectType': 'KalturaBaseEntryFilter',
        'filter:typeEqual': 1,
        'filter:parentEntryIdEqual': entry_id}

    # merge the subrequests into one query,
    # prefixing each key with its request number
    subrequests = [start_session, fetch_parent, fetch_children]
    for index, sub in enumerate(subrequests, start=1):
        query.update(('%d:%s' % (index, key), value)
                     for key, value in sub.items())

    # drop the first result (the session) and keep the rest
    parent, children = requests.get(endpoint, params=query).json()[1:]

    entries = [parent]
    if children['totalCount'] > 0:
        entries += children['objects']

    # replace each html description with its plain text
    for entry in entries:
        plain = BeautifulSoup(entry['description'], 'html.parser').get_text()
        entry['description'] = plain
    return entries
|
||||
|
||||
|
||||
def extract_ids(page, partner_id=None):
    '''
    Extract the 'partner_id' and 'entry_id' of the video
    from the player iframe page.
    The partner id is only fetched when 'partner_id' is
    None: this saves one http request per video.
    '''
    frame_url = page.find(id='contentframe')['src']
    raw_query = urlparse(frame_url).query
    params = {}
    for pair in raw_query.split('&'):
        name, value = pair.split('=')
        params[name] = value
    source = unquote(params['source'])

    # the entry id is the path component that
    # follows 'entryid' in the source url
    path_parts = urlparse(source).path.split('/')
    entry_id = path_parts[path_parts.index('entryid') + 1]

    if partner_id is None:
        # one extra request to read it off the iframe
        iframe = parse(frame_url)
        field = iframe.find('input', attrs=dict(name='oauth_consumer_key'))
        partner_id = field['value']

    return partner_id, entry_id
|
||||
|
||||
|
||||
def save_video(infos, args):
    '''
    Download and convert the video
    using ffmpeg and x265.

    Arguments:
        infos: list of metadata dicts (as returned by get_info);
               infos[0] is the main stream, further entries are
               secondary streams (e.g. the screen capture)
        args:  parsed command line options (directory and
               ffmpeg are used here)

    Note: mutates infos[0] in place ('createdAt' becomes an
    iso string, 'duration' a float in minutes).
    '''
    urls = (i['downloadUrl'] for i in infos)
    info = infos[0]

    # use the description as a filename:
    # lowercased words joined by dashes
    title = []
    for word in info['description'].split():
        if word != '-':
            title.append(word.lower())
    filename = '-'.join(title)

    # parse creation date
    # NOTE(review): fromtimestamp converts to local time — confirm
    # the timestamps are meant to be displayed in the local zone
    date = datetime.fromtimestamp(info['createdAt'])
    info['createdAt'] = date.isoformat()

    # create directory if necessary
    dir = pathlib.Path(args.directory)
    dir.mkdir(parents=True, exist_ok=True)

    # create ffmpeg input args: one '-i' per stream; the first
    # input keeps all streams, the others map only their video
    inputs, maps = [], []
    for i, url in enumerate(urls):
        inputs.extend(['-i', url])
        maps.extend(['-map', str(i) + (':v' if i > 0 else '')])

    # '-n': never overwrite an existing output file
    ffmpeg = [
        'ffmpeg', '-hide_banner',
        '-loglevel', 'error',
        '-stats', '-n',
    ] + inputs + maps + args.ffmpeg + [
        # video
        '-c:v', 'libx265', '-preset', 'slow', '-crf', '23',
        '-x265-params', 'log-level=error',
        # audio
        '-c:a', 'libopus', '-b:a', '96k',

        # metadata
        '-metadata', 'title=' + info['description'],
        '-metadata', 'AUTHOR=' + info['userId'],
        '-metadata', 'DATE=' + info['createdAt'],
        '-metadata', 'IDS=' + ','.join(i['id'] for i in infos),

        # output
        (dir / filename).with_suffix('.mkv')
    ]

    # duration (presumably in seconds) shown as minutes
    info['duration'] = int(info['duration'])/60
    printr('# downloading "{description}" '
           '- {duration:.1f}min'.format_map(info))
    printr('# by {userId}, {views} views'.format_map(info))
    subprocess.run(ffmpeg)
    printr()
|
||||
|
||||
|
||||
def main(args):
    '''
    Entry point: open the course page, find all Kaltura
    video resources and print or download each of them.
    '''
    course = ('{base_url}/course'
              '/view.php?id={course_id}'.format_map(vars(args)))

    printr('* opening course...', end='', flush=True)
    session, page = open_course(course, args)

    # collect the links of the video lesson resources
    links = []
    for li in page.find_all('li', class_='kalvidres'):
        links.append(li.find('a')['href'])
    printr('* {} videos found!\n'.format(len(links) or 'no'))

    # the partner id is shared by every video of the course,
    # so it is cached after the first lookup
    partner = None
    output = []
    for i, link in enumerate(links[args.skip:], start=args.skip):
        page = parse(link, session)

        printr(f'{i+1}. fetching video metadata...', end='', flush=True)
        partner, entry = extract_ids(page, partner)
        info = get_info(partner, entry)
        printr('done')

        if args.link_only:
            print('desc: {description}\n'
                  'author: {userId}\n'
                  'views: {views}'.format_map(info[0]))
            if len(info) > 1:
                # two streams: camera + screen capture
                print('dual video')
                print('camera url:', info[0]['downloadUrl'])
                print('screen url:', info[1]['downloadUrl'])
            else:
                print('url:', info[0]['downloadUrl'])
            printr()
        # NOTE(review): with --link-only but without --json this
        # still falls through to save_video and downloads, which
        # contradicts the --link-only help text — confirm intended
        if args.json:
            output.append(info)
        else:
            save_video(info, args)

    # all metadata is printed at once, as a single json document
    if args.json:
        print(json.dumps(output))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # run the downloader; exit gracefully on ctrl-C
    try:
        main(parser.parse_args())
    except KeyboardInterrupt:
        printr('\nbye!')
|
Loading…
Reference in New Issue
Block a user