From 1d37bced63a1a6b96e284180da27124c1a7ee9e8 Mon Sep 17 00:00:00 2001
From: Florent VIOLLEAU
Date: Mon, 20 Apr 2020 01:59:35 +0200
Subject: [PATCH 1/2] [Fun Mooc] Fix login issue and add quality selection

---
 edx_dl/common.py  |   1 +
 edx_dl/edx_dl.py  |  84 ++++++++++++++++-----
 edx_dl/parsing.py | 184 +++++++++++++++++++++++++++++++++++++++++++++-
 edx_dl/utils.py   |   7 ++
 4 files changed, 255 insertions(+), 21 deletions(-)

diff --git a/edx_dl/common.py b/edx_dl/common.py
index 16e52318..d135bec0 100644
--- a/edx_dl/common.py
+++ b/edx_dl/common.py
@@ -168,6 +168,7 @@ class ExitCode(object):
     INVALID_COURSE_URL = 4
     UNKNOWN_PLATFORM = 5
     NO_DOWNLOADABLE_VIDEO = 6
+    MISSING_CSRF_TOKEN = 7


 YOUTUBE_DL_CMD = ['youtube-dl', '--ignore-config']
diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
index fd573f2b..019e7ec7 100644
--- a/edx_dl/edx_dl.py
+++ b/edx_dl/edx_dl.py
@@ -76,6 +76,7 @@
     },
     'fun': {
         'url': 'https://www.fun-mooc.fr',
+        'isSpecialLogin': True,
         'courseware-selector': ('section', {'aria-label': 'Menu du cours'}),
     },
     'gwu-seas': {
@@ -97,6 +98,7 @@
 }
 BASE_URL = OPENEDX_SITES['edx']['url']
 EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+IS_SPECIAL_LOGIN = False
 LOGIN_API = BASE_URL + '/login_ajax'
 DASHBOARD = BASE_URL + '/dashboard'
 COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector']
@@ -108,6 +110,7 @@ def change_openedx_site(site_name):
     """
     global BASE_URL
     global EDX_HOMEPAGE
+    global IS_SPECIAL_LOGIN
     global LOGIN_API
     global DASHBOARD
     global COURSEWARE_SEL
@@ -118,7 +121,13 @@
         sys.exit(ExitCode.UNKNOWN_PLATFORM)

     BASE_URL = OPENEDX_SITES[site_name]['url']
-    EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+    if 'isSpecialLogin' in OPENEDX_SITES[site_name]:
+        EDX_HOMEPAGE = BASE_URL
+        IS_SPECIAL_LOGIN = True
+    else:
+        EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+        IS_SPECIAL_LOGIN = False
+    logging.info('%s', EDX_HOMEPAGE)
     LOGIN_API = BASE_URL + '/login_ajax'
     DASHBOARD = BASE_URL + '/dashboard'
     COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector']
@@ -150,7 +159,7 @@ def get_courses_info(url, headers):
     return courses


-def _get_initial_token(url):
+def _get_initial_token(url, isSpecialLogin):
    """
    Create initial connection to get authentication token for future
    requests.
@@ -168,7 +177,10 @@

    for cookie in cookiejar:
        if cookie.name == 'csrftoken':
-            logging.info('Found CSRF token.')
+            if isSpecialLogin:
+                logging.info('Found first CSRF token: %s.', cookie.value)
+            else:
+                logging.info('Found CSRF token.')
            return cookie.value

    logging.warn('Did not find the CSRF token.')
@@ -210,7 +222,7 @@ def edx_get_subtitle(url, headers,
     return None


-def edx_login(url, headers, username, password):
+def edx_login(url, headers, username, password, isSpecialLogin):
     """
     Log in user into the openedx website.
     """
@@ -223,11 +235,25 @@ def edx_login(url, headers, username, password):
     request = Request(url, post_data, headers)

     try:
         response = urlopen(request)
+
+        cookieItems = response.info()['Set-Cookie'].split(';')
+        csrfToken = ''
+        for cookieItem in cookieItems:
+            cookieEntry = cookieItem.split('=')
+            if cookieEntry[0] == 'csrftoken' and cookieEntry[1]:
+                logging.info('Found new CSRF token %s.', cookieEntry[1])
+                csrfToken = cookieEntry[1]
+                break
+        if csrfToken == '':
+            logging.error('Unable to find any CSRF token')
+            exit(ExitCode.MISSING_CSRF_TOKEN)
+
     except HTTPError as e:
         logging.info('Error, cannot login: %s', e)
         return {'success': False}

     resp = json.loads(response.read().decode('utf-8'))
+    resp.update({'csrfToken': csrfToken})

     return resp
@@ -275,6 +301,13 @@ def parse_args():
                         default=False,
                         help='download subtitles with the videos')

+    parser.add_argument('-q',
+                        '--quality',
+                        dest='quality',
+                        action='store',
+                        default='720p',
+                        help='specify quality of video to download, one of: 1080p, 720p, 480p, 240p')
+
     parser.add_argument('-o',
                         '--output-dir',
                         action='store',
@@ -415,7 +448,7 @@ def parse_args():
     return args


-def edx_get_headers():
+def edx_get_headers(csrfToken = None):
     """
     Build the Open edX headers to create future requests.
     """
@@ -427,14 +460,14 @@ def edx_get_headers():
         'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
         'Referer': EDX_HOMEPAGE,
         'X-Requested-With': 'XMLHttpRequest',
-        'X-CSRFToken': _get_initial_token(EDX_HOMEPAGE),
+        'X-CSRFToken': csrfToken if csrfToken else _get_initial_token(EDX_HOMEPAGE, IS_SPECIAL_LOGIN),
     }

     logging.debug('Headers built: %s', headers)
     return headers


-def extract_units(url, headers, file_formats):
+def extract_units(url, headers, file_formats, quality):
     """
     Parses a webpage and extracts its resources e.g. video_url, sub_url, etc.
     """
@@ -442,12 +475,12 @@
     page = get_page_contents(url, headers)
     page_extractor = get_page_extractor(url)

-    units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)
+    units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats, quality)

     return units


-def extract_all_units_in_sequence(urls, headers, file_formats):
+def extract_all_units_in_sequence(urls, headers, file_formats, quality):
     """
     Returns a dict of all the units in the selected_sections: {url, units}
     sequentially, this is clearer for debug purposes
@@ -455,13 +488,13 @@
     logging.info('Extracting all units information in sequentially.')
     logging.debug('urls: ' + str(urls))

-    units = [extract_units(url, headers, file_formats) for url in urls]
+    units = [extract_units(url, headers, file_formats, quality) for url in urls]
     all_units = dict(zip(urls, units))

     return all_units


-def extract_all_units_in_parallel(urls, headers, file_formats):
+def extract_all_units_in_parallel(urls, headers, file_formats, quality):
     """
     Returns a dict of all the units in the selected_sections: {url, units}
     in parallel
@@ -469,7 +502,7 @@
     logging.info('Extracting all units information in parallel.')
     logging.debug('urls: ' + str(urls))

-    mapfunc = partial(extract_units, file_formats=file_formats, headers=headers)
+    mapfunc = partial(extract_units, file_formats=file_formats, headers=headers, quality=quality)
     pool = ThreadPool(16)
     units = pool.map(mapfunc, urls)
     pool.close()
@@ -742,8 +775,12 @@ def download_youtube_url(url, filename, headers, args):
     """
     Downloads a youtube URL and applies the filters from args
     """
-    logging.info('Downloading video with URL %s from YouTube.', url)
-    video_format_option = args.format + '/mp4' if args.format else 'mp4'
+    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format %s.', url, args.quality, args.format)
+    quality = re.sub('\D', '', args.quality)
+    if quality:
+        video_format_option = '(' + args.format + ')[height=' + quality + ']' + '/' + args.format + '/(mp4)[height=' + quality + ']/mp4' if args.format else '(mp4)[height=' + quality + ']/mp4'
+    else:
+        video_format_option = args.format + '/mp4' if args.format else 'mp4'
     cmd = YOUTUBE_DL_CMD + ['-o', filename, '-f', video_format_option]

     if args.subtitles:
@@ -751,6 +788,7 @@
     cmd.extend(args.youtube_dl_options.split())

     cmd.append(url)
+    logging.debug('Youtube_dl cmd: %s', cmd)

     execute_command(cmd, args)
@@ -908,7 +946,8 @@ def num_urls_in_units_dict(units_dict):

 def extract_all_units_with_cache(all_urls, headers, file_formats,
                                  filename=DEFAULT_CACHE_FILENAME,
-                                 extractor=extract_all_units_in_parallel):
+                                 extractor=extract_all_units_in_parallel,
+                                 quality=None):
     """
     Extracts the units which are not in the cache and extract their resources
     returns the full list of units (cached+new)
@@ -928,7 +967,7 @@
     new_urls = [url for url in all_urls if url not in cached_units]
     logging.info('loading %d urls from cache [%s]', len(cached_units.keys()),
                  filename)
-    new_units = extractor(new_urls, headers, file_formats)
+    new_units = extractor(new_urls, headers, file_formats, quality)

     all_units = cached_units.copy()
     all_units.update(new_units)
@@ -1005,10 +1044,16 @@ def main():
     headers = edx_get_headers()

     # Login
-    resp = edx_login(LOGIN_API, headers, args.username, args.password)
+    resp = edx_login(LOGIN_API, headers, args.username, args.password, IS_SPECIAL_LOGIN)
     if not resp.get('success', False):
         logging.error(resp.get('value', "Wrong Email or Password."))
         exit(ExitCode.WRONG_EMAIL_OR_PASSWORD)
+    if not resp.get('csrfToken', False):
+        logging.error('Unable to find any CSRF token')
+        exit(ExitCode.MISSING_CSRF_TOKEN)
+
+    # Set new header especially csrftoken
+    headers = edx_get_headers(resp.get('csrfToken'))

     # Parse and select the available courses
     courses = get_courses_info(DASHBOARD, headers)
@@ -1045,9 +1090,10 @@
     if args.cache:
         all_units = extract_all_units_with_cache(all_urls, headers,
                                                  file_formats,
-                                                 extractor=extractor,
+                                                 extractor=extractor,
+                                                 quality=args.quality)
     else:
-        all_units = extractor(all_urls, headers, file_formats)
+        all_units = extractor(all_urls, headers, file_formats, args.quality)

     parse_units(selections)
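Two of the additions above are small enough to sanity-check in isolation. First, the Set-Cookie scan in edx_login: a minimal sketch of the same idea, with an extra guard for entries lacking an '=' (the header value is a made-up example, not a real FUN response):

    # Split the Set-Cookie header on ';' and return the first non-empty
    # csrftoken value, or '' when none is present.
    def find_csrf_token(set_cookie_header):
        for item in set_cookie_header.split(';'):
            entry = item.strip().split('=')
            if entry[0] == 'csrftoken' and len(entry) > 1 and entry[1]:
                return entry[1]
        return ''

    header = 'csrftoken=abc123; Path=/; Max-Age=31449600'
    assert find_csrf_token(header) == 'abc123'

Second, the youtube-dl selector built in download_youtube_url. A sketch mirroring the new video_format_option expression (it relies on youtube-dl's '/' fallback chain and '[height=...]' filter syntax; the inputs are examples):

    import re

    # Mirrors the video_format_option expression added above.
    def build_format_option(fmt, requested):
        quality = re.sub(r'\D', '', requested)  # '720p' -> '720'
        if quality:
            if fmt:
                return ('(' + fmt + ')[height=' + quality + ']/' + fmt +
                        '/(mp4)[height=' + quality + ']/mp4')
            return '(mp4)[height=' + quality + ']/mp4'
        return fmt + '/mp4' if fmt else 'mp4'

    assert build_format_option(None, '720p') == '(mp4)[height=720]/mp4'
    assert (build_format_option('webm', '480p') ==
            '(webm)[height=480]/webm/(mp4)[height=480]/mp4')

With the default --quality of 720p and no -f option, youtube-dl is first asked for an mp4 whose height is exactly 720, and falls back to its best plain mp4 when that rendition does not exist.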
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
index c3812838..9c847211 100644
--- a/edx_dl/parsing.py
+++ b/edx_dl/parsing.py
@@ -407,6 +407,183 @@ def _make_subsections(section_soup):
     return sections


+class FunMoocPageExtractor(CurrentEdXPageExtractor):
+    """
+    A new page extractor for the latest changes of Fun Mooc
+    """
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
+        """
+        Extract Units from the html of a subsection webpage as a list of
+        resources
+        """
+        # in this function we avoid using beautifulsoup for performance reasons
+        # parsing html with regular expressions is really nasty, don't do this
+        # if you don't need to!
+        re_units = re.compile('(<div?[^>]id="seq_contents_\d+".*?>.*?<\/div>)',
+                              re.DOTALL)
+        units = []
+
+        for unit_html in re_units.findall(page):
+            unit = self.extract_unit(unit_html, BASE_URL, file_formats, quality)
+            if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
+                units.append(unit)
+        return units
+
+    def extract_mp4_urls(self, text, quality = None):
+        """
+        Looks for available links to the mp4 version of the videos
+        """
+        # mp4 urls may be in two places, in the field data-sources, and as
+        # refs. This regex tries to match all the appearances, however we
+        # exclude the ';' character in the urls, since it is used to separate
+        # multiple urls in one string; ';' is a valid url name character,
+        # but it is not really common.
+        re_mp4_urls = re.compile(r'(?:(https?://[^;]*?\.mp4))')
+        mp4_urls = list(set(re_mp4_urls.findall(text)))
+
+        # choose the right quality accordingly
+        new_mp4_urls = []
+        qualities_dict = {
+            '1080p': 'HD',
+            '720p': 'SD',
+            '480p': 'LD',
+            '240p': 'UL'
+        }
+        best_qualities_order = {
+            'HD': 1,
+            'SD': 2,
+            'LD': 3,
+            'UL': 4
+        }
+        url_quality_list = {}
+        if not quality:
+            return mp4_urls
+        if quality not in qualities_dict:
+            return mp4_urls
+        else:
+            # list all available formats if the url is identical
+            for mp4_url in mp4_urls:
+                # split url and check the filename
+                url_split = mp4_url.rsplit('/', 1)
+                start_url = url_split[0]
+                file_name = url_split.pop().rsplit('.')[0]
+                if start_url and file_name:
+                    # file_name is not a known quality:
+                    # add it in the final array to return
+                    if file_name not in best_qualities_order:
+                        new_mp4_urls.append(mp4_url)
+                    else:
+                        # add in a new dict the url as key and quality
+                        # in an array
+                        if start_url in url_quality_list and url_quality_list[start_url]:
+                            url_quality_list[start_url].append(file_name)
+                        else:
+                            # key does not exist
+                            url_quality_list[start_url] = [file_name]
+                else:
+                    # add it in the final array to return
+                    new_mp4_urls.append(mp4_url)
+
+            found = False
+            # pick the best quality
+            for url, qualities in url_quality_list.items():
+                if len(qualities) > 1:
+                    for quality_url in qualities:
+                        # quality selected by the user exists in the array
+                        if quality_url == qualities_dict[quality]:
+                            new_mp4_urls.append(url + '/' + quality_url + '.mp4')
+                            found = True
+                            break
+                    # pick the quality just below the one selected
+                    # by the user
+                    if not found:
+                        for quality_order, priority in best_qualities_order.items():
+                            args_quality = best_qualities_order[qualities_dict[quality]]
+                            if args_quality < priority:
+                                new_mp4_urls.append(url + '/' + quality_order + '.mp4')
+                                break
+                else:
+                    new_mp4_urls.append(url + '/' + qualities[0] + '.mp4')
+
+        return new_mp4_urls
+
+    def extract_unit(self, text, BASE_URL, file_formats, quality):
+        """
+        Parses the <div> of each unit and extracts the urls of its resources
+        """
+        video_youtube_url = self.extract_video_youtube_url(text)
+        available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
+        mp4_urls = self.extract_mp4_urls(text, quality)
+        videos = [Video(video_youtube_url=video_youtube_url,
+                        available_subs_url=available_subs_url,
+                        sub_template_url=sub_template_url,
+                        mp4_urls=mp4_urls)]
+
+        resources_urls = self.extract_resources_urls(text, BASE_URL,
+                                                     file_formats)
+        return Unit(videos=videos, resources_urls=resources_urls)
+
+    def extract_sections_from_html(self, page, BASE_URL):
+        """
+        Extract sections (Section->SubSection) from the html page
+        """
+        def _make_url(section_soup):  # FIXME: Extract from here and test
+            try:
+                # no url for the header section
+                return None
+            except AttributeError:
+                # Section might be empty and contain no links
+                return None
+
+        def _get_section_name(section_soup):  # FIXME: Extract from here and test
+            try:
+                return section_soup.a.span.get_text().strip()
+            except AttributeError:
+                return None
+
+        def _make_subsections(section_soup):
+            try:
+                subsections_soup = section_soup.find_all('div', class_=['menu-item'])
+            except AttributeError:
+                return []
+            # FIXME correct extraction of subsection.name (unicode)
+            subsections = [SubSection(position=i,
+                                      url=BASE_URL + s.a['href'],
+                                      name=s.a.p.string.strip())
+                           for i, s in enumerate(subsections_soup, 1)]
+            return subsections
+
+        soup = BeautifulSoup(page)
+        parent_sections_soup = soup.find('nav', class_=['course-navigation'])
+        chapter_sections_soup = parent_sections_soup.find_all('a', class_=['button-chapter', 'chapter'])
+        subsections_container_soup = parent_sections_soup.find_all('div', class_=['chapter-content-container'])
+
+        # Manipulate dom to put chapter section soup
+        # into subsections
+        sections_soup = []
+        for chapter_section_soup in chapter_sections_soup:
+            for subsection_container_soup in subsections_container_soup:
+                chapter_id = chapter_section_soup.get('href').replace('#', '')
+                if chapter_id == subsection_container_soup.get('id'):
+                    soup_empty = BeautifulSoup('<div></div>')
+                    section = soup_empty.new_tag('div')
+                    section.append(chapter_section_soup)
+                    section.append(subsection_container_soup.div.div)
+                    sections_soup.append(section)
+                    break
+
+        sections = [Section(position=i,
+                            name=_get_section_name(section_soup),
+                            url=_make_url(section_soup),
+                            subsections=_make_subsections(section_soup))
+                    for i, section_soup in enumerate(sections_soup, 1)]
+
+        # Filter out those sections for which name could not be parsed
+        sections = [section for section in sections
+                    if section.name]
+
+        return sections
+

 def get_page_extractor(url):
     """
@@ -419,10 +596,13 @@
         return NewEdXPageExtractor()
     elif (
         url.startswith('https://edge.edx.org') or
-        url.startswith('https://lagunita.stanford.edu') or
-        url.startswith('https://www.fun-mooc.fr')
+        url.startswith('https://lagunita.stanford.edu')
     ):
         return NewEdXPageExtractor()
+    elif (
+        url.startswith('https://www.fun-mooc.fr')
+    ):
+        return FunMoocPageExtractor()
     else:
         return ClassicEdXPageExtractor()
diff --git a/edx_dl/utils.py b/edx_dl/utils.py
index 0ec44718..e04e8470 100644
--- a/edx_dl/utils.py
+++ b/edx_dl/utils.py
@@ -56,6 +56,13 @@ def get_page_contents(url, headers):
     request, we use the headers given in the dictionary in headers.
     """
     result = urlopen(Request(url, None, headers))
+
+    # get the final redirection
+    url = result.geturl()
+
+    if url:
+        result = urlopen(Request(url, None, headers))
+
     try:
         # for python3
         charset = result.headers.get_content_charset(failobj="utf-8")
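Before the follow-up patch below, the quality selection in FunMoocPageExtractor.extract_mp4_urls is easier to follow in a condensed form. This sketch keeps the rule (map the requested quality to FUN's HD/SD/LD/UL file names, take an exact match per base URL, otherwise fall back to a lower quality) but simplifies the bookkeeping of the original; the URLs are hypothetical:

    QUALITY_NAMES = {'1080p': 'HD', '720p': 'SD', '480p': 'LD', '240p': 'UL'}
    QUALITY_RANK = {'HD': 1, 'SD': 2, 'LD': 3, 'UL': 4}

    def pick_mp4(mp4_urls, quality):
        wanted = QUALITY_NAMES.get(quality)
        if wanted is None:
            return mp4_urls
        by_base = {}
        for url in mp4_urls:
            base, name = url.rsplit('/', 1)
            by_base.setdefault(base, []).append(name.rsplit('.')[0])
        picked = []
        for base, names in by_base.items():
            if wanted in names:
                picked.append(base + '/' + wanted + '.mp4')
                continue
            # fall back to the best quality below the requested one
            lower = [n for n in names
                     if n in QUALITY_RANK and QUALITY_RANK[n] > QUALITY_RANK[wanted]]
            chosen = min(lower, key=QUALITY_RANK.get) if lower else names[0]
            picked.append(base + '/' + chosen + '.mp4')
        return picked

    # Hypothetical URLs, for illustration only:
    urls = ['https://example.test/video/HD.mp4', 'https://example.test/video/LD.mp4']
    assert pick_mp4(urls, '720p') == ['https://example.test/video/LD.mp4']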
From e71a6dc98ef5b23ea8d0b1342143d675248e9d5c Mon Sep 17 00:00:00 2001
From: Florent VIOLLEAU
Date: Mon, 20 Apr 2020 02:57:46 +0200
Subject: [PATCH 2/2] Remove logs, fix issues for non-Fun edX and remove unneeded checks

---
 edx_dl/edx_dl.py  | 18 ++++++------------
 edx_dl/parsing.py | 18 +++++++++---------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
index 019e7ec7..98c74b06 100644
--- a/edx_dl/edx_dl.py
+++ b/edx_dl/edx_dl.py
@@ -178,7 +178,7 @@ def _get_initial_token(url, isSpecialLogin):
     for cookie in cookiejar:
         if cookie.name == 'csrftoken':
             if isSpecialLogin:
-                logging.info('Found first CSRF token: %s.', cookie.value)
+                logging.info('Found first CSRF token')
             else:
                 logging.info('Found CSRF token.')
             return cookie.value
@@ -241,12 +241,9 @@ def edx_login(url, headers, username, password, isSpecialLogin):
         for cookieItem in cookieItems:
             cookieEntry = cookieItem.split('=')
             if cookieEntry[0] == 'csrftoken' and cookieEntry[1]:
-                logging.info('Found new CSRF token %s.', cookieEntry[1])
+                logging.info('Found new CSRF token')
                 csrfToken = cookieEntry[1]
                 break
-        if csrfToken == '':
-            logging.error('Unable to find any CSRF token')
-            exit(ExitCode.MISSING_CSRF_TOKEN)

     except HTTPError as e:
         logging.info('Error, cannot login: %s', e)
         return {'success': False}
@@ -775,7 +772,7 @@ def download_youtube_url(url, filename, headers, args):
     """
     Downloads a youtube URL and applies the filters from args
     """
-    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format %s.', url, args.quality, args.format)
+    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format selected.', url, args.quality)
     quality = re.sub('\D', '', args.quality)
     if quality:
         video_format_option = '(' + args.format + ')[height=' + quality + ']' + '/' + args.format + '/(mp4)[height=' + quality + ']/mp4' if args.format else '(mp4)[height=' + quality + ']/mp4'
     else:
         video_format_option = args.format + '/mp4' if args.format else 'mp4'
@@ -1048,12 +1045,9 @@ def main():
     if not resp.get('success', False):
         logging.error(resp.get('value', "Wrong Email or Password."))
         exit(ExitCode.WRONG_EMAIL_OR_PASSWORD)
-    if not resp.get('csrfToken', False):
-        logging.error('Unable to find any CSRF token')
-        exit(ExitCode.MISSING_CSRF_TOKEN)
-
-    # Set new header especially csrftoken
-    headers = edx_get_headers(resp.get('csrfToken'))
+    if resp.get('csrfToken', False):
+        # Set new header especially csrftoken
+        headers = edx_get_headers(resp.get('csrfToken'))

     # Parse and select the available courses
     courses = get_courses_info(DASHBOARD, headers)
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
index 9c847211..26f03597 100644
--- a/edx_dl/parsing.py
+++ b/edx_dl/parsing.py
@@ -60,7 +60,7 @@ class PageExtractor(object):
     >>> ...
     """

-    def extract_units_from_html(self, page, BASE_URL, file_formats):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
         """
         Method to extract the resources (units) from the given page
         """
@@ -81,7 +81,7 @@ def extract_courses_from_html(self, page, BASE_URL):


 class ClassicEdXPageExtractor(PageExtractor):

-    def extract_units_from_html(self, page, BASE_URL, file_formats):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
         """
         Extract Units from the html of a subsection webpage as a list of
         resources
@@ -94,18 +94,18 @@
         units = []

         for unit_html in re_units.findall(page):
-            unit = self.extract_unit(unit_html, BASE_URL, file_formats)
+            unit = self.extract_unit(unit_html, BASE_URL, file_formats, quality)
             if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
                 units.append(unit)
         return units

-    def extract_unit(self, text, BASE_URL, file_formats):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         """
         Parses the <div> of each unit and extracts the urls of its resources
         """
         video_youtube_url = self.extract_video_youtube_url(text)
         available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
-        mp4_urls = self.extract_mp4_urls(text)
+        mp4_urls = self.extract_mp4_urls(text, quality)
         videos = [Video(video_youtube_url=video_youtube_url,
                         available_subs_url=available_subs_url,
                         sub_template_url=sub_template_url,
@@ -152,7 +152,7 @@

         return available_subs_url, sub_template_url

-    def extract_mp4_urls(self, text):
+    def extract_mp4_urls(self, text, quality = None):
         """
         Looks for available links to the mp4 version of the videos
         """
@@ -283,7 +283,7 @@ class CurrentEdXPageExtractor(ClassicEdXPageExtractor):
     """
     A new page extractor for the recent changes in layout of edx
     """
-    def extract_unit(self, text, BASE_URL, file_formats):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         re_metadata = re.compile(r'data-metadata=&#39;(.*?)&#39;')
         videos = []
         match_metadatas = re_metadata.findall(text)
@@ -411,7 +411,7 @@ class FunMoocPageExtractor(CurrentEdXPageExtractor):
     """
     A new page extractor for the latest changes of Fun Mooc
     """
-    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality = None):
         """
         Extract Units from the html of a subsection webpage as a list of
         resources
@@ -507,7 +507,7 @@ def extract_mp4_urls(self, text, quality = None):

         return new_mp4_urls

-    def extract_unit(self, text, BASE_URL, file_formats, quality):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         """
         Parses the <div> of each unit and extracts the urls of its resources
         """
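Taken together, the two patches turn the FUN login into a two-step CSRF exchange: fetch the site root so the server sets an initial csrftoken cookie, post the credentials with that token in X-CSRFToken, then keep the rotated csrftoken returned via Set-Cookie for subsequent requests. A standalone sketch of that flow using only the standard library ('/login_ajax' mirrors LOGIN_API above; the base URL and payload are illustrative and error handling is omitted):

    import json
    from http.cookiejar import CookieJar
    from urllib.parse import urlencode
    from urllib.request import HTTPCookieProcessor, Request, build_opener

    def login(base_url, username, password):
        jar = CookieJar()
        opener = build_opener(HTTPCookieProcessor(jar))

        # Step 1: GET the homepage; the server answers with a csrftoken cookie.
        opener.open(base_url)
        token = next((c.value for c in jar if c.name == 'csrftoken'), None)

        # Step 2: POST the credentials with the token; the jar transparently
        # stores the rotated csrftoken from Set-Cookie for later requests.
        data = urlencode({'email': username, 'password': password}).encode('utf-8')
        response = opener.open(Request(base_url + '/login_ajax', data,
                                       {'X-CSRFToken': token,
                                        'X-Requested-With': 'XMLHttpRequest'}))
        resp = json.loads(response.read().decode('utf-8'))
        new_token = next((c.value for c in jar if c.name == 'csrftoken'), None)
        return resp, new_token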