From 1d37bced63a1a6b96e284180da27124c1a7ee9e8 Mon Sep 17 00:00:00 2001
From: Florent VIOLLEAU
Date: Mon, 20 Apr 2020 01:59:35 +0200
Subject: [PATCH 1/2] [Fun Mooc] Fix login issue and add quality selection

---
 edx_dl/common.py  |   1 +
 edx_dl/edx_dl.py  |  84 ++++++++++++++++-----
 edx_dl/parsing.py | 184 +++++++++++++++++++++++++++++++++++++++++++++-
 edx_dl/utils.py   |   7 ++
 4 files changed, 255 insertions(+), 21 deletions(-)

diff --git a/edx_dl/common.py b/edx_dl/common.py
index 16e52318..d135bec0 100644
--- a/edx_dl/common.py
+++ b/edx_dl/common.py
@@ -168,6 +168,7 @@ class ExitCode(object):
     INVALID_COURSE_URL = 4
     UNKNOWN_PLATFORM = 5
     NO_DOWNLOADABLE_VIDEO = 6
+    MISSING_CSRF_TOKEN = 7


 YOUTUBE_DL_CMD = ['youtube-dl', '--ignore-config']
diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
index fd573f2b..019e7ec7 100644
--- a/edx_dl/edx_dl.py
+++ b/edx_dl/edx_dl.py
@@ -76,6 +76,7 @@
     },
     'fun': {
         'url': 'https://www.fun-mooc.fr',
+        'isSpecialLogin': True,
         'courseware-selector': ('section', {'aria-label': 'Menu du cours'}),
     },
     'gwu-seas': {
@@ -97,6 +98,7 @@
 }
 BASE_URL = OPENEDX_SITES['edx']['url']
 EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+IS_SPECIAL_LOGIN = False
 LOGIN_API = BASE_URL + '/login_ajax'
 DASHBOARD = BASE_URL + '/dashboard'
 COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector']
@@ -108,6 +110,7 @@ def change_openedx_site(site_name):
     """
     global BASE_URL
     global EDX_HOMEPAGE
+    global IS_SPECIAL_LOGIN
     global LOGIN_API
     global DASHBOARD
     global COURSEWARE_SEL
@@ -118,7 +121,13 @@
         sys.exit(ExitCode.UNKNOWN_PLATFORM)

     BASE_URL = OPENEDX_SITES[site_name]['url']
-    EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+    if 'isSpecialLogin' in OPENEDX_SITES[site_name]:
+        EDX_HOMEPAGE = BASE_URL
+        IS_SPECIAL_LOGIN = True
+    else:
+        EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+        IS_SPECIAL_LOGIN = False
+    logging.info('%s', EDX_HOMEPAGE)
     LOGIN_API = BASE_URL + '/login_ajax'
     DASHBOARD = BASE_URL + '/dashboard'
     COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector']
@@ -150,7 +159,7 @@ def get_courses_info(url, headers):
     return courses


-def _get_initial_token(url):
+def _get_initial_token(url, isSpecialLogin):
    """
    Create initial connection to get authentication token for future
    requests.
@@ -168,7 +177,10 @@

    for cookie in cookiejar:
        if cookie.name == 'csrftoken':
-            logging.info('Found CSRF token.')
+            if isSpecialLogin:
+                logging.info('Found first CSRF token: %s.', cookie.value)
+            else:
+                logging.info('Found CSRF token.')
            return cookie.value

    logging.warn('Did not find the CSRF token.')
@@ -210,7 +222,7 @@ def edx_get_subtitle(url, headers,
     return None


-def edx_login(url, headers, username, password):
+def edx_login(url, headers, username, password, isSpecialLogin):
     """
     Log in user into the openedx website.
     """
@@ -223,11 +235,25 @@ def edx_login(url, headers, username, password):
     request = Request(url, post_data, headers)

     try:
         response = urlopen(request)
+
+        cookieItems = response.info()['Set-Cookie'].split(';')
+        csrfToken = ''
+        for cookieItem in cookieItems:
+            cookieEntry = cookieItem.split('=')
+            if cookieEntry[0] == 'csrftoken' and cookieEntry[1]:
+                logging.info('Found new CSRF token %s.', cookieEntry[1])
+                csrfToken = cookieEntry[1]
+                break
+        if csrfToken == '':
+            logging.error('Unable to find any CSRF token')
+            exit(ExitCode.MISSING_CSRF_TOKEN)
+
     except HTTPError as e:
         logging.info('Error, cannot login: %s', e)
         return {'success': False}

     resp = json.loads(response.read().decode('utf-8'))
+    resp.update({'csrfToken': csrfToken})

     return resp
@@ -275,6 +301,13 @@ def parse_args():
                         default=False,
                         help='download subtitles with the videos')

+    parser.add_argument('-q',
+                        '--quality',
+                        dest='quality',
+                        action='store',
+                        default='720p',
+                        help='specify quality of video to download, one of: 1080p, 720p, 480p, 240p')
+
     parser.add_argument('-o',
                         '--output-dir',
                         action='store',
@@ -415,7 +448,7 @@ def parse_args():
     return args


-def edx_get_headers():
+def edx_get_headers(csrfToken = None):
     """
     Build the Open edX headers to create future requests.
     """
@@ -427,14 +460,14 @@ def edx_get_headers():
         'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
         'Referer': EDX_HOMEPAGE,
         'X-Requested-With': 'XMLHttpRequest',
-        'X-CSRFToken': _get_initial_token(EDX_HOMEPAGE),
+        'X-CSRFToken': csrfToken if csrfToken else _get_initial_token(EDX_HOMEPAGE, IS_SPECIAL_LOGIN),
     }

     logging.debug('Headers built: %s', headers)
     return headers


-def extract_units(url, headers, file_formats):
+def extract_units(url, headers, file_formats, quality):
     """
     Parses a webpage and extracts its resources e.g. video_url, sub_url, etc.
     """
@@ -442,12 +475,12 @@
     page = get_page_contents(url, headers)
     page_extractor = get_page_extractor(url)

-    units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)
+    units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats, quality)

     return units


-def extract_all_units_in_sequence(urls, headers, file_formats):
+def extract_all_units_in_sequence(urls, headers, file_formats, quality):
     """
     Returns a dict of all the units in the selected_sections: {url, units}
     sequentially, this is clearer for debug purposes
@@ -455,13 +488,13 @@
     logging.info('Extracting all units information in sequentially.')
     logging.debug('urls: ' + str(urls))

-    units = [extract_units(url, headers, file_formats) for url in urls]
+    units = [extract_units(url, headers, file_formats, quality) for url in urls]
     all_units = dict(zip(urls, units))

     return all_units


-def extract_all_units_in_parallel(urls, headers, file_formats):
+def extract_all_units_in_parallel(urls, headers, file_formats, quality):
     """
     Returns a dict of all the units in the selected_sections: {url, units}
     in parallel
@@ -469,7 +502,7 @@
     logging.info('Extracting all units information in parallel.')
     logging.debug('urls: ' + str(urls))

-    mapfunc = partial(extract_units, file_formats=file_formats, headers=headers)
+    mapfunc = partial(extract_units, file_formats=file_formats, headers=headers, quality=quality)
     pool = ThreadPool(16)
     units = pool.map(mapfunc, urls)
     pool.close()
@@ -742,8 +775,12 @@ def download_youtube_url(url, filename, headers, args):
     """
     Downloads a youtube URL and applies the filters from args
     """
-    logging.info('Downloading video with URL %s from YouTube.', url)
-    video_format_option = args.format + '/mp4' if args.format else 'mp4'
+    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format %s.', url, args.quality, args.format)
+    quality = re.sub('\D', '', args.quality)
+    if quality:
+        video_format_option = '(' + args.format + ')[height=' + quality + ']' + '/' + args.format + '/(mp4)[height=' + quality + ']/mp4' if args.format else '(mp4)[height=' + quality + ']/mp4'
+    else:
+        video_format_option = args.format + '/mp4' if args.format else 'mp4'
     cmd = YOUTUBE_DL_CMD + ['-o', filename, '-f', video_format_option]

     if args.subtitles:
@@ -751,6 +788,7 @@
     cmd.extend(args.youtube_dl_options.split())

     cmd.append(url)
+    logging.debug('Youtube_dl cmd: %s', cmd)

     execute_command(cmd, args)
@@ -908,7 +946,8 @@ def num_urls_in_units_dict(units_dict):

 def extract_all_units_with_cache(all_urls, headers, file_formats,
                                  filename=DEFAULT_CACHE_FILENAME,
-                                 extractor=extract_all_units_in_parallel):
+                                 extractor=extract_all_units_in_parallel,
+                                 quality=None):
     """
     Extracts the units which are not in the cache and extract their resources
     returns the full list of units (cached+new)
@@ -928,7 +967,7 @@
     new_urls = [url for url in all_urls if url not in cached_units]
     logging.info('loading %d urls from cache [%s]', len(cached_units.keys()),
                  filename)
-    new_units = extractor(new_urls, headers, file_formats)
+    new_units = extractor(new_urls, headers, file_formats, quality)

     all_units = cached_units.copy()
     all_units.update(new_units)
@@ -1005,10 +1044,16 @@ def main():
     headers = edx_get_headers()

     # Login
-    resp = edx_login(LOGIN_API, headers, args.username, args.password)
+    resp = edx_login(LOGIN_API, headers, args.username, args.password, IS_SPECIAL_LOGIN)
     if not resp.get('success', False):
         logging.error(resp.get('value', "Wrong Email or Password."))
         exit(ExitCode.WRONG_EMAIL_OR_PASSWORD)
+    if not resp.get('csrfToken', False):
+        logging.error('Unable to find any CSRF token')
+        exit(ExitCode.MISSING_CSRF_TOKEN)
+
+    # Set new header especially csrftoken
+    headers = edx_get_headers(resp.get('csrfToken'))

     # Parse and select the available courses
     courses = get_courses_info(DASHBOARD, headers)
@@ -1045,9 +1090,10 @@
     if args.cache:
         all_units = extract_all_units_with_cache(all_urls, headers,
                                                  file_formats,
-                                                 extractor=extractor,
+                                                 extractor=extractor,
+                                                 quality=args.quality)
     else:
-        all_units = extractor(all_urls, headers, file_formats)
+        all_units = extractor(all_urls, headers, file_formats, args.quality)

     parse_units(selections)
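Two of the additions above are small enough to sanity-check in isolation. First, the Set-Cookie scan in edx_login: a minimal sketch of the same idea, with an extra guard for entries lacking an '=' (the header value is a made-up example, not a real FUN response):

    # Split the Set-Cookie header on ';' and return the first non-empty
    # csrftoken value, or '' when none is present.
    def find_csrf_token(set_cookie_header):
        for item in set_cookie_header.split(';'):
            entry = item.strip().split('=')
            if entry[0] == 'csrftoken' and len(entry) > 1 and entry[1]:
                return entry[1]
        return ''

    header = 'csrftoken=abc123; Path=/; Max-Age=31449600'
    assert find_csrf_token(header) == 'abc123'

Second, the youtube-dl selector built in download_youtube_url. A sketch mirroring the new video_format_option expression (it relies on youtube-dl's '/' fallback chain and '[height=...]' filter syntax; the inputs are examples):

    import re

    # Mirrors the video_format_option expression added above.
    def build_format_option(fmt, requested):
        quality = re.sub(r'\D', '', requested)  # '720p' -> '720'
        if quality:
            if fmt:
                return ('(' + fmt + ')[height=' + quality + ']/' + fmt +
                        '/(mp4)[height=' + quality + ']/mp4')
            return '(mp4)[height=' + quality + ']/mp4'
        return fmt + '/mp4' if fmt else 'mp4'

    assert build_format_option(None, '720p') == '(mp4)[height=720]/mp4'
    assert (build_format_option('webm', '480p') ==
            '(webm)[height=480]/webm/(mp4)[height=480]/mp4')

With the default --quality of 720p and no -f option, youtube-dl is first asked for an mp4 whose height is exactly 720, and falls back to its best plain mp4 when that rendition does not exist.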
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
index c3812838..9c847211 100644
--- a/edx_dl/parsing.py
+++ b/edx_dl/parsing.py
@@ -407,6 +407,183 @@ def _make_subsections(section_soup):
     return sections


+class FunMoocPageExtractor(CurrentEdXPageExtractor):
+    """
+    A new page extractor for the latest changes of Fun Mooc
+    """
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
+        """
+        Extract Units from the html of a subsection webpage as a list of
+        resources
+        """
+        # in this function we avoid using beautifulsoup for performance reasons
+        # parsing html with regular expressions is really nasty, don't do this
+        # if you don't need to!
+        re_units = re.compile('(<div?[^>]id="seq_contents_\d+".*?>.*?<\/div>)',
+                              re.DOTALL)
+        units = []
+
+        for unit_html in re_units.findall(page):
+            unit = self.extract_unit(unit_html, BASE_URL, file_formats, quality)
+            if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
+                units.append(unit)
+        return units
+
+    def extract_mp4_urls(self, text, quality = None):
+        """
+        Looks for available links to the mp4 version of the videos
+        """
+        # mp4 urls may be in two places, in the field data-sources, and as
+        # refs. This regex tries to match all the appearances, however we
+        # exclude the ';' character in the urls, since it is used to separate
+        # multiple urls in one string; ';' is a valid url name character,
+        # but it is not really common.
+        re_mp4_urls = re.compile(r'(?:(https?://[^;]*?\.mp4))')
+        mp4_urls = list(set(re_mp4_urls.findall(text)))
+
+        # choose the right quality accordingly
+        new_mp4_urls = []
+        qualities_dict = {
+            '1080p': 'HD',
+            '720p': 'SD',
+            '480p': 'LD',
+            '240p': 'UL'
+        }
+        best_qualities_order = {
+            'HD': 1,
+            'SD': 2,
+            'LD': 3,
+            'UL': 4
+        }
+        url_quality_list = {}
+        if not quality:
+            return mp4_urls
+        if quality not in qualities_dict:
+            return mp4_urls
+        else:
+            # list all available formats if the url is identical
+            for mp4_url in mp4_urls:
+                # split url and check the filename
+                url_split = mp4_url.rsplit('/', 1)
+                start_url = url_split[0]
+                file_name = url_split.pop().rsplit('.')[0]
+                if start_url and file_name:
+                    # file_name is not a known quality:
+                    # add it in the final array to return
+                    if file_name not in best_qualities_order:
+                        new_mp4_urls.append(mp4_url)
+                    else:
+                        # add in a new dict the url as key and quality
+                        # in an array
+                        if start_url in url_quality_list and url_quality_list[start_url]:
+                            url_quality_list[start_url].append(file_name)
+                        else:
+                            # key does not exist
+                            url_quality_list[start_url] = [file_name]
+                else:
+                    # add it in the final array to return
+                    new_mp4_urls.append(mp4_url)
+
+            found = False
+            # pick the best quality
+            for url, qualities in url_quality_list.items():
+                if len(qualities) > 1:
+                    for quality_url in qualities:
+                        # quality selected by the user exists in the array
+                        if quality_url == qualities_dict[quality]:
+                            new_mp4_urls.append(url + '/' + quality_url + '.mp4')
+                            found = True
+                            break
+                    # pick the quality just below the one selected
+                    # by the user
+                    if not found:
+                        for quality_order, priority in best_qualities_order.items():
+                            args_quality = best_qualities_order[qualities_dict[quality]]
+                            if args_quality < priority:
+                                new_mp4_urls.append(url + '/' + quality_order + '.mp4')
+                                break
+                else:
+                    new_mp4_urls.append(url + '/' + qualities[0] + '.mp4')
+
+        return new_mp4_urls
+
+    def extract_unit(self, text, BASE_URL, file_formats, quality):
+        """
+        Parses the <div> of each unit and extracts the urls of its resources
+        """
+        video_youtube_url = self.extract_video_youtube_url(text)
+        available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
+        mp4_urls = self.extract_mp4_urls(text, quality)
+        videos = [Video(video_youtube_url=video_youtube_url,
+                        available_subs_url=available_subs_url,
+                        sub_template_url=sub_template_url,
+                        mp4_urls=mp4_urls)]
+
+        resources_urls = self.extract_resources_urls(text, BASE_URL,
+                                                     file_formats)
+        return Unit(videos=videos, resources_urls=resources_urls)
+
+    def extract_sections_from_html(self, page, BASE_URL):
+        """
+        Extract sections (Section->SubSection) from the html page
+        """
+        def _make_url(section_soup):  # FIXME: Extract from here and test
+            try:
+                # no url for the header section
+                return None
+            except AttributeError:
+                # Section might be empty and contain no links
+                return None
+
+        def _get_section_name(section_soup):  # FIXME: Extract from here and test
+            try:
+                return section_soup.a.span.get_text().strip()
+            except AttributeError:
+                return None
+
+        def _make_subsections(section_soup):
+            try:
+                subsections_soup = section_soup.find_all('div', class_=['menu-item'])
+            except AttributeError:
+                return []
+            # FIXME correct extraction of subsection.name (unicode)
+            subsections = [SubSection(position=i,
+                                      url=BASE_URL + s.a['href'],
+                                      name=s.a.p.string.strip())
+                           for i, s in enumerate(subsections_soup, 1)]
+            return subsections
+
+        soup = BeautifulSoup(page)
+        parent_sections_soup = soup.find('nav', class_=['course-navigation'])
+        chapter_sections_soup = parent_sections_soup.find_all('a', class_=['button-chapter', 'chapter'])
+        subsections_container_soup = parent_sections_soup.find_all('div', class_=['chapter-content-container'])
+
+        # Manipulate dom to put chapter section soup
+        # into subsections
+        sections_soup = []
+        for chapter_section_soup in chapter_sections_soup:
+            for subsection_container_soup in subsections_container_soup:
+                chapter_id = chapter_section_soup.get('href').replace('#', '')
+                if chapter_id == subsection_container_soup.get('id'):
+                    soup_empty = BeautifulSoup('<div></div>')
+                    section = soup_empty.new_tag('div')
+                    section.append(chapter_section_soup)
+                    section.append(subsection_container_soup.div.div)
+                    sections_soup.append(section)
+                    break
+
+        sections = [Section(position=i,
+                            name=_get_section_name(section_soup),
+                            url=_make_url(section_soup),
+                            subsections=_make_subsections(section_soup))
+                    for i, section_soup in enumerate(sections_soup, 1)]
+
+        # Filter out those sections for which name could not be parsed
+        sections = [section for section in sections
+                    if section.name]
+
+        return sections
+

 def get_page_extractor(url):
     """
@@ -419,10 +596,13 @@
         return NewEdXPageExtractor()
     elif (
         url.startswith('https://edge.edx.org') or
-        url.startswith('https://lagunita.stanford.edu') or
-        url.startswith('https://www.fun-mooc.fr')
+        url.startswith('https://lagunita.stanford.edu')
     ):
         return NewEdXPageExtractor()
+    elif (
+        url.startswith('https://www.fun-mooc.fr')
+    ):
+        return FunMoocPageExtractor()
     else:
         return ClassicEdXPageExtractor()
diff --git a/edx_dl/utils.py b/edx_dl/utils.py
index 0ec44718..e04e8470 100644
--- a/edx_dl/utils.py
+++ b/edx_dl/utils.py
@@ -56,6 +56,13 @@ def get_page_contents(url, headers):
     request, we use the headers given in the dictionary in headers.
     """
     result = urlopen(Request(url, None, headers))
+
+    # get the final redirection
+    url = result.geturl()
+
+    if url:
+        result = urlopen(Request(url, None, headers))
+
     try:
         # for python3
         charset = result.headers.get_content_charset(failobj="utf-8")
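Before the follow-up patch below, the quality selection in FunMoocPageExtractor.extract_mp4_urls is easier to follow in a condensed form. This sketch keeps the rule (map the requested quality to FUN's HD/SD/LD/UL file names, take an exact match per base URL, otherwise fall back to a lower quality) but simplifies the bookkeeping of the original; the URLs are hypothetical:

    QUALITY_NAMES = {'1080p': 'HD', '720p': 'SD', '480p': 'LD', '240p': 'UL'}
    QUALITY_RANK = {'HD': 1, 'SD': 2, 'LD': 3, 'UL': 4}

    def pick_mp4(mp4_urls, quality):
        wanted = QUALITY_NAMES.get(quality)
        if wanted is None:
            return mp4_urls
        by_base = {}
        for url in mp4_urls:
            base, name = url.rsplit('/', 1)
            by_base.setdefault(base, []).append(name.rsplit('.')[0])
        picked = []
        for base, names in by_base.items():
            if wanted in names:
                picked.append(base + '/' + wanted + '.mp4')
                continue
            # fall back to the best quality below the requested one
            lower = [n for n in names
                     if n in QUALITY_RANK and QUALITY_RANK[n] > QUALITY_RANK[wanted]]
            chosen = min(lower, key=QUALITY_RANK.get) if lower else names[0]
            picked.append(base + '/' + chosen + '.mp4')
        return picked

    # Hypothetical URLs, for illustration only:
    urls = ['https://example.test/video/HD.mp4', 'https://example.test/video/LD.mp4']
    assert pick_mp4(urls, '720p') == ['https://example.test/video/LD.mp4']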
From e71a6dc98ef5b23ea8d0b1342143d675248e9d5c Mon Sep 17 00:00:00 2001
From: Florent VIOLLEAU
Date: Mon, 20 Apr 2020 02:57:46 +0200
Subject: [PATCH 2/2] Remove logs, fix issues for non-Fun edX and remove unneeded checks

---
 edx_dl/edx_dl.py  | 18 ++++++------------
 edx_dl/parsing.py | 18 +++++++++---------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
index 019e7ec7..98c74b06 100644
--- a/edx_dl/edx_dl.py
+++ b/edx_dl/edx_dl.py
@@ -178,7 +178,7 @@ def _get_initial_token(url, isSpecialLogin):
     for cookie in cookiejar:
         if cookie.name == 'csrftoken':
             if isSpecialLogin:
-                logging.info('Found first CSRF token: %s.', cookie.value)
+                logging.info('Found first CSRF token')
             else:
                 logging.info('Found CSRF token.')
             return cookie.value
@@ -241,12 +241,9 @@ def edx_login(url, headers, username, password, isSpecialLogin):
         for cookieItem in cookieItems:
             cookieEntry = cookieItem.split('=')
             if cookieEntry[0] == 'csrftoken' and cookieEntry[1]:
-                logging.info('Found new CSRF token %s.', cookieEntry[1])
+                logging.info('Found new CSRF token')
                 csrfToken = cookieEntry[1]
                 break
-        if csrfToken == '':
-            logging.error('Unable to find any CSRF token')
-            exit(ExitCode.MISSING_CSRF_TOKEN)

     except HTTPError as e:
         logging.info('Error, cannot login: %s', e)
         return {'success': False}
@@ -775,7 +772,7 @@ def download_youtube_url(url, filename, headers, args):
     """
     Downloads a youtube URL and applies the filters from args
     """
-    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format %s.', url, args.quality, args.format)
+    logging.info('Downloading video with URL %s from YouTube with quality %s. If quality is not found, take the best of the format selected.', url, args.quality)
     quality = re.sub('\D', '', args.quality)
     if quality:
         video_format_option = '(' + args.format + ')[height=' + quality + ']' + '/' + args.format + '/(mp4)[height=' + quality + ']/mp4' if args.format else '(mp4)[height=' + quality + ']/mp4'
     else:
         video_format_option = args.format + '/mp4' if args.format else 'mp4'
@@ -1048,12 +1045,9 @@ def main():
     if not resp.get('success', False):
         logging.error(resp.get('value', "Wrong Email or Password."))
         exit(ExitCode.WRONG_EMAIL_OR_PASSWORD)
-    if not resp.get('csrfToken', False):
-        logging.error('Unable to find any CSRF token')
-        exit(ExitCode.MISSING_CSRF_TOKEN)
-
-    # Set new header especially csrftoken
-    headers = edx_get_headers(resp.get('csrfToken'))
+    if resp.get('csrfToken', False):
+        # Set new header especially csrftoken
+        headers = edx_get_headers(resp.get('csrfToken'))

     # Parse and select the available courses
     courses = get_courses_info(DASHBOARD, headers)
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
index 9c847211..26f03597 100644
--- a/edx_dl/parsing.py
+++ b/edx_dl/parsing.py
@@ -60,7 +60,7 @@ class PageExtractor(object):
     >>> ...
     """

-    def extract_units_from_html(self, page, BASE_URL, file_formats):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
         """
         Method to extract the resources (units) from the given page
         """
@@ -81,7 +81,7 @@ def extract_courses_from_html(self, page, BASE_URL):


 class ClassicEdXPageExtractor(PageExtractor):

-    def extract_units_from_html(self, page, BASE_URL, file_formats):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
         """
         Extract Units from the html of a subsection webpage as a list of
         resources
@@ -94,18 +94,18 @@
         units = []

         for unit_html in re_units.findall(page):
-            unit = self.extract_unit(unit_html, BASE_URL, file_formats)
+            unit = self.extract_unit(unit_html, BASE_URL, file_formats, quality)
             if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
                 units.append(unit)
         return units

-    def extract_unit(self, text, BASE_URL, file_formats):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         """
         Parses the <div> of each unit and extracts the urls of its resources
         """
         video_youtube_url = self.extract_video_youtube_url(text)
         available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
-        mp4_urls = self.extract_mp4_urls(text)
+        mp4_urls = self.extract_mp4_urls(text, quality)
         videos = [Video(video_youtube_url=video_youtube_url,
                         available_subs_url=available_subs_url,
                         sub_template_url=sub_template_url,
@@ -152,7 +152,7 @@

         return available_subs_url, sub_template_url

-    def extract_mp4_urls(self, text):
+    def extract_mp4_urls(self, text, quality = None):
         """
         Looks for available links to the mp4 version of the videos
         """
@@ -283,7 +283,7 @@ class CurrentEdXPageExtractor(ClassicEdXPageExtractor):
     """
     A new page extractor for the recent changes in layout of edx
     """
-    def extract_unit(self, text, BASE_URL, file_formats):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         re_metadata = re.compile(r'data-metadata=&#39;(.*?)&#39;')
         videos = []
         match_metadatas = re_metadata.findall(text)
@@ -411,7 +411,7 @@ class FunMoocPageExtractor(CurrentEdXPageExtractor):
     """
     A new page extractor for the latest changes of Fun Mooc
     """
-    def extract_units_from_html(self, page, BASE_URL, file_formats, quality):
+    def extract_units_from_html(self, page, BASE_URL, file_formats, quality = None):
         """
         Extract Units from the html of a subsection webpage as a list of
         resources
@@ -507,7 +507,7 @@ def extract_mp4_urls(self, text, quality = None):

         return new_mp4_urls

-    def extract_unit(self, text, BASE_URL, file_formats, quality):
+    def extract_unit(self, text, BASE_URL, file_formats, quality = None):
         """
         Parses the <div> of each unit and extracts the urls of its resources
         """
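Taken together, the two patches turn the FUN login into a two-step CSRF exchange: fetch the site root so the server sets an initial csrftoken cookie, post the credentials with that token in X-CSRFToken, then keep the rotated csrftoken returned via Set-Cookie for subsequent requests. A standalone sketch of that flow using only the standard library ('/login_ajax' mirrors LOGIN_API above; the base URL and payload are illustrative and error handling is omitted):

    import json
    from http.cookiejar import CookieJar
    from urllib.parse import urlencode
    from urllib.request import HTTPCookieProcessor, Request, build_opener

    def login(base_url, username, password):
        jar = CookieJar()
        opener = build_opener(HTTPCookieProcessor(jar))

        # Step 1: GET the homepage; the server answers with a csrftoken cookie.
        opener.open(base_url)
        token = next((c.value for c in jar if c.name == 'csrftoken'), None)

        # Step 2: POST the credentials with the token; the jar transparently
        # stores the rotated csrftoken from Set-Cookie for later requests.
        data = urlencode({'email': username, 'password': password}).encode('utf-8')
        response = opener.open(Request(base_url + '/login_ajax', data,
                                       {'X-CSRFToken': token,
                                        'X-Requested-With': 'XMLHttpRequest'}))
        resp = json.loads(response.read().decode('utf-8'))
        new_token = next((c.value for c in jar if c.name == 'csrftoken'), None)
        return resp, new_token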