[PRED-3419] Remove redundant check for encoding (#169)

Remove the encoding check. The file is opened in text mode with the given encoding later, so this check is redundant.
datarobot · Dec 10, 2019 · 48bc821 · 48bc821
1 parent b8a6786
commit 48bc821
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 1 deletion.
diff --git a/datarobot_batch_scoring/reader.py b/datarobot_batch_scoring/reader.py
@@ -458,7 +458,6 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
     else:
         ui.debug('investigate_encoding_and_dialect - skip encoding detect')
         encoding = encoding.lower()
-        sample[:1000].decode(encoding)  # Fail here if the encoding is invalid
 
     opener, mode = _get_opener_and_mode(is_gz, text=True)
     with opener(dataset, mode, encoding=encoding) as dfile:

diff --git a/tests/test_functional.py b/tests/test_functional.py
@@ -170,6 +170,46 @@ def test_simple_with_unicode(live_server, tmpdir, func_params, dataset_name):
     assert str(actual) == str(expected), expected
 
 
+def test_simple_with_wrong_encoding(live_server, tmpdir, func_params):
+    out = tmpdir.join('out.csv')
+    ui = PickableMock()
+    base_url = '{webhost}/predApi/v1.0/'.format(webhost=live_server.url())
+    with pytest.raises(UnicodeDecodeError) as execinfo:
+        run_batch_predictions(
+            base_url=base_url,
+            base_headers={},
+            user='username',
+            pwd='password',
+            api_token=None,
+            create_api_token=False,
+            deployment_id=func_params['deployment_id'],
+            pid=func_params['pid'],
+            lid=func_params['lid'],
+            import_id=None,
+            n_retry=3,
+            concurrent=1,
+            resume=False,
+            n_samples=10,
+            out_file=str(out),
+            keep_cols=None,
+            delimiter=None,
+            dataset='tests/fixtures/jpReview_books_reg.csv',
+            pred_name=None,
+            pred_threshold_name=None,
+            pred_decision_name=None,
+            timeout=None,
+            ui=ui,
+            auto_sample=False,
+            fast_mode=False,
+            dry_run=False,
+            encoding='cp932',
+            skip_dialect=False
+        )
+
+    # Fixture dataset encoding 'utf-8' and we trying to decode it with 'cp932'
+    assert "'cp932' codec can't decode byte" in str(execinfo.value)
+
+
 def test_prediction_explanations(live_server, tmpdir):
     # train one model in project
     out = tmpdir.join('out.csv')