Skip to content

Commit

Permalink
wip: Cenários de BDD usando o modulo behave
Browse files Browse the repository at this point in the history
  • Loading branch information
joao-parana committed Jul 28, 2023
1 parent c3b9972 commit 11fcc9c
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"--no-capture",
"--no-capture-stderr",
"--no-skipped",
"${file}"
"features"
],
"env": {
"WORKSPACE_DIR": "/Volumes/dev/t8s"
Expand Down
Binary file added data/parquet/ts_long_01.parquet
Binary file not shown.
20 changes: 13 additions & 7 deletions features/02.convert_timeserie.feature
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ Value Statement:
Then I have a time series with 3 columns and the `correct` number of rows
And I have a text representation for the time serie like this below
"""
TimeSerie(format=wide, features=3, df=
timestamp temperatura velocidade
0 2022-01-01 00:00:00 25.000000 3000.0
1 2022-01-01 01:00:00 26.000000 1100.0
2 2022-01-01 02:00:00 27.000000 1200.0
3 2022-01-01 03:00:00 23.200001 4000.0) +
types: [<class 'pandas._libs.tslibs.timestamps.Timestamp'>, <class 'numpy.float32'>, <class 'numpy.float32'>]
TimeSerie(format=long, features=3, df=
timestamp ds value
0 2022-01-01 00:00:00 temperatura 25.000000
4 2022-01-01 00:00:00 velocidade 3000.000000
1 2022-01-01 01:00:00 temperatura 26.000000
5 2022-01-01 01:00:00 velocidade 1100.000000
2 2022-01-01 02:00:00 temperatura 27.000000
6 2022-01-01 02:00:00 velocidade 1200.000000
3 2022-01-01 03:00:00 temperatura 23.200001
7 2022-01-01 03:00:00 velocidade 4000.000000) +
types: [<class 'pandas._libs.tslibs.timestamps.Timestamp'>, <class 'str'>, <class 'numpy.float32'>]
"""
And can I save this long format time series to a parquet file in the T8S_WORKSPACE_DIR/data/parquet directory

# Constraint: The Dataframe doesn't have invalid values
17 changes: 17 additions & 0 deletions features/03.split_join_timeserie.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Feature: Convert a multivariate Timeseries to list of univariate Timeseries and vice versa

Value Statement:
As a data analyst
I want the ability to convert between Timeseries types ['univariate', 'multivariate'] for use in different situations
So I can start analyzing the data right away and come up with solutions for the business.

Background:
Given that I have a T8S_WORKSPACE_DIR and a long format time series persisted to a Parquet file

Scenario: Conversion of Timeseries types ['univariate', 'multivariate'] for use in different situations
Given that I create a Timeseries using the selected parquet file in the T8S_WORKSPACE/data/parquet directory
When I convert Timeseries from long format to wide format
Then I convert the Timeseries from multivariate to a list of univariate Timeseries
And I convert the list of univariate Timeseries into a single multivariate Timeseries
And I check the result.
# Constraint: The Timeseries has no invalid values
26 changes: 23 additions & 3 deletions features/steps/02.convert_timeserie.feature_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,34 @@ def create_time_serie(context):

@when('I convert the time series from the original wide format to long format')
def convert_time_serie_from_wide_to_long_format(context):
logger.info(f'context.ts1 BEFORE -> \n{str(context.ts1)}')
assert context.ts1 is not None, 'context.ts1 is None'
assert context.ts1.format == 'wide', 'context.ts1.format is not wide'
context.ts1 = context.ts1.to_long()
context.ts1.to_long()
logger.info(f'context.ts1 AFTER -> \n{str(context.ts1)}')

@then('I have a time series with 3 columns and the `correct` number of rows')
def check_time_serie(context):
pass
logger.info(f'context.ts1 AFTER -> \n{str(context.ts1)}')
assert context.ts1 is not None, 'context.ts1 is None'
assert context.ts1.format == 'long', 'context.ts1.format is not long'
assert int(context.ts1.features) == 3, 'context.ts1.features is not 3'
assert len(context.ts1.df) == 8, 'len(context.ts1.df) is not 8'
assert context.ts1.is_multivariate() == True, 'context.ts1.is_multivariate() is not True'

@then('I have a text representation for the time serie like this below')
def check_time_serie_text_representation(context):
pass
logger.info(f'context.ts1 AFTER -> \n{str(context.ts1)}')

@then('can I save this long format time series to a parquet file in the T8S_WORKSPACE_DIR/data/parquet directory')
def save_time_serie_to_parquet(context):
    """Write ``context.ts1`` to ``ts_long_01.parquet`` under ``context.PARQUET_PATH``.

    The current time series is logged first; after the write, the helper
    ``context.list_files`` is called with this step's name as prefix.
    """
    logger.info(f'context.ts1 AFTER -> \n{context.ts1!s}')

    # Save time series ts1 as parquet. The Util class is used for the write
    # because of 'circular import' problems (note kept from the original).
    target_file = Path('/'.join((str(context.PARQUET_PATH), 'ts_long_01.parquet')))
    Util.to_parquet(context.ts1, target_file)

    context.list_files('save_time_serie_to_parquet: ', context)
90 changes: 90 additions & 0 deletions features/steps/03.split_join_timeserie.feature_steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
from t8s import get_sample_df
from t8s.log_config import LogConfig
from t8s.util import Util
from t8s.io import IO
from t8s.ts import TimeSerie
from t8s.ts_writer import TSWriter, WriteParquetFile
from t8s.ts_builder import TSBuilder
from t8s.ts_builder import ReadParquetFile
from behave import given, when, then, use_step_matcher, step
from behave.model import Table
from behave_pandas import table_to_dataframe, dataframe_to_table
from logging import INFO, DEBUG, WARNING, ERROR, CRITICAL

# Set up logging through the project's LogConfig helper, passing the DEBUG
# level, then fetch the logger object used by every step in this module.
LogConfig().initialize_logger(DEBUG)
logger = LogConfig().getLogger()

"""
Feature: Convert a multivariate Timeseries to list of univariate Timeseries and vice versa
Value Statement:
As a data analyst
I want the ability to convert between Timeseries types ['univariate', 'multivariate'] for use in different situations
So I can start analyzing the data right away and come up with solutions for the business.
Background:
Given that I have a T8S_WORKSPACE_DIR and a long format time series persisted to a Parquet file
"""

@given(u'that I have a T8S_WORKSPACE_DIR and a long format time series persisted to a Parquet file')
def background(context):
    """Background step: ensure sample data exists, then log the workspace paths.

    When ``context.status`` reports an empty data directory, the sample time
    series is created and saved via ``context.create_sample_ts_and_save_as_parquet``
    and the status is updated. The workspace, CSV and Parquet paths stored on
    ``context`` are then logged and ``context.list_files`` is called.
    Data is handed to the following steps through the ``context`` object.
    """
    separator = '-------------------------------------------------'
    logger.info(u'STEP: Given that I have a T8S_WORKSPACE_DIR and a long format time series persisted to a Parquet file')

    data_dir_is_empty = context.status == 'data directory empty'
    if data_dir_is_empty:
        context.create_sample_ts_and_save_as_parquet(context)
        context.status = 'data directory with parquet file'

    # The before_feature() hook in features/environment.py updates the context
    # (note kept from the original).
    logger.info(separator)
    logged_attributes = (
        ('Background @given', 'T8S_WORKSPACE_DIR'),
        ('Background@given', 'CSV_PATH'),
        ('Background@given', 'PARQUET_PATH'),
    )
    for prefix, attribute in logged_attributes:
        logger.info(f'{prefix}: {attribute} = {getattr(context, attribute)}')
    context.list_files('Background@given: ', context)
    logger.info(separator)

"""
Scenario: Conversion of Timeseries types ['univariate', 'multivariate'] for use in different situations
Given that I create a Timeseries using the selected parquet file in the T8S_WORKSPACE/data/parquet directory
When I convert Timeseries from long format to wide format
Then I convert the Timeseries from multivariate to a list of univariate Timeseries
And I convert the list of univariate Timeseries into a single multivariate Timeseries
And I check the result.
# Constraint: The Timeseries has no invalid values
"""

@given(u'that I create a Timeseries using the selected parquet file in the T8S_WORKSPACE/data/parquet directory')
def create_time_serie(context):
    """Build a TimeSerie from ``ts_long_01.parquet`` and store it on ``context.ts1``.

    The file is looked up under ``context.PARQUET_PATH`` and read through a
    ``TSBuilder`` configured with the ``ReadParquetFile`` strategy.

    Raises:
        AssertionError: if the loaded series does not report 3 features,
            the 'long' format, or a DataFrame of 8 rows.
    """
    # Build the path once with pathlib instead of concatenating strings.
    parquet_file = Path(str(context.PARQUET_PATH)) / 'ts_long_01.parquet'
    logger.debug(f'path: {parquet_file}')

    builder = TSBuilder(ReadParquetFile())
    logger.debug("Client: Strategy is set to read Parquet file.")
    ts1: TimeSerie = builder.build_from_file(parquet_file)

    # Failure messages follow the style of the checks in the 02 step file.
    assert int(ts1.features) == 3, 'ts1.features is not 3'
    assert ts1.format == 'long', 'ts1.format is not long'
    assert len(ts1.df) == 8, 'len(ts1.df) is not 8'
    context.ts1 = ts1

@when(u'I convert Timeseries from long format to wide format')
def convert_time_serie_from_long_to_wide_format(context):
    """Placeholder step: it only logs ``context.ts1``; no conversion is done here yet."""
    current_ts = context.ts1
    logger.info(f'context.ts1 BEFORE -> \n{current_ts!s}')

@then(u'I convert the Timeseries from multivariate to a list of univariate Timeseries')
def convert_time_serie_from_multivariate_to_list_of_univariate(context):
    """Placeholder step: it only logs ``context.ts1``; no split is done here yet."""
    current_ts = context.ts1
    logger.info(f'context.ts1 BEFORE -> \n{current_ts!s}')

@then(u'I convert the list of univariate Timeseries into a single multivariate Timeseries')
def convert_list_of_univariate_to_multivariate(context):
    """Placeholder step: it only logs ``context.ts1``; no join is done here yet."""
    current_ts = context.ts1
    logger.info(f'context.ts1 BEFORE -> \n{current_ts!s}')

@then(u'I check the result.')
def check_result(context):
    """Placeholder step: it only logs ``context.ts1``; nothing is asserted here yet."""
    current_ts = context.ts1
    logger.info(f'context.ts1 BEFORE -> \n{current_ts!s}')
12 changes: 6 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@

# Grava a série temporal em parquet
logger.debug(f'Grava a série temporal (formato {ts.format}) em um arquivo parquet {path}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_str), ts)
ctx.write(Path(path_str), ts)
logger.debug(f'\nArquivo {str(path)} gerado à partir da TimeSerie fornecida')

# --------------------------------------------------------------------------------
Expand All @@ -65,14 +65,14 @@
# to make the right choice.
assert isinstance(path, Path), "path must be a Path object"
if (str(path)).endswith('.parquet'):
context = TSBuilder(ReadParquetFile())
ctx = TSBuilder(ReadParquetFile())
logger.debug("Client: ReadStrategy is set to read Parquet file.")
ts = context.build_from_file(Path(path_str))
ts = ctx.build_from_file(Path(path_str))
else:
assert str(path).endswith('.csv'), "If path is not a Parquet file the path must be a CSV file"
logger.debug("Client: ReadStrategy is set to read CSV file.")
context = TSBuilder(ReadCsvFile())
ts = context.build_from_file(Path(path_str))
ctx = TSBuilder(ReadCsvFile())
ts = ctx.build_from_file(Path(path_str))

assert int(ts.features) == 3
assert ts.format == 'wide'
Expand Down
12 changes: 6 additions & 6 deletions smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@

# Grava a série temporal em parquet
logger.debug(f'Grava a série temporal (formato {ts.format}) em um arquivo parquet {path}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_str), ts)
ctx.write(Path(path_str), ts)
logger.debug(f'\nArquivo {str(path)} gerado à partir da TimeSerie fornecida')

# --------------------------------------------------------------------------------
Expand All @@ -63,14 +63,14 @@
# to make the right choice.
assert isinstance(path, Path), "path must be a Path object"
if (str(path)).endswith('.parquet'):
context = TSBuilder(ReadParquetFile())
ctx = TSBuilder(ReadParquetFile())
logger.debug("Client: ReadStrategy is set to read Parquet file.")
ts = context.build_from_file(Path(path_str))
ts = ctx.build_from_file(Path(path_str))
else:
assert str(path).endswith('.csv'), "If path is not a Parquet file the path must be a CSV file"
logger.debug("Client: ReadStrategy is set to read CSV file.")
context = TSBuilder(ReadCsvFile())
ts = context.build_from_file(Path(path_str))
ctx = TSBuilder(ReadCsvFile())
ts = ctx.build_from_file(Path(path_str))

assert int(ts.features) == 3
assert ts.format == 'wide'
Expand Down
22 changes: 18 additions & 4 deletions src/t8s/ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,28 +165,41 @@ def to_long(self):
# Em algumas situações `ds` pode ser o id do par `datasource/indicator`.
first_column_name = self.df.columns[0]
df_long_format = pd.melt(self.df, id_vars=[first_column_name], var_name='ds', value_name='value')
# Ordena o DataFrame pela coluna 'timestamp' em ordem crescente
df_long_format.sort_values(by=['timestamp', 'ds'], inplace=True)
print(df_long_format)
self.df = df_long_format
self.format = 'long'
self.columns = ['timestamp', 'ds', 'value']

def to_wide(self):
# Converte a série temporal para o formato Wide
# Implementação aqui
raise NotImplementedError('Not implemented for long format')

def is_univariate(self):
def is_univariate(self) -> bool:
# Verifica se a série temporal é univariada
if self.format == 'long':
raise NotImplementedError('Not implemented for long format')
# Obtém os valores distintos da coluna 'ds'
distinct_ds_values = self.df['ds'].unique()
# Se a quantidade de valores distintos for 1, então a série
# temporal é univariada
return distinct_ds_values.size == 1
return self.df.columns.size == 2

def is_multivariate(self):
# Verifica se a série temporal é multivariada
if self.format == 'long':
raise NotImplementedError('Not implemented for long format')
# Obtém os valores distintos da coluna 'ds'
distinct_ds_values = self.df['ds'].unique()
# Se a quantidade de valores distintos for maior que 1, então a série
# temporal é multivariada
return distinct_ds_values.size > 1
return self.df.columns.size > 2

def split(self) -> list[TimeSerie]: # Alternativa: list['TimeSerie']
# TODO: garantir que a primeira coluna seja o indice no Dataframe quando o formato for long ou wide
# TODO: garantir que a primeira coluna seja do tipo Timesamp (datetime) quando o formato for long ou wide
# TODO: garantir que a primeira coluna seja do tipo Timestamp (datetime) quando o formato for long ou wide
# Cria várias séries temporais univariadas à partir de uma série temporal multivariada
result = []
if self.format == 'long':
Expand All @@ -208,6 +221,7 @@ def split(self) -> list[TimeSerie]: # Alternativa: list['TimeSerie']
raise Exception('Formato de série temporal não suportado')

msg = 'O método split deve retornar uma lista de objetos TimeSerie'
assert isinstance(result, list), msg
for ts in result:
assert isinstance(ts, TimeSerie), msg

Expand Down
4 changes: 2 additions & 2 deletions src/t8s/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ class Util:
def to_parquet(ts: TimeSerie, path_ts: Path):
# def write_ts_to_parquet_file(ts, parquet_path, filename: str):
logger.debug(f'Grava a série temporal (formato {ts.format}) em um arquivo parquet {path_ts}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_ts), ts)
ctx.write(Path(path_ts), ts)
logger.debug(f'\nArquivo {str(path_ts)} gerado à partir da TimeSerie fornecida')

@staticmethod
Expand Down
8 changes: 4 additions & 4 deletions tests/data_seed.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@
path_str: str = 'data/parquet/ts_01.parquet'
path = Path(path_str)
logger.debug(f'Grava a série temporal (formato {ts1.format}) em um arquivo parquet {path}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_str), ts1)
ctx.write(Path(path_str), ts1)
# ---------------------------------------------------------------------------------------------
# Outro caso de uso
number_of_records = 4
Expand Down Expand Up @@ -62,6 +62,6 @@
path_str: str = 'data/parquet/ts_02.parquet'
path = Path(path_str)
logger.debug(f'Grava a série temporal (formato {ts2.format}) em um arquivo parquet {path}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_str), ts2)
ctx.write(Path(path_str), ts2)
4 changes: 2 additions & 2 deletions tests/test_build_from_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ def test_build_from_file():
path_str: str = 'data/parquet/ts_01.parquet'
path = Path(path_str)
print('path: ', path)
context = TSBuilder(ReadParquetFile())
ctx = TSBuilder(ReadParquetFile())
print("Client: Strategy is set to read Parquet file.")
ts: TimeSerie = context.build_from_file(Path(path_str))
ts: TimeSerie = ctx.build_from_file(Path(path_str))
assert int(ts.features) == 3
assert ts.format == 'wide'
assert ts.df.__len__() == 4
Expand Down
4 changes: 2 additions & 2 deletions tests/test_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ def test_to_parquet():
path_str: str = 'data/parquet/ts_01.parquet'
path = Path(path_str)
logger.debug(f'Grava a série temporal (formato {ts.format}) em um arquivo parquet {path}')
context = TSWriter(WriteParquetFile())
ctx = TSWriter(WriteParquetFile())
logger.debug("Client: Strategy was seted to write Parquet file.")
context.write(Path(path_str), ts)
ctx.write(Path(path_str), ts)
logger.debug(f'\nArquivo {str(path)} gerado à partir da TimeSerie fornecida')
check_schema(ts, path, [datetime, np.float32, np.int32])
logger.info('FIM')
Expand Down

0 comments on commit 11fcc9c

Please sign in to comment.