Commit: improvements readme

jeanCarloMachado committed Aug 26, 2023
1 parent 22c455d · commit e947b94

Showing 2 changed files with 10 additions and 24 deletions.
README.md: 22 changes (4 additions, 18 deletions)

@@ -61,26 +61,12 @@ ddataflow = DDataflow(**config)
 ## 3. Use ddataflow in a pipeline
 
 ```py
-# filename: pipeline.py
-from pyspark.sql import SparkSession
-from ddataflow_config import ddataflow
 
-spark = SparkSession.builder.getOrCreate()
-
-# register the tables to mimic a real environment
-# when you use ddataflow for real, you will already have your production tables in place
-spark.read.parquet("/tmp/demo_locations.parquet").registerTempTable("demo_locations")
-spark.read.parquet("/tmp/demo_tours.parquet").registerTempTable("demo_tours")
 
-# pyspark code using a different source name
-total_locations = spark.table(ddataflow.name('demo_locations')).count()
-# sql code also works
-total_tours = spark.sql(f""" SELECT COUNT(1) from {ddataflow.name('demo_tours')}""").collect()[0]['count(1)']
-print("Totals follow below:")
-print({
-    "total_locations": total_locations,
-    "total_tours": total_tours,
-})
+# replacing spark.table with ddataflow.source returns a Spark dataframe (sampled when enabled)
+print(ddataflow.source('demo_locations').count())
+# for SQL queries, replace only the table name with the sampled source name provided by ddataflow
+print(spark.sql(f""" SELECT COUNT(1) from {ddataflow.name('demo_tours')}""").collect()[0]['count(1)'])
 ```
 
 Now run it twice and observe the difference in the number of records:
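For reference, a runnable sketch of pipeline.py as it reads after this change, reassembled from the snippet above plus the setup lines the commit removed; the /tmp parquet paths assume the demo data created in the earlier setup step:

```py
# pipeline.py: sketch combining the removed setup with the new shortened usage
from pyspark.sql import SparkSession
from ddataflow_config import ddataflow

spark = SparkSession.builder.getOrCreate()

# register the demo tables to mimic a real environment; in a real project
# the production tables would already exist
spark.read.parquet("/tmp/demo_locations.parquet").registerTempTable("demo_locations")
spark.read.parquet("/tmp/demo_tours.parquet").registerTempTable("demo_tours")

# ddataflow.source() returns a Spark dataframe (sampled when ddataflow is enabled)
print(ddataflow.source('demo_locations').count())

# for SQL, swap only the table name via ddataflow.name()
total_tours = spark.sql(
    f"SELECT COUNT(1) AS total FROM {ddataflow.name('demo_tours')}"
).collect()[0]['total']
print(total_tours)
```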
examples/ddataflow_config.py: 12 changes (6 additions, 6 deletions)

@@ -4,16 +4,16 @@
     # add here your tables or paths with customized sampling logic
     "data_sources": {
         "demo_tours": {
-            "source": lambda spark: spark.table("demo_tours"),
-            "filter": lambda df: df.limit(500),
-        },
+            "source": lambda spark: spark.table('demo_tours'),
+            "filter": lambda df: df.limit(500)
+        }
         "demo_locations": {
-            "source": lambda spark: spark.table("demo_locations"),
+            "source": lambda spark: spark.table('demo_locations'),
             "default_sampling": True,
         },
-    }
+    },
     "project_folder_name": "ddataflow_demo",
 }
 
 # initialize the application and validate the configuration
-ddataflow = DDataflow(**config)
+ddataflow = DDataflow(**config)
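
The README's "run it twice" comparison can also be scripted. A minimal sketch, assuming DDataflow is toggled through the ENABLE_DDATAFLOW environment variable; the variable name is an assumption here, so check the project docs:

```py
import os
import subprocess

# production-sized run: ddataflow disabled, the full tables are read
env = {k: v for k, v in os.environ.items() if k != "ENABLE_DDATAFLOW"}
subprocess.run(["python", "pipeline.py"], env=env, check=True)

# sampled run: ddataflow enabled, demo_tours capped at 500 rows by the filter above
env["ENABLE_DDATAFLOW"] = "True"  # assumed toggle name, per project docs
subprocess.run(["python", "pipeline.py"], env=env, check=True)
```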
