Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

theilsen: code optimization(refer issue #52) #63

Open
wants to merge 6 commits into
base: master
Choose a base branch
from

Conversation

Tanvi-Jain01
Copy link

@nipunbatra , @patel-zeel
This PR proposes a solution for issue #52.

BEFORE:

CODE:

vayu/vayu/TheilSen.py

Lines 1 to 113 in ef99aef

def TheilSen(df, pollutant):
"""Connected scatter plot.
Plots a connected scatter plot of the average value of
the pollutant every month of every year. Then plots a
line of best fit through the plot showing the user
the overall trend of the pollutant through the years.
Parameters
----------
df: data frame
minimally containing date and at least one other
pollutant
pollutant: type string
A pollutant name corresponding to
a variable in a data frame, ex: 'pm25'
"""
# NOTE(review): this listing has lost its indentation, so the nesting of the
# loops below cannot be read off the text; comments describe each statement
# on its own.
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from numpy import array
import seaborn as sns
import scipy
from scipy import stats
import math
# =============================================================================
# df = pd.read_csv("mydata.csv")
# =============================================================================
# Assigns the parsed "date" column as the index of the frame passed in.
df.index = pd.to_datetime(df.date)
unique_years = np.unique(df.index.year)
# df = df[pd.notnull(df[pollutant])]
# Build the list of distinct years as strings.
i = 0
year = []
while i < len(unique_years):
year.append(str(unique_years[i]))
i = i + 1
num_unique_years = len(year)
# df = df.drop("date", axis=1)
# print(df)
i = 0
x = 0
j = 0
# var2 accumulates one mean value of the pollutant per (year, month) visited.
var2 = []
while i < num_unique_years:
# Select by the year string, average per day, then forward-fill missing days.
# NOTE(review): presumably relies on partial-string row selection of the
# DatetimeIndex via df[...] — verify against the pandas version in use.
df_new = df[year[j]].resample("1D").mean()
df_new = df_new.fillna(method="ffill")
df_new["month"] = df_new.index.month
# df_new['day']=df_new.index.dayofweek
# df_new['hour']=df_new.index.hour
i = i + 1
j = j + 1
x = 0
# NOTE(review): x runs over 0..11 while index.month is expected to be 1..12,
# so x == 0 looks like it matches no rows and month 12 is never visited —
# confirm.
while x < 12:
a = df_new[df_new.month == x]
mean_var2 = a[pollutant].mean()
var2.append(mean_var2)
x = x + 1
# Replace each missing monthly mean by the average of its two neighbours.
# NOTE(review): at i == 0 the index i - 1 refers to the last element, and at
# the last position i + 1 is out of range — confirm this cannot occur.
i = 0
while i < len(var2):
if pd.notnull(var2[i]) == False:
var2[i] = (var2[i - 1] + var2[i + 1]) / 2
i = i + 1
# x coordinates: year offset t (0-based) plus the month fraction r / 12.
scatterX = []
t = 0
while t < num_unique_years:
r = 0
while r < 12:
scatterX.append(t + (r / 12))
r = r + 1
t = t + 1
y = var2
x = scatterX
# Closed-form least-squares fit returning intercept a and slope b.
def best_fit(X, Y):
xbar = sum(X) / len(X)
ybar = sum(Y) / len(Y)
n = len(X) # or len(Y)
numer = sum([xi * yi for xi, yi in zip(X, Y)]) - n * xbar * ybar
denum = sum([xi ** 2 for xi in X]) - n * xbar ** 2
b = numer / denum
a = ybar - b * xbar
# print('best fit line:\ny = {:.2f} + {:.2f}x'.format(a, b))
return a, b
# NOTE: a and b are not used below; the red line is drawn from np.polyfit.
a, b = best_fit(x, y)
fig = plt.figure()
ax = fig.add_subplot(111)
# print(len(x))
ax.plot(x, y, "-o")
ax.set_xlabel("Year")
ax.set_ylabel(pollutant)
ax.set_title("TheilSen plot")
# Degree-1 polynomial fit evaluated at the distinct x values, drawn in red.
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), color="red")
plt.show()
# =============================================================================
# df = pd.read_csv("mydata.csv")
# TheilSen(df, 'o3')

OUTPUT:
theilsen

AFTER:

CODE:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.ticker import ScalarFormatter

def theilsen(df: pd.DataFrame, pollutant: str) -> None:
    """Plot monthly means of a pollutant together with a linear trend line.

    Draws a connected scatter plot of the mean value of ``pollutant`` for
    every calendar month present in ``df`` and overlays a straight trend
    line in red. The figure is saved to ``TheilSenplot.png`` and then shown.

    Note: despite the function name, the trend line is a degree-1
    least-squares fit (``np.polyfit``), not a Theil-Sen estimator.

    Parameters
    ----------
    df: data frame
        minimally containing a ``date`` column and at least one
        pollutant column. The frame is not modified.
    pollutant: type string
        A pollutant name corresponding to
        a variable in a data frame, ex: 'pm25'

    Raises
    ------
    KeyError
        If ``pollutant`` is not a column of ``df``.
    ValueError
        If no month has a non-missing mean value for ``pollutant``.
    """
    if pollutant not in df.columns:
        raise KeyError(f"column {pollutant!r} not found in data frame")

    # Work on derived arrays so the caller's frame (and its index) stay
    # untouched.
    stamps = pd.DatetimeIndex(pd.to_datetime(df.date))

    # One integer per calendar month: year * 12 + zero-based month. Dividing
    # by 12 later yields year + (month - 1) / 12, so each point sits at its
    # true month even when a year's data does not start in January.
    month_ordinal = (stamps.year * 12 + (stamps.month - 1)).to_numpy()

    # Average only the requested column; months without any valid reading
    # are dropped so they cannot poison the fit with NaN.
    monthly = (
        pd.Series(df[pollutant].to_numpy())
        .groupby(month_ordinal)
        .mean()
        .dropna()
    )
    if monthly.empty:
        raise ValueError(f"no non-missing monthly means for {pollutant!r}")

    x = monthly.index.to_numpy(dtype=float) / 12
    y = monthly.to_numpy(dtype=float)

    fig, ax = plt.subplots()
    ax.plot(x, y, "-o")
    ax.set_xlabel("Year")
    ax.set_ylabel(pollutant)
    ax.set_title("TheilSen plot")

    # A straight line needs at least two points; with a single month only
    # the marker is drawn.
    if x.size >= 2:
        slope, intercept = np.polyfit(x, y, 1)
        ax.plot(x, slope * x + intercept, color="red")

    # Show full year values on the x-axis instead of an offset notation.
    ax.xaxis.set_major_formatter(ScalarFormatter(useOffset=False))
    fig.savefig("TheilSenplot.png", bbox_inches="tight")
    print("Your plot has also been saved")

    plt.show()


============================================================================

USAGE:

# Load the data; theilsen reads the frame's "date" column and the named
# pollutant column.
df = pd.read_csv("mydata.csv")
# Plot the monthly means of the 'pm25' column.
theilsen(df, 'pm25')

OUTPUT:
TheilSenplot

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant