In [1]:
!pip install xlrd
Requirement already satisfied: xlrd in c:\users\tommy\anaconda3\lib\site-packages (2.0.1)
In [7]:
import pandas as pd
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
print(df_full.head())
print(df_full.tail())
# read the .xls file
try:
df_full = pd.read_excel(xls_path, sheet_name=0, engine=None)
df_head = df_full.head()
print(df_head.to_dict()) # Convert to dictionary for easier display of columns and data
except Exception as e:
print(f"An error occurred: {e}")
Back to Contents Data 1: U.S. Gasoline and Diesel Retail Prices \
0 Sourcekey EMM_EPM0_PTE_NUS_DPG
1 Date U.S. All Grades All Formulations Retail Gasoli...
2 1990-08-15 00:00:00 NaN
3 1990-09-15 00:00:00 NaN
4 1990-10-15 00:00:00 NaN
Unnamed: 2 \
0 EMM_EPM0U_PTE_NUS_DPG
1 U.S. All Grades Conventional Retail Gasoline P...
2 NaN
3 NaN
4 NaN
Unnamed: 3 \
0 EMM_EPM0R_PTE_NUS_DPG
1 U.S. All Grades Reformulated Retail Gasoline P...
2 NaN
3 NaN
4 NaN
Unnamed: 4 \
0 EMM_EPMR_PTE_NUS_DPG
1 U.S. Regular All Formulations Retail Gasoline ...
2 NaN
3 1.258
4 1.335
Unnamed: 5 \
0 EMM_EPMRU_PTE_NUS_DPG
1 U.S. Regular Conventional Retail Gasoline Pric...
2 NaN
3 1.258
4 1.335
Unnamed: 6 \
0 EMM_EPMRR_PTE_NUS_DPG
1 U.S. Regular Reformulated Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 7 \
0 EMM_EPMM_PTE_NUS_DPG
1 U.S. Midgrade All Formulations Retail Gasoline...
2 NaN
3 NaN
4 NaN
Unnamed: 8 \
0 EMM_EPMMU_PTE_NUS_DPG
1 U.S. Midgrade Conventional Retail Gasoline Pri...
2 NaN
3 NaN
4 NaN
Unnamed: 9 \
0 EMM_EPMMR_PTE_NUS_DPG
1 U.S. Midgrade Reformulated Retail Gasoline Pri...
2 NaN
3 NaN
4 NaN
Unnamed: 10 \
0 EMM_EPMP_PTE_NUS_DPG
1 U.S. Premium All Formulations Retail Gasoline ...
2 NaN
3 NaN
4 NaN
Unnamed: 11 \
0 EMM_EPMPU_PTE_NUS_DPG
1 U.S. Premium Conventional Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 12 \
0 EMM_EPMPR_PTE_NUS_DPG
1 U.S. Premium Reformulated Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 13 \
0 EMD_EPD2D_PTE_NUS_DPG
1 U.S. No 2 Diesel Retail Prices (Dollars per Ga...
2 NaN
3 NaN
4 NaN
Unnamed: 14 \
0 EMD_EPD2DXL0_PTE_NUS_DPG
1 U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) R...
2 NaN
3 NaN
4 NaN
Unnamed: 15
0 EMD_EPD2DM10_PTE_NUS_DPG
1 U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retai...
2 NaN
3 NaN
4 NaN
Back to Contents Data 1: U.S. Gasoline and Diesel Retail Prices \
401 2023-11-15 00:00:00 3.443
402 2023-12-15 00:00:00 3.257
403 2024-01-15 00:00:00 3.197
404 2024-02-15 00:00:00 3.328
405 2024-03-15 00:00:00 3.542
Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 \
401 3.261 3.802 3.318 3.172 3.634 3.88
402 3.104 3.558 3.134 3.014 3.393 3.685
403 3.045 3.495 3.075 2.957 3.331 3.619
404 3.188 3.605 3.212 3.102 3.446 3.729
405 3.402 3.815 3.426 3.318 3.657 3.938
Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13 \
401 3.612 4.275 4.235 3.984 4.533 4.254
402 3.457 4.014 4.038 3.836 4.273 3.972
403 3.391 3.945 3.969 3.764 4.206 3.854
404 3.511 4.042 4.077 3.888 4.296 4.044
405 3.719 4.252 4.286 4.098 4.506 4.022
Unnamed: 14 Unnamed: 15
401 4.254 NaN
402 3.972 NaN
403 3.854 NaN
404 4.044 NaN
405 4.022 NaN
{'Unnamed: 0': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}, 'Unnamed: 1': {0: 'Workbook Contents', 1: 'U.S. Gasoline and Diesel Retail Prices', 2: nan, 3: 'Click worksheet name or tab at bottom for data', 4: 'Worksheet Name'}, 'Unnamed: 2': {0: nan, 1: nan, 2: nan, 3: nan, 4: 'Description'}, 'Unnamed: 3': {0: nan, 1: nan, 2: nan, 3: nan, 4: '# Of Series'}, 'Unnamed: 4': {0: nan, 1: nan, 2: nan, 3: nan, 4: 'Frequency'}, 'Unnamed: 5': {0: nan, 1: nan, 2: nan, 3: nan, 4: 'Latest Data for'}, 'Unnamed: 6': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}}
In [6]:
print(df_full.head())
print(df_full.tail())
Back to Contents Data 1: U.S. Gasoline and Diesel Retail Prices \
0 Sourcekey EMM_EPM0_PTE_NUS_DPG
1 Date U.S. All Grades All Formulations Retail Gasoli...
2 1990-08-15 00:00:00 NaN
3 1990-09-15 00:00:00 NaN
4 1990-10-15 00:00:00 NaN
Unnamed: 2 \
0 EMM_EPM0U_PTE_NUS_DPG
1 U.S. All Grades Conventional Retail Gasoline P...
2 NaN
3 NaN
4 NaN
Unnamed: 3 \
0 EMM_EPM0R_PTE_NUS_DPG
1 U.S. All Grades Reformulated Retail Gasoline P...
2 NaN
3 NaN
4 NaN
Unnamed: 4 \
0 EMM_EPMR_PTE_NUS_DPG
1 U.S. Regular All Formulations Retail Gasoline ...
2 NaN
3 1.258
4 1.335
Unnamed: 5 \
0 EMM_EPMRU_PTE_NUS_DPG
1 U.S. Regular Conventional Retail Gasoline Pric...
2 NaN
3 1.258
4 1.335
Unnamed: 6 \
0 EMM_EPMRR_PTE_NUS_DPG
1 U.S. Regular Reformulated Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 7 \
0 EMM_EPMM_PTE_NUS_DPG
1 U.S. Midgrade All Formulations Retail Gasoline...
2 NaN
3 NaN
4 NaN
Unnamed: 8 \
0 EMM_EPMMU_PTE_NUS_DPG
1 U.S. Midgrade Conventional Retail Gasoline Pri...
2 NaN
3 NaN
4 NaN
Unnamed: 9 \
0 EMM_EPMMR_PTE_NUS_DPG
1 U.S. Midgrade Reformulated Retail Gasoline Pri...
2 NaN
3 NaN
4 NaN
Unnamed: 10 \
0 EMM_EPMP_PTE_NUS_DPG
1 U.S. Premium All Formulations Retail Gasoline ...
2 NaN
3 NaN
4 NaN
Unnamed: 11 \
0 EMM_EPMPU_PTE_NUS_DPG
1 U.S. Premium Conventional Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 12 \
0 EMM_EPMPR_PTE_NUS_DPG
1 U.S. Premium Reformulated Retail Gasoline Pric...
2 NaN
3 NaN
4 NaN
Unnamed: 13 \
0 EMD_EPD2D_PTE_NUS_DPG
1 U.S. No 2 Diesel Retail Prices (Dollars per Ga...
2 NaN
3 NaN
4 NaN
Unnamed: 14 \
0 EMD_EPD2DXL0_PTE_NUS_DPG
1 U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) R...
2 NaN
3 NaN
4 NaN
Unnamed: 15
0 EMD_EPD2DM10_PTE_NUS_DPG
1 U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retai...
2 NaN
3 NaN
4 NaN
Back to Contents Data 1: U.S. Gasoline and Diesel Retail Prices \
401 2023-11-15 00:00:00 3.443
402 2023-12-15 00:00:00 3.257
403 2024-01-15 00:00:00 3.197
404 2024-02-15 00:00:00 3.328
405 2024-03-15 00:00:00 3.542
Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 \
401 3.261 3.802 3.318 3.172 3.634 3.88
402 3.104 3.558 3.134 3.014 3.393 3.685
403 3.045 3.495 3.075 2.957 3.331 3.619
404 3.188 3.605 3.212 3.102 3.446 3.729
405 3.402 3.815 3.426 3.318 3.657 3.938
Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13 \
401 3.612 4.275 4.235 3.984 4.533 4.254
402 3.457 4.014 4.038 3.836 4.273 3.972
403 3.391 3.945 3.969 3.764 4.206 3.854
404 3.511 4.042 4.077 3.888 4.296 4.044
405 3.719 4.252 4.286 4.098 4.506 4.022
Unnamed: 14 Unnamed: 15
401 4.254 NaN
402 3.972 NaN
403 3.854 NaN
404 4.044 NaN
405 4.022 NaN
In [5]:
#succesfully target the data
import pandas as pd
# Define the path to your Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
# Attempt to read the sheet named 'Data 1' which contains the actual data
try:
df_full = pd.read_excel(xls_path, sheet_name='Data 1')
df_head = df_full.head()
print(df_head.to_dict()) # Convert to dictionary for easier display of columns and data
except Exception as e:
print(f"An error occurred: {e}")
{'Back to Contents': {0: 'Sourcekey', 1: 'Date', 2: datetime.datetime(1990, 8, 15, 0, 0), 3: datetime.datetime(1990, 9, 15, 0, 0), 4: datetime.datetime(1990, 10, 15, 0, 0)}, 'Data 1: U.S. Gasoline and Diesel Retail Prices': {0: 'EMM_EPM0_PTE_NUS_DPG', 1: 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 2': {0: 'EMM_EPM0U_PTE_NUS_DPG', 1: 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 3': {0: 'EMM_EPM0R_PTE_NUS_DPG', 1: 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 4': {0: 'EMM_EPMR_PTE_NUS_DPG', 1: 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: 1.258, 4: 1.335}, 'Unnamed: 5': {0: 'EMM_EPMRU_PTE_NUS_DPG', 1: 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: 1.258, 4: 1.335}, 'Unnamed: 6': {0: 'EMM_EPMRR_PTE_NUS_DPG', 1: 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 7': {0: 'EMM_EPMM_PTE_NUS_DPG', 1: 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 8': {0: 'EMM_EPMMU_PTE_NUS_DPG', 1: 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 9': {0: 'EMM_EPMMR_PTE_NUS_DPG', 1: 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 10': {0: 'EMM_EPMP_PTE_NUS_DPG', 1: 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 11': {0: 'EMM_EPMPU_PTE_NUS_DPG', 1: 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 12': {0: 'EMM_EPMPR_PTE_NUS_DPG', 1: 'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 13': {0: 'EMD_EPD2D_PTE_NUS_DPG', 1: 'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 14': {0: 'EMD_EPD2DXL0_PTE_NUS_DPG', 1: 'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}, 'Unnamed: 15': {0: 'EMD_EPD2DM10_PTE_NUS_DPG', 1: 'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)', 2: nan, 3: nan, 4: nan}}
In [ ]:
import pandas as pd
# Define the path
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1', skipping rows to get to the actual data
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce') # Coerce for non datetime values
# Filter the DataFrame to only include entries from January 2019 onwards
start_date = '2019-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy() # Make a copy
# Drop rows where the date is NaT (not a time), which might have been created due to 'coerce'
df_filtered.dropna(subset=['Date'], inplace=True)
print(df_filtered.head())
except Exception as e:
print(f"An error occurred: {e}")
In [ ]:
print(df_full.head())
print(df_full.tail())
In [8]:
import pandas as pd
# Define the path
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1', skipping rows to get to the actual data
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce') # Coerce for non-datetime values
# Find the first non-null value and its corresponding date
first_valid_date = df_full.loc[df_full.notna().any(axis=1), 'Date'].min()
# Convert to string format
start_date = first_valid_date.strftime('%Y-%m-%d')
# Filter the DataFrame to only include entries from the first valid date onwards
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Check the column names in the DataFrame
print(df_filtered.columns)
# Define the columns of interest
columns_of_interest = [
'Date',
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Check if all columns of interest exist in the DataFrame
if all(col in df_filtered.columns for col in columns_of_interest):
# Drop rows where the columns of interest have missing values
df_clean = df_filtered.dropna(subset=columns_of_interest)
# If needed, perform additional data cleaning steps here
print(df_clean.head())
else:
print("One or more columns of interest not found in the DataFrame.")
except Exception as e:
print(f"An error occurred: {e}")
Index(['Date',
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'],
dtype='object')
Date \
198 2007-02-15
199 2007-03-15
200 2007-04-15
201 2007-05-15
202 2007-06-15
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
198 2.323
199 2.609
200 2.891
201 3.187
202 3.102
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) \
198 2.276
199 2.546
200 2.831
201 3.157
202 3.067
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) \
198 2.418
199 2.738
200 3.013
201 3.248
202 3.173
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
198 2.278
199 2.563
200 2.845
201 3.146
202 3.056
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) \
198 2.235
199 2.503
200 2.787
201 3.119
202 3.024
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) \
198 2.369
199 2.689
200 2.965
201 3.202
202 3.122
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
198 2.385
199 2.670
200 2.953
201 3.241
202 3.161
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) \
198 2.325
199 2.593
200 2.881
201 3.199
202 3.118
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) \
198 2.500
199 2.820
200 3.093
201 3.321
202 3.247
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
198 2.479
199 2.766
200 3.052
201 3.331
202 3.263
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) \
198 2.431
199 2.706
200 2.996
201 3.300
202 3.227
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) \
198 2.567
199 2.877
200 3.155
201 3.389
202 3.330
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
198 2.488
199 2.667
200 2.834
201 2.796
202 2.808
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
198 2.513
199 2.680
200 2.847
201 2.818
202 2.826
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
198 2.435
199 2.630
200 2.790
201 2.726
202 2.746
In [ ]:
In [ ]:
# Display the first few column names
print(df_clean.columns.tolist())
In [9]:
# Check the range of dates available
print(df_clean['Date'].min(), df_clean['Date'].max())
# Check for non-NaN values in one of the columns you're interested in plotting
print(df_clean['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].dropna().head())
2007-02-15 00:00:00 2008-11-15 00:00:00 198 2.278 199 2.563 200 2.845 201 3.146 202 3.056 Name: U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon), dtype: float64
In [ ]:
print(df_clean['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].isna().sum())
In [10]:
# Print all column names to verify their exact names
print(df_clean.columns)
# Intended columns to plot (make sure these are the correct names)
columns_to_plot_corrected = [
'Regular_All_Formulations', # This should match exactly with your DataFrame's column name
# Add other columns if necessary
]
# Check and only keep columns that actually exist in df_clean
columns_to_plot_existing = [col for col in columns_to_plot_corrected if col in df_clean.columns]
# Setting 'Date' as the index, if not already set
if 'Date' not in df_clean.index.names:
df_clean.set_index('Date', inplace=True)
# Plotting with the corrected and existing column names
if columns_to_plot_existing:
df_clean[columns_to_plot_existing].plot(figsize=(14, 7), marker='o', title='U.S. Gasoline Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.grid(True)
plt.legend()
plt.show()
else:
print("None of the intended columns to plot were found in the DataFrame.")
Index(['Date',
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'],
dtype='object')
None of the intended columns to plot were found in the DataFrame.
In [ ]:
In [11]:
import plotly.graph_objects as go
# Define the columns you want to plot and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, # Keep full column name in legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2019-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400, # Wider chart for better visualization
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
In [12]:
import plotly.graph_objects as go
# Define the columns you want to plot and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column.split(' ')[1], # Simplify name for legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400 # Wider chart for better visualization
)
# Display the figure
fig.show()
In [ ]:
print(df_clean.index.name)
In [ ]:
In [13]:
import plotly.graph_objects as go
# Define the columns you want to plot and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered.index, y=df_filtered[column], name=column, # Keep full column name in legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400, # Wider chart for better visualization
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
In [14]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each fuel type
for column in columns_to_plot:
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, mode='lines+markers'))
# Update the layout of the figure
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Since 2010',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white',
width=1400
)
# Display the figure
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [ ]:
print(df_clean.head()) # Print the first few rows of the DataFrame
print(df_clean.info()) # Print a summary of the DataFrame
In [ ]:
print(df_full.head()) # Print the first few rows of the DataFrame
print(df_full['Date'].min(), df_full['Date'].max()) # Print the minimum and maximum dates
In [15]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = df_filtered.columns
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each column
for column in columns_to_plot:
if column != 'Date': # Exclude 'Date' column from the legend
fig.add_trace(go.Scatter(x=df_filtered['Date'].dt.date, y=df_filtered[column], name=column, mode='lines+markers'))
# Update the layout of the figure
fig.update_layout(
title='Retail Gasoline Prices Over Time',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400
)
# Display the figure
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [ ]:
print(df_clean.columns)
In [16]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = df_filtered.columns
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each column with fills
for column in columns_to_plot:
if column != 'Date': # Exclude 'Date' column from the legend
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column))
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column + '_fill',
fill='tonexty')) # Add fill to the line
# Update the layout of the figure
fig.update_layout(
title='Retail Gasoline Prices Over Time',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400,
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [17]:
import pandas as pd
import numpy as np
# Assuming you already have your DataFrame loaded as df_clean
# Select only the numeric columns
numeric_columns = df_clean.select_dtypes(include=[np.number])
# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()
# Define threshold for considering outliers
threshold = 3
# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)
# Print outliers
print(outliers)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
In [18]:
# Check the data types of the columns
print(df_clean.dtypes)
# Check for missing values in each column
print(df_clean.isnull().sum())
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) float64 U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) float64 U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) float64 U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) float64 U.S. No 2 Diesel Retail Prices (Dollars per Gallon) float64 U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) float64 U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) float64 dtype: object U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 0 U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) 0 U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) 0 U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) 0 U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 0 U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 0 U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) 0 dtype: int64
In [19]:
# Importing necessary library
import numpy as np
# Extracting only the numeric columns for Z-score calculation
numeric_columns = df_clean.select_dtypes(include=np.number)
# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()
# Define threshold for considering outliers
threshold = 3
# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)
# Print outliers
print(outliers)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
In [20]:
print(outliers)
print(df_clean.dtypes)
print(df_clean.isnull().sum())
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
Date
2007-02-15 False
2007-03-15 False
2007-04-15 False
2007-05-15 False
2007-06-15 False
2007-07-15 False
2007-08-15 False
2007-09-15 False
2007-10-15 False
2007-11-15 False
2007-12-15 False
2008-01-15 False
2008-02-15 False
2008-03-15 False
2008-04-15 False
2008-05-15 False
2008-06-15 False
2008-07-15 False
2008-08-15 False
2008-09-15 False
2008-10-15 False
2008-11-15 False
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) float64
dtype: object
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 0
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) 0
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) 0
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) 0
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 0
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 0
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) 0
dtype: int64
In [21]:
import pandas as pd
import numpy as np
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Handle missing values by forward filling
df_clean = df_filtered.ffill()
# Define the columns to plot
columns_to_plot = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Create a function to calculate outliers using Z-score
def detect_outliers_zscore(data, threshold=3):
z_scores = np.abs((data - data.mean()) / data.std())
return z_scores > threshold
# Identify outliers for each column of interest
outliers = df_clean[columns_to_plot].apply(detect_outliers_zscore)
# Print the outliers
print("Outliers:")
print(outliers)
except Exception as e:
print(f"An error occurred: {e}")
#based on the Z-score threshold, none of the data points are considered outliers
Outliers:
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
233 False
234 False
235 False
236 False
237 False
.. ...
399 False
400 False
401 False
402 False
403 False
[171 rows x 7 columns]
In [22]:
import pandas as pd
import numpy as np
# Assuming df_clean is your DataFrame
# Calculate measures of central tendency
central_tendency = df_clean.describe().loc[['mean', '50%']] # Extracting mean and median
# Calculate measures of dispersion
standard_deviation = df_clean.std()
variance = df_clean.var()
range_values = df_clean.max() - df_clean.min()
# Create a DataFrame to store the results
statistics_df = pd.DataFrame({
'Mean': central_tendency.loc['mean'],
'Median': central_tendency.loc['50%'],
'Standard Deviation': standard_deviation,
'Variance': variance,
'Range': range_values
})
# Display the statistics DataFrame
print(statistics_df)
Mean Median \
Date NaN NaN
U.S. All Grades All Formulations Retail Gasolin... 3.073947 2.948
U.S. All Grades Conventional Retail Gasoline Pr... 2.978591 2.866
U.S. All Grades Reformulated Retail Gasoline Pr... 3.265222 3.145
U.S. Midgrade All Formulations Retail Gasoline ... 3.279082 3.212
U.S. Midgrade Conventional Retail Gasoline Pric... 3.154982 3.079
U.S. Midgrade Reformulated Retail Gasoline Pric... 3.484251 3.420
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 3.403404 3.253
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 3.403532 3.253
U.S. Premium All Formulations Retail Gasoline P... 3.494749 3.450
U.S. Premium Conventional Retail Gasoline Price... 3.396392 3.348
U.S. Premium Reformulated Retail Gasoline Price... 3.633029 3.584
U.S. Regular All Formulations Retail Gasoline P... 2.986368 2.860
U.S. Regular Conventional Retail Gasoline Price... 2.901942 2.808
U.S. Regular Reformulated Retail Gasoline Price... 3.166480 3.061
Standard Deviation \
Date 1506 days 21:11:24.869391440
U.S. All Grades All Formulations Retail Gasolin... 0.615091
U.S. All Grades Conventional Retail Gasoline Pr... 0.607859
U.S. All Grades Reformulated Retail Gasoline Pr... 0.637697
U.S. Midgrade All Formulations Retail Gasoline ... 0.635346
U.S. Midgrade Conventional Retail Gasoline Pric... 0.611238
U.S. Midgrade Reformulated Retail Gasoline Pric... 0.675735
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.766083
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.766004
U.S. Premium All Formulations Retail Gasoline P... 0.649658
U.S. Premium Conventional Retail Gasoline Price... 0.616263
U.S. Premium Reformulated Retail Gasoline Price... 0.688965
U.S. Regular All Formulations Retail Gasoline P... 0.619935
U.S. Regular Conventional Retail Gasoline Price... 0.615876
U.S. Regular Reformulated Retail Gasoline Price... 0.635491
Variance \
Date NaN
U.S. All Grades All Formulations Retail Gasolin... 0.378337
U.S. All Grades Conventional Retail Gasoline Pr... 0.369493
U.S. All Grades Reformulated Retail Gasoline Pr... 0.406657
U.S. Midgrade All Formulations Retail Gasoline ... 0.403664
U.S. Midgrade Conventional Retail Gasoline Pric... 0.373612
U.S. Midgrade Reformulated Retail Gasoline Pric... 0.456618
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.586883
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.586762
U.S. Premium All Formulations Retail Gasoline P... 0.422055
U.S. Premium Conventional Retail Gasoline Price... 0.379780
U.S. Premium Reformulated Retail Gasoline Price... 0.474672
U.S. Regular All Formulations Retail Gasoline P... 0.384320
U.S. Regular Conventional Retail Gasoline Price... 0.379303
U.S. Regular Reformulated Retail Gasoline Price... 0.403849
Range
Date 5173 days 00:00:00
U.S. All Grades All Formulations Retail Gasolin... 3.16
U.S. All Grades Conventional Retail Gasoline Pr... 3.05
U.S. All Grades Reformulated Retail Gasoline Pr... 3.38
U.S. Midgrade All Formulations Retail Gasoline ... 3.369
U.S. Midgrade Conventional Retail Gasoline Pric... 3.199
U.S. Midgrade Reformulated Retail Gasoline Pric... 3.586
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 3.756
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 3.756
U.S. Premium All Formulations Retail Gasoline P... 3.457
U.S. Premium Conventional Retail Gasoline Price... 3.268
U.S. Premium Reformulated Retail Gasoline Price... 3.643
U.S. Regular All Formulations Retail Gasoline P... 3.165
U.S. Regular Conventional Retail Gasoline Price... 3.083
U.S. Regular Reformulated Retail Gasoline Price... 3.355
C:\Users\tommy\AppData\Local\Temp\ipykernel_25524\34344954.py:11: FutureWarning: The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
In [65]:
#Determing correlation of independent variables
import pandas as pd
# Assuming df_clean is your DataFrame and you have selected relevant independent variables
relevant_columns = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Selecting only the relevant columns
relevant_data = df_clean[relevant_columns]
# Calculating the correlation matrix
correlation_matrix = relevant_data.corr()
# Displaying the correlation matrix
print(correlation_matrix)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 1.000000
U.S. Regular All Formulations Retail Gasoline P... 0.999438
U.S. Midgrade All Formulations Retail Gasoline ... 0.980270
U.S. Premium All Formulations Retail Gasoline P... 0.957132
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.943263
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.943288
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.999438
U.S. Regular All Formulations Retail Gasoline P... 1.000000
U.S. Midgrade All Formulations Retail Gasoline ... 0.975178
U.S. Premium All Formulations Retail Gasoline P... 0.949177
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.938388
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.938423
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.980270
U.S. Regular All Formulations Retail Gasoline P... 0.975178
U.S. Midgrade All Formulations Retail Gasoline ... 1.000000
U.S. Premium All Formulations Retail Gasoline P... 0.995015
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.957868
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.957860
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.957132
U.S. Regular All Formulations Retail Gasoline P... 0.949177
U.S. Midgrade All Formulations Retail Gasoline ... 0.995015
U.S. Premium All Formulations Retail Gasoline P... 1.000000
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.952743
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.952712
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.943263
U.S. Regular All Formulations Retail Gasoline P... 0.938388
U.S. Midgrade All Formulations Retail Gasoline ... 0.957868
U.S. Premium All Formulations Retail Gasoline P... 0.952743
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 1.000000
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 1.000000
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.943288
U.S. Regular All Formulations Retail Gasoline P... 0.938423
U.S. Midgrade All Formulations Retail Gasoline ... 0.957860
U.S. Premium All Formulations Retail Gasoline P... 0.952712
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 1.000000
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 1.000000
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
U.S. All Grades All Formulations Retail Gasolin... NaN
U.S. Regular All Formulations Retail Gasoline P... NaN
U.S. Midgrade All Formulations Retail Gasoline ... NaN
U.S. Premium All Formulations Retail Gasoline P... NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gal... NaN
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... NaN
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... NaN
In [66]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols' # Add a trendline using ordinary least squares method
)
# Display the scatter plot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [25]:
import pandas as pd
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Display the column names to identify the correct column for gasoline prices
print(df_full.columns)
except Exception as e:
print(f"An error occurred: {e}")
Index(['Date',
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'],
dtype='object')
In [26]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Histogram of U.S. Regular Gasoline Prices',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'}
)
# Display the histogram
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [27]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols' # Add a trendline using ordinary least squares method (linear regression)
)
# Display the scatter plot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [28]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019]
# Create the scatter plot
fig = px.scatter(df_filtered, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols')
# Display the scatter plot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [29]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Add a 'Year' column to the DataFrame
df_full['Year'] = df_full['Date'].dt.year
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year
fig = px.box(df_full, x='Year', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Colorful Boxplot of U.S. Regular Gasoline Prices by Year',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'Year': 'Year'},
color='Year' # This makes the plot colorful
)
# Display the boxplot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [30]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Add a 'Year' column and a 'Quarter' column to the DataFrame
df_full['Year'] = df_full['Date'].dt.year
df_full['Quarter'] = df_full['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_full['QuarterStr'] = df_full['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_full, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [31]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2010 onward and avoid SettingWithCopyWarning by using copy()
df_filtered = df_full[df_full['Date'].dt.year >= 2010].copy()
# Properly use .loc to avoid SettingWithCopyWarning
df_filtered['Year'] = df_filtered['Date'].dt.year
df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2010 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [32]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward and avoid SettingWithCopyWarning by using copy()
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Properly use .loc to avoid SettingWithCopyWarning
df_filtered['Year'] = df_filtered['Date'].dt.year
df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [33]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Extract the year from the 'Date' column
df_full['Year'] = df_full['Date'].dt.year
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
# Color the histogram bars by the 'Year' to visualize how distributions change over time
fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='Year',
title='Histogram of U.S. Regular Gasoline Prices by Year',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'Year': 'Year'},
barmode='overlay', # Overlay the histograms to see how distributions compare across years
histnorm='percent', # Normalize the histogram to show percentage rather than count
nbins=30 # Adjust the number of bins for finer granularity or clarity
)
# Display the histogram
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [34]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Create a 'YearQuarter' column for coloring
df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'YearQuarter': 'Year-Quarter'},
barmode='overlay', # Overlay the histograms to compare across quarters
histnorm='percent', # Normalize to show percentage
nbins=20 # Number of bins for granularity
)
# Display the histogram
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [35]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Create a 'YearQuarter' column for coloring
df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'YearQuarter': 'Year-Quarter'},
barmode='overlay', # Overlay the histograms to compare across quarters
histnorm='percent', # Normalize to show percentage
nbins=20, # Number of bins for granularity
)
# Update y-axis title to reflect what the percentages represent
fig.update_layout(yaxis_title='Percentage of Total Observations')
# Display the histogram
fig.show()
except Exception as e:
print(f"An error occurred: {e}")
In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Drop the specified column
df_full.drop(columns=['U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'], inplace=True)
# Drop rows with any NaN values in the target column
df_full.dropna(subset=['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], inplace=True)
# Reset the index after the dropna operation
df_full.reset_index(drop=True, inplace=True)
# Feature Engineering: Use previous prices as features to predict the next price
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
# Drop the first row which now has an NaN value after the shift operation
df_full.dropna(inplace=True)
# Define features and target variable
X = df_full[['PrevPrice']] # Features from previous prices
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'] # Target
# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model on the entire dataset
model.fit(X, y)
# Predict on the training set itself since we don't have a separate test set
predictions = model.predict(X)
# Calculate and print the mean absolute error
mae = mean_absolute_error(y, predictions)
print("Mean Absolute Error on the entire dataset:", mae)
# Optionally, visualize the actual vs predicted prices
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(y, predictions, alpha=0.3)
plt.plot(y, y, color='red') # Perfect predictions line
plt.title('Actual vs Predicted Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()
except Exception as e:
print(f"An error occurred: {e}")
Mean Absolute Error on the entire dataset: 0.06432608148404984
In [37]:
# Assuming the model and df_full are already defined and the model is trained
# Find the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
# Initialize a list to keep track of our predictions and input features
predictions = []
input_feature = last_known_price
# Predict the next three months
for _ in range(3):
# Predict the next price
next_month_price = model.predict([[input_feature]])[0]
# Store the prediction
predictions.append(next_month_price)
# Update the input_feature to the newly predicted price
input_feature = next_month_price
# Display the predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='M')
predicted_prices = pd.Series(predictions, index=prediction_dates)
print(predicted_prices)
2024-04-30 3.58903 2024-05-31 3.65322 2024-06-30 3.64383 Freq: M, dtype: float64
C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names
In [38]:
import plotly.graph_objects as go
import pandas as pd
# Assuming df_full, model, and the predictions are already defined
# Create a DataFrame for the predicted data with appropriate dates
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='M')
predicted_data = pd.DataFrame({'Date': prediction_dates, 'Predicted Prices': [3.58903, 3.65322, 3.64383]})
# Create the figure for plotting
fig = go.Figure()
# Add the historical data
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices', line=dict(color='blue')))
# Add the predicted data
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))
# Update layout for a more interactive experience
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Enable zoom and pan
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True), type="date"))
# Show the figure
fig.show()
In [39]:
# Assuming df_full is your original dataframe, and it has been preprocessed correctly
# Ensure the feature used for prediction is correctly set
X = df_full[['PrevPrice']] # Assuming 'PrevPrice' was correctly defined previously
# Generate predictions for the entire dataset
predictions = model.predict(X)
# Now, ensure 'y' is the target variable for the entire dataset
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Ensure y and predictions are aligned; they should be if predictions were made for the entire dataset
residuals = y - predictions
# Now, you can visualize the residuals as previously described without encountering shape mismatch issues
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(y.index, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals of Predictions')
plt.xlabel('Index')
plt.ylabel('Residuals')
plt.show()
In [40]:
# Assuming your RandomForestRegressor is named `model` and you have a DataFrame `X` with multiple features.
import pandas as pd
import matplotlib.pyplot as plt
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(len(X.columns)).plot(kind='barh', colormap='winter')
plt.title('Feature Importances')
plt.show()
In [41]:
import plotly.graph_objects as go
# Assuming `df_full` contains your historical data and `predicted_data` contains future predictions
# Concatenate historical and predicted data for a continuous timeline
full_dates = pd.concat([df_full['Date'], predicted_data['Date']])
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], predicted_data['Predicted Prices']])
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices'))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))
# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
fig.show()
In [42]:
# Starting from the last known price in your dataset
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
# Initialize a list to store predictions
monthly_predictions = [last_known_price] # Starting with the last known price
# Predict the next 12 months
for _ in range(12):
# The model expects a DataFrame, so we create one with the current input feature
current_feature = pd.DataFrame([monthly_predictions[-1]], columns=['PrevPrice'])
# Predict the next price
next_month_price = model.predict(current_feature)[0]
# Append the prediction for next month
monthly_predictions.append(next_month_price)
# Remove the first element (last known price) to get only future predictions
future_predictions = monthly_predictions[1:]
# Create a date range for these predictions
last_date = df_full['Date'].iloc[-1]
prediction_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=12, freq='M')
# Plot the predictions
plt.figure(figsize=(10, 6))
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', marker='o', linestyle='--', color='red')
plt.title('Predicted Gasoline Prices for the Next Year')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
In [43]:
import pandas as pd
import matplotlib.pyplot as plt
# Assuming 'df_full' contains your historical data up to the last known date
# and 'model' is your trained Random Forest model
# Initialize predictions list with the last known price to start the prediction chain
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
# Generate predictions for the next 12 months
for _ in range(12):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='M')
# Combine historical dates and predicted dates
full_dates = pd.concat([df_full['Date'], pd.Series(prediction_dates)])
# Combine historical prices and predicted prices
# The last known price is included in both historical and predicted prices for continuity
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], pd.Series(future_predictions)])
# Plotting
plt.figure(figsize=(14, 7))
plt.plot(df_full['Date'], df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], label='Historical Prices', color='blue')
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', linestyle='--', marker='o', color='red')
plt.title('Historical and Predicted Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
In [44]:
import plotly.graph_objects as go
import pandas as pd
# Assuming 'df_full' is your DataFrame with historical data and the model is ready
# Generate predictions for the next 12 months, starting from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
for _ in range(12):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='M')
# Plotting with Plotly for interactive exploration
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices'))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))
# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Show the figure
fig.show()
In [3]:
import plotly.graph_objects as go
import pandas as pd
# Assuming 'df_full' is your DataFrame with historical data and 'model' is your trained model
# Start predictions from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
# Predict for the next 5 years (60 months)
for _ in range(60):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to have only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 5 years of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=60, freq='M')
# Creating the Plotly figure
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices', line=dict(color='blue')))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))
# Update the layout for better visualization and interactivity
fig.update_layout(title='Historical and Predicted Gasoline Prices for the Next 5 Years',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Enable zoom and pan features
fig.update_layout(xaxis_rangeslider_visible=True)
# Show the figure
fig.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 7 2 import pandas as pd 4 # Assuming 'df_full' is your DataFrame with historical data and 'model' is your trained model 5 6 # Start predictions from the last known price ----> 7 last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1] 8 predictions = [last_known_price] 10 # Predict for the next 5 years (60 months) NameError: name 'df_full' is not defined
In [45]:
# Find the earliest date in the historical data
earliest_date = df_full['Date'].min()
print(f"The model is trained on data starting from: {earliest_date}")
The model is trained on data starting from: 2007-02-15 00:00:00
In [47]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Assuming df_full is your complete dataset sorted by date
# Check the earliest date just for confirmation
print(f"Earliest date in the dataset: {df_full['Date'].min()}")
# Feature Engineering: Create 'PrevPrice' based on 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full.dropna(inplace=True) # Drop the first row which now lacks 'PrevPrice'
# Splitting the data into features (X) and target (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
predictions = model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")
# Now the model is trained from the earliest occurrence of price in the dataset
Earliest date in the dataset: 2007-03-15 00:00:00 Mean Absolute Error: 0.18015374796747932
In [48]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
# Assuming df_full is already loaded and sorted by date, starting from 1990
# Check to ensure the start date is indeed from 1990
print(f"Dataset start date: {df_full['Date'].min()}")
# Prepare the feature 'PrevPrice'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full = df_full.dropna() # Removing the first row which now has a NaN value for 'PrevPrice'
# Define your features (X) and target variable (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model on the entire dataset
model.fit(X, y)
# Assuming you want to predict the next 12 months to keep the example operational and straightforward
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = []
for _ in range(12):
# Using the last known price to predict the next one
input_feature = pd.DataFrame([last_known_price], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
last_known_price = next_price # Update the last known price for the next prediction
# Output the predictions
prediction_dates = pd.date_range(start=df_full['Date'].max() + pd.Timedelta(days=1), periods=12, freq='M')
predicted_prices = pd.Series(predictions, index=prediction_dates)
print(predicted_prices)
Dataset start date: 2007-04-15 00:00:00 2024-03-31 3.564080 2024-04-30 3.723030 2024-05-31 3.740050 2024-06-30 3.538280 2024-07-31 3.503880 2024-08-31 3.890570 2024-09-30 3.732380 2024-10-31 3.609140 2024-11-30 3.454184 2024-12-31 3.438570 2025-01-31 3.581940 2025-02-28 3.768000 Freq: M, dtype: float64
In [49]:
print(df_full['Date'].min())
print(X.head())
2007-05-15 00:00:00
PrevPrice
198 2.845
199 3.146
200 3.056
201 2.965
202 2.786
In [ ]:
print(df_full['Date'].min())
In [ ]:
print(X.head())
In [ ]:
# This line restricts your dataset to start from 2019
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
In [ ]:
print(df_full['Date'].min(), df_full['Date'].max())
In [ ]:
print(df_full.head())
print(df_full.tail())
In [50]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
# Define the path to the Excel file
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
# Load the data from the specified sheet, assuming data starts from row 2 to avoid headers
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Ensure the 'Date' column is datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# Convert 'Date' to a numerical format for regression analysis (ordinal)
df['Date_ordinal'] = df['Date'].apply(lambda x: x.toordinal())
# Assume 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)' is the target column
# Fit the linear regression model
model = ols("Q('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)') ~ Date_ordinal", data=df).fit()
# Print the summary of the model to see the results
print(model.summary())
# Extract the coefficients to formulate the regression equation
intercept, slope = model.params
print(f"Regression Equation: Price = {intercept:.4f} + {slope:.4f} * Date_ordinal")
OLS Regression Results
==========================================================================================================================================
Dep. Variable: Q('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)') R-squared: 0.646
Model: OLS Adj. R-squared: 0.645
Method: Least Squares F-statistic: 727.3
Date: Tue, 16 Apr 2024 Prob (F-statistic): 6.05e-92
Time: 20:22:41 Log-Likelihood: -339.37
No. Observations: 401 AIC: 682.7
Df Residuals: 399 BIC: 690.7
Df Model: 1
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
Intercept -156.1602 5.872 -26.593 0.000 -167.704 -144.616
Date_ordinal 0.0002 8.01e-06 26.969 0.000 0.000 0.000
==============================================================================
Omnibus: 24.065 Durbin-Watson: 0.071
Prob(Omnibus): 0.000 Jarque-Bera (JB): 26.805
Skew: 0.625 Prob(JB): 1.51e-06
Kurtosis: 3.204 Cond. No. 1.52e+08
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.52e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
Regression Equation: Price = -156.1602 + 0.0002 * Date_ordinal
In [53]:
pip install pandas numpy statsmodels matplotlib plotly
Requirement already satisfied: pandas in c:\users\tommy\anaconda3\lib\site-packages (1.5.3)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: numpy in c:\users\tommy\anaconda3\lib\site-packages (1.24.3) Requirement already satisfied: statsmodels in c:\users\tommy\anaconda3\lib\site-packages (0.14.0) Requirement already satisfied: matplotlib in c:\users\tommy\anaconda3\lib\site-packages (3.7.1) Requirement already satisfied: plotly in c:\users\tommy\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\tommy\anaconda3\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\tommy\anaconda3\lib\site-packages (from pandas) (2022.7) Requirement already satisfied: scipy!=1.9.2,>=1.4 in c:\users\tommy\anaconda3\lib\site-packages (from statsmodels) (1.10.1) Requirement already satisfied: patsy>=0.5.2 in c:\users\tommy\anaconda3\lib\site-packages (from statsmodels) (0.5.3) Requirement already satisfied: packaging>=21.3 in c:\users\tommy\anaconda3\lib\site-packages (from statsmodels) (23.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: pillow>=6.2.0 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (9.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\tommy\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: tenacity>=6.2.0 in c:\users\tommy\anaconda3\lib\site-packages (from plotly) (8.2.2) Requirement already satisfied: six in c:\users\tommy\anaconda3\lib\site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
In [58]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Load your dataset
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
# Assuming 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)' is your column of interest
data = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Handle missing values by forward filling then backward filling to ensure no gaps
data_filled = data.fillna(method='ffill').fillna(method='bfill')
# Decompose the series to understand its components
decomposition = seasonal_decompose(data_filled, model='additive', period=12)
fig = decomposition.plot()
plt.show()
# Fit a SARIMA model
# Note: p, d, q values and seasonal P, D, Q, M should be selected based on ACF, PACF plots and domain understanding
model = SARIMAX(data_filled, order=(1,1,1), seasonal_order=(1,1,1,12))
results = model.fit(disp=False)
# Check diagnostics to ensure no assumptions are violated
results.plot_diagnostics(figsize=(15, 12))
plt.show()
# Forecast for the next 12 months
forecast = results.get_forecast(steps=12)
mean_forecast = forecast.predicted_mean
confidence_intervals = forecast.conf_int()
# Dates for the forecast
forecast_dates = pd.date_range(data_filled.index[-1], periods=12, freq='M')
# Create a plot with historical data and forecast
fig = go.Figure()
# Historical data
fig.add_trace(go.Scatter(x=data_filled.index, y=data_filled, mode='lines', name='Historical Prices'))
# Forecast data
fig.add_trace(go.Scatter(x=forecast_dates, y=mean_forecast, mode='lines', name='Forecast'))
# Fill area between confidence intervals
fig.add_trace(go.Scatter(x=np.concatenate([forecast_dates, forecast_dates[::-1]]),
y=np.concatenate([confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1][::-1]]),
fill='toself', fillcolor='rgba(0,100,80,0.2)', line=dict(color='rgba(255,255,255,0)'),
name='Confidence Interval'))
# Update layout
fig.update_layout(title='Forecast of U.S. Regular Gasoline Prices', xaxis_title='Date', yaxis_title='Price (Dollars per Gallon)', template='plotly_white')
fig.show()
C:\Users\tommy\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. C:\Users\tommy\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
C:\Users\tommy\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. C:\Users\tommy\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
In [64]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
# Load the dataset
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
# Convert dates into numerical format for regression
df['Date_ordinal'] = df.index.map(dt.datetime.toordinal)
data_filled = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].fillna(method='ffill').fillna(method='bfill')
# Prepare X and y for regression
X = df['Date_ordinal'].values.reshape(-1, 1)
y = data_filled.values
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)
# Print the coefficients of the model
print("Intercept (b0):", model.intercept_)
print("Slope (b1) for Date:", model.coef_[0])
# Plotting the results with formatted dates
plt.figure(figsize=(12, 6))
plt.scatter(df.index, y, color='blue', label='Actual Prices')
plt.plot(df.index, model.predict(X), color='red', linewidth=2, label='Fitted Line')
plt.title('Linear Regression on Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)')
# Format the x-axis to improve readability
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) # Use AutoDateLocator without additional parameters
plt.gca().xaxis.set_major_formatter(mdates.ConciseDateFormatter(mdates.AutoDateLocator())) # Use concise formatter for cleaner labels
plt.gcf().autofmt_xdate() # Automatically rotate date labels for better readability
plt.legend()
plt.show()
Intercept (b0): -155.081308944846 Slope (b1) for Date: 0.00021462001181987098
In [69]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# File path from your original code
xls_path = r"C:\Users\tommy\OneDrive\Desktop\GGU\Classes Spring 2024 Trimester\Data 190 Capstone\Capstone Project\PET_PRI_GND_DCUS_NUS_M.xls"
# Load the dataset
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Assuming these are the relevant columns you are interested in
relevant_columns = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Selecting only the relevant columns
df_clean = df[relevant_columns]
# Calculating the correlation matrix
correlation_matrix = df_clean.corr()
# Displaying the correlation matrix
print(correlation_matrix)
# Visualization of the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, linewidths=0.5)
plt.title('Correlation Matrix of Gasoline and Diesel Prices')
plt.show()
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 1.000000
U.S. Regular All Formulations Retail Gasoline P... 0.999681
U.S. Midgrade All Formulations Retail Gasoline ... 0.994719
U.S. Premium All Formulations Retail Gasoline P... 0.987989
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.982191
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.941265
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.915647
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.999681
U.S. Regular All Formulations Retail Gasoline P... 1.000000
U.S. Midgrade All Formulations Retail Gasoline ... 0.992550
U.S. Premium All Formulations Retail Gasoline P... 0.984567
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.980242
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.935896
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.913120
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.994719
U.S. Regular All Formulations Retail Gasoline P... 0.992550
U.S. Midgrade All Formulations Retail Gasoline ... 1.000000
U.S. Premium All Formulations Retail Gasoline P... 0.998461
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.985811
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.953893
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.918941
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.987989
U.S. Regular All Formulations Retail Gasoline P... 0.984567
U.S. Midgrade All Formulations Retail Gasoline ... 0.998461
U.S. Premium All Formulations Retail Gasoline P... 1.000000
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.984245
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.946984
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.923600
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.982191
U.S. Regular All Formulations Retail Gasoline P... 0.980242
U.S. Midgrade All Formulations Retail Gasoline ... 0.985811
U.S. Premium All Formulations Retail Gasoline P... 0.984245
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 1.000000
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.999984
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.999597
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \
U.S. All Grades All Formulations Retail Gasolin... 0.941265
U.S. Regular All Formulations Retail Gasoline P... 0.935896
U.S. Midgrade All Formulations Retail Gasoline ... 0.953893
U.S. Premium All Formulations Retail Gasoline P... 0.946984
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.999984
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 1.000000
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 0.999523
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
U.S. All Grades All Formulations Retail Gasolin... 0.915647
U.S. Regular All Formulations Retail Gasoline P... 0.913120
U.S. Midgrade All Formulations Retail Gasoline ... 0.918941
U.S. Premium All Formulations Retail Gasoline P... 0.923600
U.S. No 2 Diesel Retail Prices (Dollars per Gal... 0.999597
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Re... 0.999523
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail... 1.000000
In [ ]: