In [1]:
import warnings
import pandas as pd
from pathlib import Path
from IPython.display import display
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 140)
def resolve_dataset_path(filename='PET_PRI_GND_DCUS_NUS_M.xls'):
candidates = [
Path.cwd() / filename,
Path.cwd() / 'Capstone Project' / filename,
Path.home() / 'OneDrive' / 'Desktop' / 'GGU' / 'Classes Spring 2024 Trimester' / 'Data 190 Capstone' / 'Capstone Project' / filename,
]
for candidate in candidates:
if candidate.exists():
return candidate
matches = list(Path.cwd().glob(f'**/{filename}'))
if matches:
return matches[0]
raise FileNotFoundError(f'Could not locate {filename} from {Path.cwd()}')
print('Notebook display cleanup loaded.')
Notebook display cleanup loaded.
In [ ]:
!pip install xlrd statsmodels seaborn scikit-learn plotly matplotlib pandas numpy
In [3]:
import pandas as pd
from IPython.display import display
# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')
# Read the .xls file
try:
df_full = pd.read_excel(xls_path, sheet_name=0, engine=None)
df_head = df_full.head(3)
df_tail = df_full.tail(3)
display(df_head)
display(df_tail)
except Exception as e:
print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
| Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | |
|---|---|---|---|---|---|---|---|
| 0 | NaN | Workbook Contents | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | U.S. Gasoline and Diesel Retail Prices | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | |
|---|---|---|---|---|---|---|---|
| 12 | NaN | Source: | Energy Information Administration | NaN | NaN | NaN | NaN |
| 13 | NaN | For Help, Contact: | infoctr@eia.gov | NaN | NaN | NaN | NaN |
| 14 | NaN | NaN | (202) 586-8800 | NaN | NaN | 4/1/2024 5:32:13 PM | NaN |
In [ ]:
from IPython.display import display
if 'df_full' in globals():
display(df_full.head(3))
display(df_full.tail(3))
In [5]:
#succesfully target the data
import pandas as pd
# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')
# Attempt to read the sheet named 'Data 1' which contains the actual data
try:
df_full = pd.read_excel(xls_path, sheet_name='Data 1')
df_head = df_full.head()
display(df_head)
except Exception as e:
print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
| Back to Contents | Data 1: U.S. Gasoline and Diesel Retail Prices | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 | Unnamed: 11 | Unnamed: 12 | Unnamed: 13 | Unnamed: 14 | Unnamed: 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sourcekey | EMM_EPM0_PTE_NUS_DPG | EMM_EPM0U_PTE_NUS_DPG | EMM_EPM0R_PTE_NUS_DPG | EMM_EPMR_PTE_NUS_DPG | EMM_EPMRU_PTE_NUS_DPG | EMM_EPMRR_PTE_NUS_DPG | EMM_EPMM_PTE_NUS_DPG | EMM_EPMMU_PTE_NUS_DPG | EMM_EPMMR_PTE_NUS_DPG | EMM_EPMP_PTE_NUS_DPG | EMM_EPMPU_PTE_NUS_DPG | EMM_EPMPR_PTE_NUS_DPG | EMD_EPD2D_PTE_NUS_DPG | EMD_EPD2DXL0_PTE_NUS_DPG | EMD_EPD2DM10_PTE_NUS_DPG |
| 1 | Date | U.S. All Grades All Formulations Retail Gasoli... | U.S. All Grades Conventional Retail Gasoline P... | U.S. All Grades Reformulated Retail Gasoline P... | U.S. Regular All Formulations Retail Gasoline ... | U.S. Regular Conventional Retail Gasoline Pric... | U.S. Regular Reformulated Retail Gasoline Pric... | U.S. Midgrade All Formulations Retail Gasoline... | U.S. Midgrade Conventional Retail Gasoline Pri... | U.S. Midgrade Reformulated Retail Gasoline Pri... | U.S. Premium All Formulations Retail Gasoline ... | U.S. Premium Conventional Retail Gasoline Pric... | U.S. Premium Reformulated Retail Gasoline Pric... | U.S. No 2 Diesel Retail Prices (Dollars per Ga... | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) R... | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retai... |
| 2 | 1990-08-15 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 1990-09-15 00:00:00 | NaN | NaN | NaN | 1.258 | 1.258 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1990-10-15 00:00:00 | NaN | NaN | NaN | 1.335 | 1.335 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [6]:
import pandas as pd
# Define the path
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1', skipping rows to get to the actual data
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce') # Coerce for non datetime values
# Filter the DataFrame to only include entries from January 2019 onwards
start_date = '2019-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy() # Make a copy
# Drop rows where the date is NaT (not a time), which might have been created due to 'coerce'
df_filtered.dropna(subset=['Date'], inplace=True)
display(df_filtered.head(5))
except Exception as e:
print(f'Notebook cell skipped: {e}')
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 341 | 2019-01-15 | 2.338 | 2.214 | 2.575 | 2.248 | 2.145 | 2.464 | 2.658 | 2.498 | 2.884 | 2.904 | 2.768 | 3.060 | 2.980 | 2.980 | NaN |
| 342 | 2019-02-15 | 2.393 | 2.286 | 2.603 | 2.309 | 2.223 | 2.495 | 2.682 | 2.531 | 2.898 | 2.936 | 2.814 | 3.077 | 2.997 | 2.997 | NaN |
| 343 | 2019-03-15 | 2.594 | 2.503 | 2.774 | 2.516 | 2.443 | 2.673 | 2.854 | 2.720 | 3.045 | 3.107 | 3.006 | 3.223 | 3.076 | 3.076 | NaN |
| 344 | 2019-04-15 | 2.881 | 2.755 | 3.131 | 2.798 | 2.694 | 3.023 | 3.165 | 2.980 | 3.434 | 3.417 | 3.264 | 3.597 | 3.121 | 3.121 | NaN |
| 345 | 2019-05-15 | 2.946 | 2.796 | 3.243 | 2.859 | 2.731 | 3.136 | 3.250 | 3.051 | 3.542 | 3.505 | 3.328 | 3.713 | 3.161 | 3.161 | NaN |
In [7]:
from IPython.display import display
display(df_full.head(3))
display(df_full.tail(3))
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1990-08-15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1990-09-15 | NaN | NaN | NaN | 1.258 | 1.258 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 1990-10-15 | NaN | NaN | NaN | 1.335 | 1.335 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 401 | 2024-01-15 | 3.197 | 3.045 | 3.495 | 3.075 | 2.957 | 3.331 | 3.619 | 3.391 | 3.945 | 3.969 | 3.764 | 4.206 | 3.854 | 3.854 | NaN |
| 402 | 2024-02-15 | 3.328 | 3.188 | 3.605 | 3.212 | 3.102 | 3.446 | 3.729 | 3.511 | 4.042 | 4.077 | 3.888 | 4.296 | 4.044 | 4.044 | NaN |
| 403 | 2024-03-15 | 3.542 | 3.402 | 3.815 | 3.426 | 3.318 | 3.657 | 3.938 | 3.719 | 4.252 | 4.286 | 4.098 | 4.506 | 4.022 | 4.022 | NaN |
In [8]:
import pandas as pd
from IPython.display import display
# Define the path
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')
try:
# Read the sheet named 'Data 1', skipping rows to get to the actual data
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce') # Coerce for non-datetime values
# Find the first non-null value and its corresponding date
first_valid_date = df_full.loc[df_full.notna().any(axis=1), 'Date'].min()
# Convert to string format
start_date = first_valid_date.strftime('%Y-%m-%d')
# Filter the DataFrame to only include entries from the first valid date onwards
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Check the column names in the DataFrame
print(f'Column count: {len(df_filtered.columns)}')
display(df_filtered.columns[:12].tolist())
# Define the columns of interest
columns_of_interest = [
'Date',
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
if all(col in df_filtered.columns for col in columns_of_interest):
df_clean = df_filtered.dropna(subset=columns_of_interest)
display(df_clean.head(5))
else:
print('Some expected columns were not found in the dataset.')
except Exception as e:
print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls Column count: 16
['Date', 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 198 | 2007-02-15 | 2.323 | 2.276 | 2.418 | 2.278 | 2.235 | 2.369 | 2.385 | 2.325 | 2.500 | 2.479 | 2.431 | 2.567 | 2.488 | 2.513 | 2.435 |
| 199 | 2007-03-15 | 2.609 | 2.546 | 2.738 | 2.563 | 2.503 | 2.689 | 2.670 | 2.593 | 2.820 | 2.766 | 2.706 | 2.877 | 2.667 | 2.680 | 2.630 |
| 200 | 2007-04-15 | 2.891 | 2.831 | 3.013 | 2.845 | 2.787 | 2.965 | 2.953 | 2.881 | 3.093 | 3.052 | 2.996 | 3.155 | 2.834 | 2.847 | 2.790 |
| 201 | 2007-05-15 | 3.187 | 3.157 | 3.248 | 3.146 | 3.119 | 3.202 | 3.241 | 3.199 | 3.321 | 3.331 | 3.300 | 3.389 | 2.796 | 2.818 | 2.726 |
| 202 | 2007-06-15 | 3.102 | 3.067 | 3.173 | 3.056 | 3.024 | 3.122 | 3.161 | 3.118 | 3.247 | 3.263 | 3.227 | 3.330 | 2.808 | 2.826 | 2.746 |
In [ ]:
In [9]:
# Display the first few column names
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())
Column count: 16
['Date', 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
In [10]:
# Check the range of dates available
print(f"Date range: {df_clean['Date'].min()} to {df_clean['Date'].max()}")
display(df_clean[['Date', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']].dropna().head(5))
Date range: 2007-02-15 00:00:00 to 2008-11-15 00:00:00
| Date | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | |
|---|---|---|
| 198 | 2007-02-15 | 2.278 |
| 199 | 2007-03-15 | 2.563 |
| 200 | 2007-04-15 | 2.845 |
| 201 | 2007-05-15 | 3.146 |
| 202 | 2007-06-15 | 3.056 |
In [11]:
print('Missing values:', df_clean['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].isna().sum())
Missing values: 0
In [ ]:
# Display the available column names
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())
# Define the target columns for plotting
columns_to_plot_corrected = [
'Regular_All_Formulations', # Exact DataFrame column name
# Additional columns can be added here
]
# Keep only columns that exist in df_clean
columns_to_plot_existing = [col for col in columns_to_plot_corrected if col in df_clean.columns]
# Set 'Date' as the index when needed
if 'Date' not in df_clean.index.names:
df_clean.set_index('Date', inplace=True)
# Plot the available target columns
if columns_to_plot_existing:
df_clean[columns_to_plot_existing].plot(figsize=(14, 7), marker='o', title='U.S. Gasoline Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.grid(True)
plt.legend()
plt.show()
else:
print("None of the intended columns to plot were found in the DataFrame.")
In [ ]:
In [13]:
import plotly.graph_objects as go
# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, # Keep full column name in legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2019-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400, # Wider chart for better visualization
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
In [14]:
import plotly.graph_objects as go
# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column.split(' ')[1], # Simplify name for legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400 # Wider chart for better visualization
)
# Display the figure
fig.show()
In [15]:
print(f'Index name: {df_clean.index.name}')
Index name: Date
In [ ]:
In [16]:
import plotly.graph_objects as go
# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00', # Yellow
'#FF0000', # Red
'#0000FF', # Blue
'#008000'] # Green
# Creating the Plotly figure
fig = go.Figure()
# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
fig.add_trace(go.Scatter(x=df_filtered.index, y=df_filtered[column], name=column, # Keep full column name in legend
mode='lines+markers', line=dict(color=color)))
# Updating the layout
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400, # Wider chart for better visualization
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
In [17]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = [
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each fuel type
for column in columns_to_plot:
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, mode='lines+markers'))
# Update the layout of the figure
fig.update_layout(
title='Interactive U.S. Gasoline and Diesel Retail Prices Since 2010',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white',
width=1400
)
# Display the figure
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [18]:
from IPython.display import display
display(df_clean.head(5))
print(df_clean.info())
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||
| 2007-02-15 | 2.323 | 2.276 | 2.418 | 2.278 | 2.235 | 2.369 | 2.385 | 2.325 | 2.500 | 2.479 | 2.431 | 2.567 | 2.488 | 2.513 | 2.435 |
| 2007-03-15 | 2.609 | 2.546 | 2.738 | 2.563 | 2.503 | 2.689 | 2.670 | 2.593 | 2.820 | 2.766 | 2.706 | 2.877 | 2.667 | 2.680 | 2.630 |
| 2007-04-15 | 2.891 | 2.831 | 3.013 | 2.845 | 2.787 | 2.965 | 2.953 | 2.881 | 3.093 | 3.052 | 2.996 | 3.155 | 2.834 | 2.847 | 2.790 |
| 2007-05-15 | 3.187 | 3.157 | 3.248 | 3.146 | 3.119 | 3.202 | 3.241 | 3.199 | 3.321 | 3.331 | 3.300 | 3.389 | 2.796 | 2.818 | 2.726 |
| 2007-06-15 | 3.102 | 3.067 | 3.173 | 3.056 | 3.024 | 3.122 | 3.161 | 3.118 | 3.247 | 3.263 | 3.227 | 3.330 | 2.808 | 2.826 | 2.746 |
<class 'pandas.DataFrame'> DatetimeIndex: 22 entries, 2007-02-15 to 2008-11-15 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 1 U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 2 U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 3 U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 4 U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 5 U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 6 U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 7 U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 8 U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 9 U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 10 U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 11 U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) 22 non-null float64 12 U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 22 non-null float64 13 U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 22 non-null float64 14 U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) 22 non-null float64 dtypes: float64(15) memory usage: 2.8 KB None
In [19]:
print(df_full.head()) # Print the first few rows of the DataFrame
print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) \ 0 1990-08-15 NaN 1 1990-09-15 NaN 2 1990-10-15 NaN 3 1990-11-15 NaN 4 1990-12-15 NaN U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 1.258 2 1.335 3 1.324 4 NaN U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 1.258 2 1.335 3 1.324 4 NaN U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) \ 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 NaN NaN U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) \ 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN Date range: 1990-08-15 00:00:00 to 2024-03-15 00:00:00
In [20]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = df_filtered.columns
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each column
for column in columns_to_plot:
if column != 'Date': # Exclude 'Date' column from the legend
fig.add_trace(go.Scatter(x=df_filtered['Date'].dt.date, y=df_filtered[column], name=column, mode='lines+markers'))
# Update the layout of the figure
fig.update_layout(
title='Retail Gasoline Prices Over Time',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400
)
# Display the figure
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [21]:
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())
Column count: 15
['U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)']
In [22]:
import pandas as pd
import plotly.graph_objects as go
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Define the columns to plot
columns_to_plot = df_filtered.columns
# Create a Plotly figure
fig = go.Figure()
# Add a line plot for each column with fills
for column in columns_to_plot:
if column != 'Date': # Exclude 'Date' column from the legend
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column))
fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column + '_fill',
fill='tonexty')) # Add fill to the line
# Update the layout of the figure
fig.update_layout(
title='Retail Gasoline Prices Over Time',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
legend_title='Fuel Type',
template='plotly_white',
width=1400,
hovermode='x unified' # Unified hover mode for easier comparison
)
# Display the figure
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [23]:
import pandas as pd
import numpy as np
# Work with the cleaned dataframe
# Select only the numeric columns
numeric_columns = df_clean.select_dtypes(include=[np.number])
# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()
# Define threshold for considering outliers
threshold = 3
# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)
# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
| Outlier Count |
|---|
In [24]:
# Check the data types of the columns
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))
| dtype | |
|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| ... | ... |
| U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | float64 |
15 rows × 1 columns
| Missing Count |
|---|
In [25]:
# Importing necessary library
import numpy as np
# Extracting only the numeric columns for Z-score calculation
numeric_columns = df_clean.select_dtypes(include=np.number)
# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()
# Define threshold for considering outliers
threshold = 3
# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)
# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
| Outlier Count |
|---|
In [26]:
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))
| Outlier Count |
|---|
| dtype | |
|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| ... | ... |
| U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | float64 |
| U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | float64 |
15 rows × 1 columns
| Missing Count |
|---|
In [27]:
import pandas as pd
import numpy as np
from IPython.display import display
# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter the DataFrame to include data since January 2010
start_date = '2010-01-01'
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
# Handle missing values by forward filling
df_clean = df_filtered.ffill()
columns_to_plot = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
def detect_outliers_zscore(data, threshold=3):
z_scores = np.abs((data - data.mean()) / data.std())
return z_scores > threshold
outliers = df_clean[columns_to_plot].apply(detect_outliers_zscore)
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
except Exception as e:
print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
| Outlier Count | |
|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1 |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1 |
| U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1 |
| U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1 |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | 1 |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | 1 |
In [28]:
import pandas as pd
import numpy as np
from IPython.display import display
# Restrict summary statistics to numeric columns so datetime fields do not break variance calculations
numeric_df = df_clean.select_dtypes(include=[np.number])
# Calculate measures of central tendency
central_tendency = numeric_df.describe().loc[['mean', '50%']]
# Calculate measures of dispersion
standard_deviation = numeric_df.std()
variance = numeric_df.var()
range_values = numeric_df.max() - numeric_df.min()
# Create a DataFrame to store the results
statistics_df = pd.DataFrame({
'Mean': central_tendency.loc['mean'],
'Median': central_tendency.loc['50%'],
'Standard Deviation': standard_deviation,
'Variance': variance,
'Range': range_values
})
# Display the statistics DataFrame
display(statistics_df.round(3))
| Mean | Median | Standard Deviation | Variance | Range | |
|---|---|---|---|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | 3.074 | 2.948 | 0.615 | 0.378 | 3.160 |
| U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | 2.979 | 2.866 | 0.608 | 0.369 | 3.050 |
| U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | 3.265 | 3.145 | 0.638 | 0.407 | 3.380 |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | 2.986 | 2.860 | 0.620 | 0.384 | 3.165 |
| U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | 2.902 | 2.808 | 0.616 | 0.379 | 3.083 |
| ... | ... | ... | ... | ... | ... |
| U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | 3.396 | 3.348 | 0.616 | 0.380 | 3.268 |
| U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | 3.633 | 3.584 | 0.689 | 0.475 | 3.643 |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | 3.403 | 3.253 | 0.766 | 0.587 | 3.756 |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | 3.404 | 3.253 | 0.766 | 0.587 | 3.756 |
| U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | NaN | NaN | NaN | NaN | NaN |
15 rows × 5 columns
In [29]:
#Determing correlation of independent variables
import pandas as pd
# Run the analysis on the cleaned dataframe with the selected independent variables
relevant_columns = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Selecting only the relevant columns
relevant_data = df_clean[relevant_columns]
# Calculating the correlation matrix
correlation_matrix = relevant_data.corr()
# Displaying the correlation matrix
display(correlation_matrix.round(3))
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1.000 | 0.999 | 0.980 | 0.957 | 0.943 | 0.943 | NaN |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | 0.999 | 1.000 | 0.975 | 0.949 | 0.938 | 0.938 | NaN |
| U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | 0.980 | 0.975 | 1.000 | 0.995 | 0.958 | 0.958 | NaN |
| U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | 0.957 | 0.949 | 0.995 | 1.000 | 0.953 | 0.953 | NaN |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | 0.943 | 0.938 | 0.958 | 0.953 | 1.000 | 1.000 | NaN |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | 0.943 | 0.938 | 0.958 | 0.953 | 1.000 | 1.000 | NaN |
| U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [30]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols' # Add a trendline using ordinary least squares method
)
# Display the scatter plot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [31]:
import pandas as pd
from IPython.display import display
# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Display the column names to identify the correct column for gasoline prices
print(f'Column count: {len(df_full.columns)}')
display(df_full.columns[:12].tolist())
except Exception as e:
print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls Column count: 16
['Date', 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)', 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
In [32]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Histogram of U.S. Regular Gasoline Prices',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'}
)
# Display the histogram
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [33]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date'
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
# Convert the 'Date' column to datetime format
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols' # Add a trendline using ordinary least squares method (linear regression)
)
# Display the scatter plot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [34]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019]
# Create the scatter plot
fig = px.scatter(df_filtered, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Scatter Plot of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
trendline='ols')
# Display the scatter plot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [35]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Add a 'Year' column to the DataFrame
df_full['Year'] = df_full['Date'].dt.year
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year
fig = px.box(df_full, x='Year', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Colorful Boxplot of U.S. Regular Gasoline Prices by Year',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'Year': 'Year'},
color='Year' # This makes the plot colorful
)
# Display the boxplot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [36]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Add a 'Year' column and a 'Quarter' column to the DataFrame
df_full['Year'] = df_full['Date'].dt.year
df_full['Quarter'] = df_full['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_full['QuarterStr'] = df_full['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_full, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [37]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2010 onward and avoid SettingWithCopyWarning by using copy()
df_filtered = df_full[df_full['Date'].dt.year >= 2010].copy()
# Properly use .loc to avoid SettingWithCopyWarning
df_filtered['Year'] = df_filtered['Date'].dt.year
df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2010 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [38]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward and avoid SettingWithCopyWarning by using copy()
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Properly use .loc to avoid SettingWithCopyWarning
df_filtered['Year'] = df_filtered['Date'].dt.year
df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
# Convert 'Quarter' to string for compatibility
df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)
# Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'QuarterStr': 'Quarter'},
color='QuarterStr' # This makes the plot colorful and distinguishes each quarter
)
# Display the boxplot
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [39]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Extract the year from the 'Date' column
df_full['Year'] = df_full['Date'].dt.year
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
# Color the histogram bars by the 'Year' to visualize how distributions change over time
fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='Year',
title='Histogram of U.S. Regular Gasoline Prices by Year',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'Year': 'Year'},
barmode='overlay', # Overlay the histograms to see how distributions compare across years
histnorm='percent', # Normalize the histogram to show percentage rather than count
nbins=30 # Adjust the number of bins for finer granularity or clarity
)
# Display the histogram
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [40]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Create a 'YearQuarter' column for coloring
df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'YearQuarter': 'Year-Quarter'},
barmode='overlay', # Overlay the histograms to compare across quarters
histnorm='percent', # Normalize to show percentage
nbins=20 # Number of bins for granularity
)
# Display the histogram
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [41]:
import pandas as pd
import plotly.express as px
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Filter for data from 2019 onward
df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
# Create a 'YearQuarter' column for coloring
df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)
# Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
'YearQuarter': 'Year-Quarter'},
barmode='overlay', # Overlay the histograms to compare across quarters
histnorm='percent', # Normalize to show percentage
nbins=20, # Number of bins for granularity
)
# Update y-axis title to reflect what the percentages represent
fig.update_layout(yaxis_title='Percentage of Total Observations')
# Display the histogram
fig.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
In [42]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Define the path to the Excel file
xls_path = resolve_dataset_path()
try:
# Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Rename the first column to 'Date' and convert to datetime
df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
# Drop the specified column
df_full.drop(columns=['U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'], inplace=True)
# Drop rows with any NaN values in the target column
df_full.dropna(subset=['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], inplace=True)
# Reset the index after the dropna operation
df_full.reset_index(drop=True, inplace=True)
# Feature Engineering: Use previous prices as features to predict the next price
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
# Drop the first row which now has an NaN value after the shift operation
df_full.dropna(inplace=True)
# Define features and target variable
X = df_full[['PrevPrice']] # Features from previous prices
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'] # Target
# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model on the entire dataset
model.fit(X, y)
# Predict on the training set itself since we don't have a separate test set
predictions = model.predict(X)
# Calculate and print the mean absolute error
mae = mean_absolute_error(y, predictions)
print("Mean Absolute Error on the entire dataset:", mae)
# Optionally, visualize the actual vs predicted prices
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(y, predictions, alpha=0.3)
plt.plot(y, y, color='red') # Perfect predictions line
plt.title('Actual vs Predicted Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()
except Exception as e:
print(f'Notebook cell skipped: {e}')
Mean Absolute Error on the entire dataset: 0.06432608148404984
In [43]:
# Continue forecasting from the trained model and working dataframe
# Find the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
# Initialize a list to keep track of our predictions and input features
predictions = []
input_feature = last_known_price
# Predict the next three months
for _ in range(3):
# Predict the next price
next_month_price = model.predict([[input_feature]])[0]
# Store the prediction
predictions.append(next_month_price)
# Update the input_feature to the newly predicted price
input_feature = next_month_price
# Display the predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)
display(predicted_prices.to_frame('Predicted Price'))
| Predicted Price | |
|---|---|
| 2024-04-30 | 3.58903 |
| 2024-05-31 | 3.65322 |
| 2024-06-30 | 3.64383 |
In [44]:
import plotly.graph_objects as go
import pandas as pd
# Plot the forecast against the working dataframe and generated predictions
# Create a DataFrame for the predicted data with appropriate dates
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_data = pd.DataFrame({'Date': prediction_dates, 'Predicted Prices': [3.58903, 3.65322, 3.64383]})
# Create the figure for plotting
fig = go.Figure()
# Add the historical data
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices', line=dict(color='blue')))
# Add the predicted data
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))
# Update layout for a more interactive experience
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Enable zoom and pan
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True), type="date"))
# Show the figure
fig.show()
In [45]:
# Use the prepared working dataframe for feature construction
# Ensure the feature used for prediction is correctly set
X = df_full[['PrevPrice']] # Use the lagged price feature for model input
# Generate predictions for the entire dataset
predictions = model.predict(X)
# Now, ensure 'y' is the target variable for the entire dataset
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Ensure y and predictions are aligned; they should be if predictions were made for the entire dataset
residuals = y - predictions
# Now, the residuals can be visualized the residuals as previously described without encountering shape mismatch issues
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.scatter(y.index, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals of Predictions')
plt.xlabel('Index')
plt.ylabel('Residuals')
plt.show()
In [46]:
# Evaluate the RandomForestRegressor model with the prepared feature set
import pandas as pd
import matplotlib.pyplot as plt
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(len(X.columns)).plot(kind='barh', colormap='winter')
plt.title('Feature Importances')
plt.show()
In [47]:
import plotly.graph_objects as go
# Combine historical data with future predictions for plotting
# Concatenate historical and predicted data for a continuous timeline
full_dates = pd.concat([df_full['Date'], predicted_data['Date']])
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], predicted_data['Predicted Prices']])
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices'))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))
# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
fig.show()
In [48]:
# Starting from the last known price in the dataset
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
# Initialize a list to store predictions
monthly_predictions = [last_known_price] # Starting with the last known price
# Predict the next 12 months
for _ in range(12):
# The model expects a DataFrame, so we create one with the current input feature
current_feature = pd.DataFrame([monthly_predictions[-1]], columns=['PrevPrice'])
# Predict the next price
next_month_price = model.predict(current_feature)[0]
# Append the prediction for next month
monthly_predictions.append(next_month_price)
# Remove the first element (last known price) to get only future predictions
future_predictions = monthly_predictions[1:]
# Create a date range for these predictions
last_date = df_full['Date'].iloc[-1]
prediction_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=12, freq='ME')
# Plot the predictions
plt.figure(figsize=(10, 6))
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', marker='o', linestyle='--', color='red')
plt.title('Predicted Gasoline Prices for the Next Year')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
In [49]:
import pandas as pd
import matplotlib.pyplot as plt
# Forecast forward from the last known date in the historical dataframe
# and 'model' is the trained Random Forest model
# Initialize predictions list with the last known price to start the prediction chain
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
# Generate predictions for the next 12 months
for _ in range(12):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')
# Combine historical dates and predicted dates
full_dates = pd.concat([df_full['Date'], pd.Series(prediction_dates)])
# Combine historical prices and predicted prices
# The last known price is included in both historical and predicted prices for continuity
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], pd.Series(future_predictions)])
# Plotting
plt.figure(figsize=(14, 7))
plt.plot(df_full['Date'], df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], label='Historical Prices', color='blue')
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', linestyle='--', marker='o', color='red')
plt.title('Historical and Predicted Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
In [50]:
import plotly.graph_objects as go
import pandas as pd
# Generate forward predictions from the historical dataframe and trained model
# Generate predictions for the next 12 months, starting from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
for _ in range(12):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')
# Plotting with Plotly for interactive exploration
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices'))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))
# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Show the figure
fig.show()
In [51]:
import plotly.graph_objects as go
import pandas as pd
# Extend the trained model beyond the historical dataframe
# Start predictions from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]
# Predict for the next 5 years (60 months)
for _ in range(60):
input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
# Exclude the first element (last known price) to have only future predictions
future_predictions = predictions[1:]
# Create a date range for the next 5 years of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=60, freq='ME')
# Creating the Plotly figure
fig = go.Figure()
# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
mode='lines', name='Historical Prices', line=dict(color='blue')))
# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))
# Update the layout for better visualization and interactivity
fig.update_layout(title='Historical and Predicted Gasoline Prices for the Next 5 Years',
xaxis_title='Date',
yaxis_title='Price (Dollars per Gallon)',
template='plotly_white')
# Enable zoom and pan features
fig.update_layout(xaxis_rangeslider_visible=True)
# Show the figure
fig.show()
In [52]:
# Find the earliest date in the historical data
earliest_date = df_full['Date'].min()
print(f"The model is trained on data starting from: {earliest_date}")
The model is trained on data starting from: 2007-02-15 00:00:00
In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Use the complete dataset sorted by date
# Check the earliest date just for confirmation
print(f"Earliest date in the dataset: {df_full['Date'].min()}")
# Feature Engineering: Create 'PrevPrice' based on 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full.dropna(inplace=True) # Drop the first row which now lacks 'PrevPrice'
# Splitting the data into features (X) and target (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
predictions = model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")
# Now the model is trained from the earliest occurrence of price in the dataset
Earliest date in the dataset: 2007-02-15 00:00:00 Mean Absolute Error: 0.16113744715447187
In [54]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
# Work from the loaded dataset sorted by date, starting from 1990
# Check to ensure the start date is indeed from 1990
print(f"Dataset start date: {df_full['Date'].min()}")
# Prepare the feature 'PrevPrice'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full = df_full.dropna() # Removing the first row which now has a NaN value for 'PrevPrice'
# Define the features (X) and target variable (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the model on the entire dataset
model.fit(X, y)
# Forecast the next 12 months
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = []
for _ in range(12):
# Using the last known price to predict the next one
input_feature = pd.DataFrame([last_known_price], columns=['PrevPrice'])
next_price = model.predict(input_feature)[0]
predictions.append(next_price)
last_known_price = next_price # Update the last known price for the next prediction
# Output the predictions
prediction_dates = pd.date_range(start=df_full['Date'].max() + pd.Timedelta(days=1), periods=12, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)
display(predicted_prices.to_frame('Predicted Price'))
Dataset start date: 2007-03-15 00:00:00
| Predicted Price | |
|---|---|
| 2024-03-31 | 3.59018 |
| 2024-04-30 | 3.67058 |
| 2024-05-31 | 3.69817 |
| 2024-06-30 | 3.58821 |
| 2024-07-31 | 3.67058 |
| 2024-08-31 | 3.69817 |
| 2024-09-30 | 3.58821 |
| 2024-10-31 | 3.67058 |
| 2024-11-30 | 3.69817 |
| 2024-12-31 | 3.58821 |
| 2025-01-31 | 3.67058 |
| 2025-02-28 | 3.69817 |
In [55]:
print(f"Earliest date: {df_full['Date'].min()}")
display(X.head(5))
Earliest date: 2007-04-15 00:00:00
| PrevPrice | |
|---|---|
| 197 | 2.563 |
| 198 | 2.845 |
| 199 | 3.146 |
| 200 | 3.056 |
| 201 | 2.965 |
In [56]:
print(f"Earliest date: {df_full['Date'].min()}")
Earliest date: 2007-04-15 00:00:00
In [57]:
display(X.head(5))
| PrevPrice | |
|---|---|
| 197 | 2.563 |
| 198 | 2.845 |
| 199 | 3.146 |
| 200 | 3.056 |
| 201 | 2.965 |
In [62]:
# This line restricts the dataset to start from 2019
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
In [59]:
print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")
Date range: 2007-04-15 00:00:00 to 2024-03-15 00:00:00
In [63]:
from IPython.display import display
if 'df_full' in globals():
display(df_full.head(3))
display(df_full.tail(3))
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | PrevPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 197 | 2007-04-15 | 2.891 | 2.831 | 3.013 | 2.845 | 2.787 | 2.965 | 2.953 | 2.881 | 3.093 | 3.052 | 2.996 | 3.155 | 2.834 | 2.847 | 2.563 |
| 198 | 2007-05-15 | 3.187 | 3.157 | 3.248 | 3.146 | 3.119 | 3.202 | 3.241 | 3.199 | 3.321 | 3.331 | 3.300 | 3.389 | 2.796 | 2.818 | 2.845 |
| 199 | 2007-06-15 | 3.102 | 3.067 | 3.173 | 3.056 | 3.024 | 3.122 | 3.161 | 3.118 | 3.247 | 3.263 | 3.227 | 3.330 | 2.808 | 2.826 | 3.146 |
| Date | U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | PrevPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 398 | 2024-01-15 | 3.197 | 3.045 | 3.495 | 3.075 | 2.957 | 3.331 | 3.619 | 3.391 | 3.945 | 3.969 | 3.764 | 4.206 | 3.854 | 3.854 | 3.134 |
| 399 | 2024-02-15 | 3.328 | 3.188 | 3.605 | 3.212 | 3.102 | 3.446 | 3.729 | 3.511 | 4.042 | 4.077 | 3.888 | 4.296 | 4.044 | 4.044 | 3.075 |
| 400 | 2024-03-15 | 3.542 | 3.402 | 3.815 | 3.426 | 3.318 | 3.657 | 3.938 | 3.719 | 4.252 | 4.286 | 4.098 | 4.506 | 4.022 | 4.022 | 3.212 |
In [ ]:
import importlib
import subprocess
import sys
import pandas as pd
import numpy as np
try:
from statsmodels.formula.api import ols
except ModuleNotFoundError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'statsmodels'])
from statsmodels.formula.api import ols
# Define the path to the Excel file
xls_path = resolve_dataset_path()
# Load the data from the specified sheet with the data starting on row 2
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Ensure the 'Date' column is datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# Convert 'Date' to a numerical format for regression analysis (ordinal)
df['Date_ordinal'] = df['Date'].apply(lambda x: x.toordinal() if pd.notna(x) else np.nan)
df = df.dropna(subset=['Date_ordinal', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'])
# Fit the linear regression model
model = ols("Q('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)') ~ Date_ordinal", data=df).fit()
# Print the summary of the model results
print(model.summary().as_text().split('Notes:')[0])
# Extract the coefficients to formulate the regression equation
intercept, slope = model.params
print(f"Regression Equation: Price = {intercept:.4f} + {slope:.4f} * Date_ordinal")
In [74]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
# Select the U.S. Regular All Formulations Retail Gasoline Prices column
data = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']
# Handle missing values by forward filling then backward filling to ensure no gaps
data_filled = data.ffill().bfill()
# Decompose the series to understand its components
decomposition = seasonal_decompose(data_filled, model='additive', period=12)
fig = decomposition.plot()
plt.show()
# Fit a SARIMA model
# Note: p, d, q values and seasonal P, D, Q, M should be selected based on ACF, PACF plots and domain understanding
model = SARIMAX(data_filled, order=(1,1,1), seasonal_order=(1,1,1,12))
results = model.fit(disp=False)
# Check diagnostics to ensure no assumptions are violated
results.plot_diagnostics(figsize=(15, 12))
plt.show()
# Forecast for the next 12 months
forecast = results.get_forecast(steps=12)
mean_forecast = forecast.predicted_mean
confidence_intervals = forecast.conf_int()
# Dates for the forecast
forecast_dates = pd.date_range(data_filled.index[-1], periods=12, freq='ME')
# Create a plot with historical data and forecast
fig = go.Figure()
# Historical data
fig.add_trace(go.Scatter(x=data_filled.index, y=data_filled, mode='lines', name='Historical Prices'))
# Forecast data
fig.add_trace(go.Scatter(x=forecast_dates, y=mean_forecast, mode='lines', name='Forecast'))
# Fill area between confidence intervals
fig.add_trace(go.Scatter(x=np.concatenate([forecast_dates, forecast_dates[::-1]]),
y=np.concatenate([confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1][::-1]]),
fill='toself', fillcolor='rgba(0,100,80,0.2)', line=dict(color='rgba(255,255,255,0)'),
name='Confidence Interval'))
# Update layout
fig.update_layout(title='Forecast of U.S. Regular Gasoline Prices', xaxis_title='Date', yaxis_title='Price (Dollars per Gallon)', template='plotly_white')
fig.show()
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq) C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting. self._init_dates(dates, freq)
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`. return get_prediction_index(
In [71]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
# Convert dates into numerical format for regression
df['Date_ordinal'] = df.index.map(dt.datetime.toordinal)
data_filled = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].ffill().bfill()
# Prepare X and y for regression
X = df['Date_ordinal'].values.reshape(-1, 1)
y = data_filled.values
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)
# Print the coefficients of the model
print("Intercept (b0):", model.intercept_)
print("Slope (b1) for Date:", model.coef_[0])
# Plotting the results with formatted dates
plt.figure(figsize=(12, 6))
plt.scatter(df.index, y, color='blue', label='Actual Prices')
plt.plot(df.index, model.predict(X), color='red', linewidth=2, label='Fitted Line')
plt.title('Linear Regression on Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)')
# Format the x-axis to improve readability
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator()) # Use AutoDateLocator without additional parameters
plt.gca().xaxis.set_major_formatter(mdates.ConciseDateFormatter(mdates.AutoDateLocator())) # Use concise formatter for cleaner labels
plt.gcf().autofmt_xdate() # Automatically rotate date labels for better readability
plt.legend()
plt.show()
Intercept (b0): -155.08130894484586 Slope (b1) for Date: 0.0002146200118198708
In [73]:
import subprocess
import sys
import pandas as pd
import matplotlib.pyplot as plt
try:
import seaborn as sns
except ModuleNotFoundError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
import seaborn as sns
# File path from the original code
xls_path = resolve_dataset_path()
# Load the dataset
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
# Select the relevant columns used in the analysis
relevant_columns = [
'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]
# Selecting only the relevant columns
df_clean = df[relevant_columns]
# Calculating the correlation matrix
correlation_matrix = df_clean.corr()
# Displaying the correlation matrix
display(correlation_matrix.round(3))
# Visualization of the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Fuel Price Variables')
plt.show()
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | |
|---|---|---|---|---|---|---|---|
| U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1.000 | 1.000 | 0.995 | 0.988 | 0.982 | 0.941 | 0.916 |
| U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) | 1.000 | 1.000 | 0.993 | 0.985 | 0.980 | 0.936 | 0.913 |
| U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) | 0.995 | 0.993 | 1.000 | 0.998 | 0.986 | 0.954 | 0.919 |
| U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) | 0.988 | 0.985 | 0.998 | 1.000 | 0.984 | 0.947 | 0.924 |
| U.S. No 2 Diesel Retail Prices (Dollars per Gallon) | 0.982 | 0.980 | 0.986 | 0.984 | 1.000 | 1.000 | 1.000 |
| U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) | 0.941 | 0.936 | 0.954 | 0.947 | 1.000 | 1.000 | 1.000 |
| U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) | 0.916 | 0.913 | 0.919 | 0.924 | 1.000 | 1.000 | 1.000 |
In [ ]: