In [1]:
import warnings
import pandas as pd
from pathlib import Path
from IPython.display import display

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 140)

def resolve_dataset_path(filename='PET_PRI_GND_DCUS_NUS_M.xls'):
    candidates = [
        Path.cwd() / filename,
        Path.cwd() / 'Capstone Project' / filename,
        Path.home() / 'OneDrive' / 'Desktop' / 'GGU' / 'Classes Spring 2024 Trimester' / 'Data 190 Capstone' / 'Capstone Project' / filename,
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate

    matches = list(Path.cwd().glob(f'**/{filename}'))
    if matches:
        return matches[0]

    raise FileNotFoundError(f'Could not locate {filename} from {Path.cwd()}')

print('Notebook display cleanup loaded.')
Notebook display cleanup loaded.
In [ ]:
!pip install xlrd statsmodels seaborn scikit-learn plotly matplotlib pandas numpy
In [3]:
import pandas as pd
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

# Read the .xls file
try:
    df_full = pd.read_excel(xls_path, sheet_name=0, engine=None)
    df_head = df_full.head(3)
    df_tail = df_full.tail(3)
    display(df_head)
    display(df_tail)
except Exception as e:
    print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6
0 NaN Workbook Contents NaN NaN NaN NaN NaN
1 NaN U.S. Gasoline and Diesel Retail Prices NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN
Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6
12 NaN Source: Energy Information Administration NaN NaN NaN NaN
13 NaN For Help, Contact: infoctr@eia.gov NaN NaN NaN NaN
14 NaN NaN (202) 586-8800 NaN NaN 4/1/2024 5:32:13 PM NaN
In [ ]:
from IPython.display import display

if 'df_full' in globals():
    display(df_full.head(3))
    display(df_full.tail(3))
In [5]:
#succesfully target the data
import pandas as pd

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

# Attempt to read the sheet named 'Data 1' which contains the actual data
try:
    df_full = pd.read_excel(xls_path, sheet_name='Data 1')
    df_head = df_full.head()
    display(df_head)
except Exception as e:
    print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Back to Contents Data 1: U.S. Gasoline and Diesel Retail Prices Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13 Unnamed: 14 Unnamed: 15
0 Sourcekey EMM_EPM0_PTE_NUS_DPG EMM_EPM0U_PTE_NUS_DPG EMM_EPM0R_PTE_NUS_DPG EMM_EPMR_PTE_NUS_DPG EMM_EPMRU_PTE_NUS_DPG EMM_EPMRR_PTE_NUS_DPG EMM_EPMM_PTE_NUS_DPG EMM_EPMMU_PTE_NUS_DPG EMM_EPMMR_PTE_NUS_DPG EMM_EPMP_PTE_NUS_DPG EMM_EPMPU_PTE_NUS_DPG EMM_EPMPR_PTE_NUS_DPG EMD_EPD2D_PTE_NUS_DPG EMD_EPD2DXL0_PTE_NUS_DPG EMD_EPD2DM10_PTE_NUS_DPG
1 Date U.S. All Grades All Formulations Retail Gasoli... U.S. All Grades Conventional Retail Gasoline P... U.S. All Grades Reformulated Retail Gasoline P... U.S. Regular All Formulations Retail Gasoline ... U.S. Regular Conventional Retail Gasoline Pric... U.S. Regular Reformulated Retail Gasoline Pric... U.S. Midgrade All Formulations Retail Gasoline... U.S. Midgrade Conventional Retail Gasoline Pri... U.S. Midgrade Reformulated Retail Gasoline Pri... U.S. Premium All Formulations Retail Gasoline ... U.S. Premium Conventional Retail Gasoline Pric... U.S. Premium Reformulated Retail Gasoline Pric... U.S. No 2 Diesel Retail Prices (Dollars per Ga... U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) R... U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retai...
2 1990-08-15 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1990-09-15 00:00:00 NaN NaN NaN 1.258 1.258 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1990-10-15 00:00:00 NaN NaN NaN 1.335 1.335 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [6]:
import pandas as pd

# Define the path
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1', skipping rows to get to the actual data
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')  # Coerce for non datetime values 

    # Filter the DataFrame to only include entries from January 2019 onwards
    start_date = '2019-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()  # Make a copy 
    
    # Drop rows where the date is NaT (not a time), which might have been created due to 'coerce'
    df_filtered.dropna(subset=['Date'], inplace=True)

    display(df_filtered.head(5))
except Exception as e:
    print(f'Notebook cell skipped: {e}')
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
341 2019-01-15 2.338 2.214 2.575 2.248 2.145 2.464 2.658 2.498 2.884 2.904 2.768 3.060 2.980 2.980 NaN
342 2019-02-15 2.393 2.286 2.603 2.309 2.223 2.495 2.682 2.531 2.898 2.936 2.814 3.077 2.997 2.997 NaN
343 2019-03-15 2.594 2.503 2.774 2.516 2.443 2.673 2.854 2.720 3.045 3.107 3.006 3.223 3.076 3.076 NaN
344 2019-04-15 2.881 2.755 3.131 2.798 2.694 3.023 3.165 2.980 3.434 3.417 3.264 3.597 3.121 3.121 NaN
345 2019-05-15 2.946 2.796 3.243 2.859 2.731 3.136 3.250 3.051 3.542 3.505 3.328 3.713 3.161 3.161 NaN
In [7]:
from IPython.display import display
display(df_full.head(3))
display(df_full.tail(3))
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
0 1990-08-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1990-09-15 NaN NaN NaN 1.258 1.258 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1990-10-15 NaN NaN NaN 1.335 1.335 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
401 2024-01-15 3.197 3.045 3.495 3.075 2.957 3.331 3.619 3.391 3.945 3.969 3.764 4.206 3.854 3.854 NaN
402 2024-02-15 3.328 3.188 3.605 3.212 3.102 3.446 3.729 3.511 4.042 4.077 3.888 4.296 4.044 4.044 NaN
403 2024-03-15 3.542 3.402 3.815 3.426 3.318 3.657 3.938 3.719 4.252 4.286 4.098 4.506 4.022 4.022 NaN
In [8]:
import pandas as pd
from IPython.display import display

# Define the path
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1', skipping rows to get to the actual data
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')  # Coerce for non-datetime values

    # Find the first non-null value and its corresponding date
    first_valid_date = df_full.loc[df_full.notna().any(axis=1), 'Date'].min()

    # Convert to string format
    start_date = first_valid_date.strftime('%Y-%m-%d')

    # Filter the DataFrame to only include entries from the first valid date onwards
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()

    # Check the column names in the DataFrame
    print(f'Column count: {len(df_filtered.columns)}')
    display(df_filtered.columns[:12].tolist())

    # Define the columns of interest
    columns_of_interest = [
        'Date',
        'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
    ]

    if all(col in df_filtered.columns for col in columns_of_interest):
        df_clean = df_filtered.dropna(subset=columns_of_interest)
        display(df_clean.head(5))
    else:
        print('Some expected columns were not found in the dataset.')

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Column count: 16
['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
198 2007-02-15 2.323 2.276 2.418 2.278 2.235 2.369 2.385 2.325 2.500 2.479 2.431 2.567 2.488 2.513 2.435
199 2007-03-15 2.609 2.546 2.738 2.563 2.503 2.689 2.670 2.593 2.820 2.766 2.706 2.877 2.667 2.680 2.630
200 2007-04-15 2.891 2.831 3.013 2.845 2.787 2.965 2.953 2.881 3.093 3.052 2.996 3.155 2.834 2.847 2.790
201 2007-05-15 3.187 3.157 3.248 3.146 3.119 3.202 3.241 3.199 3.321 3.331 3.300 3.389 2.796 2.818 2.726
202 2007-06-15 3.102 3.067 3.173 3.056 3.024 3.122 3.161 3.118 3.247 3.263 3.227 3.330 2.808 2.826 2.746
In [ ]:
 
In [9]:
# Display the first few column names 
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())
Column count: 16
['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
In [10]:
# Check the range of dates available
print(f"Date range: {df_clean['Date'].min()} to {df_clean['Date'].max()}")
display(df_clean[['Date', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']].dropna().head(5))
Date range: 2007-02-15 00:00:00 to 2008-11-15 00:00:00
Date U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)
198 2007-02-15 2.278
199 2007-03-15 2.563
200 2007-04-15 2.845
201 2007-05-15 3.146
202 2007-06-15 3.056
In [11]:
print('Missing values:', df_clean['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].isna().sum())
Missing values: 0
In [ ]:
# Display the available column names
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())

# Define the target columns for plotting
columns_to_plot_corrected = [
    'Regular_All_Formulations',  # Exact DataFrame column name
    # Additional columns can be added here
]

# Keep only columns that exist in df_clean
columns_to_plot_existing = [col for col in columns_to_plot_corrected if col in df_clean.columns]

# Set 'Date' as the index when needed
if 'Date' not in df_clean.index.names:
    df_clean.set_index('Date', inplace=True)

# Plot the available target columns
if columns_to_plot_existing:
    df_clean[columns_to_plot_existing].plot(figsize=(14, 7), marker='o', title='U.S. Gasoline Prices Over Time')
    plt.xlabel('Date')
    plt.ylabel('Price (Dollars per Gallon)')
    plt.grid(True)
    plt.legend()
    plt.show()
else:
    print("None of the intended columns to plot were found in the DataFrame.")
In [ ]:
 
In [13]:
import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column,  # Keep full column name in legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2019-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400,  # Wider chart for better visualization
    hovermode='x unified'  # Unified hover mode for easier comparison
)

# Display the figure
fig.show()
In [14]:
import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column.split(' ')[1],  # Simplify name for legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400  # Wider chart for better visualization
)

# Display the figure
fig.show()
In [15]:
print(f'Index name: {df_clean.index.name}')
Index name: Date
In [ ]:
 
In [16]:
import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered.index, y=df_filtered[column], name=column,  # Keep full column name in legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400,  # Wider chart for better visualization
    hovermode='x unified'  # Unified hover mode for easier comparison
)

# Display the figure
fig.show()
In [17]:
import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = [
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
    ]
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each fuel type
    for column in columns_to_plot:
        fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, mode='lines+markers'))
    
    # Update the layout of the figure
    fig.update_layout(
        title='Interactive U.S. Gasoline and Diesel Retail Prices Since 2010',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        template='plotly_white',
        width=1400
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [18]:
from IPython.display import display
display(df_clean.head(5))
print(df_clean.info())
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
Date
2007-02-15 2.323 2.276 2.418 2.278 2.235 2.369 2.385 2.325 2.500 2.479 2.431 2.567 2.488 2.513 2.435
2007-03-15 2.609 2.546 2.738 2.563 2.503 2.689 2.670 2.593 2.820 2.766 2.706 2.877 2.667 2.680 2.630
2007-04-15 2.891 2.831 3.013 2.845 2.787 2.965 2.953 2.881 3.093 3.052 2.996 3.155 2.834 2.847 2.790
2007-05-15 3.187 3.157 3.248 3.146 3.119 3.202 3.241 3.199 3.321 3.331 3.300 3.389 2.796 2.818 2.726
2007-06-15 3.102 3.067 3.173 3.056 3.024 3.122 3.161 3.118 3.247 3.263 3.227 3.330 2.808 2.826 2.746
<class 'pandas.DataFrame'>
DatetimeIndex: 22 entries, 2007-02-15 to 2008-11-15
Data columns (total 15 columns):
 #   Column                                                                           Non-Null Count  Dtype  
---  ------                                                                           --------------  -----  
 0   U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)     22 non-null     float64
 1   U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)         22 non-null     float64
 2   U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)         22 non-null     float64
 3   U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)        22 non-null     float64
 4   U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 5   U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 6   U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)       22 non-null     float64
 7   U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)           22 non-null     float64
 8   U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)           22 non-null     float64
 9   U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)        22 non-null     float64
 10  U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 11  U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 12  U.S. No 2 Diesel Retail Prices (Dollars per Gallon)                              22 non-null     float64
 13  U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)  22 non-null     float64
 14  U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)      22 non-null     float64
dtypes: float64(15)
memory usage: 2.8 KB
None
In [19]:
print(df_full.head())  # Print the first few rows of the DataFrame
print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")
        Date  U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0 1990-08-15                                                NaN                              
1 1990-09-15                                                NaN                              
2 1990-10-15                                                NaN                              
3 1990-11-15                                                NaN                              
4 1990-12-15                                                NaN                              

   U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                          
1                                                NaN                          
2                                                NaN                          
3                                                NaN                          
4                                                NaN                          

   U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                          
1                                                NaN                          
2                                                NaN                          
3                                                NaN                          
4                                                NaN                          

   U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                           
1                                              1.258                           
2                                              1.335                           
3                                              1.324                           
4                                                NaN                           

   U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                              1.258                       
2                                              1.335                       
3                                              1.324                       
4                                                NaN                       

   U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                                NaN                       
2                                                NaN                       
3                                                NaN                       
4                                                NaN                       

   U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                            
1                                                NaN                            
2                                                NaN                            
3                                                NaN                            
4                                                NaN                            

   U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                        
1                                                NaN                        
2                                                NaN                        
3                                                NaN                        
4                                                NaN                        

   U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                        
1                                                NaN                        
2                                                NaN                        
3                                                NaN                        
4                                                NaN                        

   U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                           
1                                                NaN                           
2                                                NaN                           
3                                                NaN                           
4                                                NaN                           

   U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                                NaN                       
2                                                NaN                       
3                                                NaN                       
4                                                NaN                       

   U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)  U.S. No 2 Diesel Retail Prices (Dollars per Gallon)  \
0                                                NaN                                                                    NaN     
1                                                NaN                                                                    NaN     
2                                                NaN                                                                    NaN     
3                                                NaN                                                                    NaN     
4                                                NaN                                                                    NaN     

   U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)  \
0                                                NaN                                 
1                                                NaN                                 
2                                                NaN                                 
3                                                NaN                                 
4                                                NaN                                 

   U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)  
0                                                NaN                            
1                                                NaN                            
2                                                NaN                            
3                                                NaN                            
4                                                NaN                            
Date range: 1990-08-15 00:00:00 to 2024-03-15 00:00:00
In [20]:
import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = df_filtered.columns
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each column
    for column in columns_to_plot:
        if column != 'Date':  # Exclude 'Date' column from the legend
            fig.add_trace(go.Scatter(x=df_filtered['Date'].dt.date, y=df_filtered[column], name=column, mode='lines+markers'))
    
    # Update the layout of the figure
    fig.update_layout(
        title='Retail Gasoline Prices Over Time',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        legend_title='Fuel Type',
        template='plotly_white',
        width=1400
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
No description has been provided for this image
In [21]:
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())
Column count: 15
['U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)']
In [22]:
import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = df_filtered.columns
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each column with fills
    for column in columns_to_plot:
        if column != 'Date':  # Exclude 'Date' column from the legend
            fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column))
            fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column + '_fill',
                                     fill='tonexty'))  # Add fill to the line
    
    # Update the layout of the figure
    fig.update_layout(
        title='Retail Gasoline Prices Over Time',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        legend_title='Fuel Type',
        template='plotly_white',
        width=1400,
        hovermode='x unified'  # Unified hover mode for easier comparison
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [23]:
import pandas as pd
import numpy as np

# Work with the cleaned dataframe
# Select only the numeric columns
numeric_columns = df_clean.select_dtypes(include=[np.number])

# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()

# Define threshold for considering outliers
threshold = 3

# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)

# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
Outlier Count
In [24]:
# Check the data types of the columns
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))
dtype
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) float64
... ...
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) float64

15 rows × 1 columns

Missing Count
In [25]:
# Importing necessary library
import numpy as np

# Extracting only the numeric columns for Z-score calculation
numeric_columns = df_clean.select_dtypes(include=np.number)

# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()

# Define threshold for considering outliers
threshold = 3

# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)

# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
Outlier Count
In [26]:
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))
Outlier Count
dtype
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) float64
... ...
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) float64
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) float64
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) float64

15 rows × 1 columns

Missing Count
In [27]:
import pandas as pd
import numpy as np
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)

    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')

    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()

    # Handle missing values by forward filling
    df_clean = df_filtered.ffill()

    columns_to_plot = [
        'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
    ]

    def detect_outliers_zscore(data, threshold=3):
        z_scores = np.abs((data - data.mean()) / data.std())
        return z_scores > threshold

    outliers = df_clean[columns_to_plot].apply(detect_outliers_zscore)
    outlier_counts = outliers.sum().sort_values(ascending=False)
    display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Outlier Count
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 1
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 1
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 1
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 1
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 1
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 1
In [28]:
import pandas as pd
import numpy as np
from IPython.display import display

# Restrict summary statistics to numeric columns so datetime fields do not break variance calculations
numeric_df = df_clean.select_dtypes(include=[np.number])

# Calculate measures of central tendency
central_tendency = numeric_df.describe().loc[['mean', '50%']]

# Calculate measures of dispersion
standard_deviation = numeric_df.std()
variance = numeric_df.var()
range_values = numeric_df.max() - numeric_df.min()

# Create a DataFrame to store the results
statistics_df = pd.DataFrame({
    'Mean': central_tendency.loc['mean'],
    'Median': central_tendency.loc['50%'],
    'Standard Deviation': standard_deviation,
    'Variance': variance,
    'Range': range_values
})

# Display the statistics DataFrame
display(statistics_df.round(3))
Mean Median Standard Deviation Variance Range
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 3.074 2.948 0.615 0.378 3.160
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) 2.979 2.866 0.608 0.369 3.050
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) 3.265 3.145 0.638 0.407 3.380
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 2.986 2.860 0.620 0.384 3.165
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) 2.902 2.808 0.616 0.379 3.083
... ... ... ... ... ...
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) 3.396 3.348 0.616 0.380 3.268
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) 3.633 3.584 0.689 0.475 3.643
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 3.403 3.253 0.766 0.587 3.756
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 3.404 3.253 0.766 0.587 3.756
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) NaN NaN NaN NaN NaN

15 rows × 5 columns

In [29]:
#Determing correlation of independent variables


import pandas as pd

# Run the analysis on the cleaned dataframe with the selected independent variables
relevant_columns = [
    'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]

# Selecting only the relevant columns
relevant_data = df_clean[relevant_columns]

# Calculating the correlation matrix
correlation_matrix = relevant_data.corr()

# Displaying the correlation matrix
display(correlation_matrix.round(3))
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 1.000 0.999 0.980 0.957 0.943 0.943 NaN
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 0.999 1.000 0.975 0.949 0.938 0.938 NaN
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 0.980 0.975 1.000 0.995 0.958 0.958 NaN
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 0.957 0.949 0.995 1.000 0.953 0.953 NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 0.943 0.938 0.958 0.953 1.000 1.000 NaN
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 0.943 0.938 0.958 0.953 1.000 1.000 NaN
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) NaN NaN NaN NaN NaN NaN NaN
In [30]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
    fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols'  # Add a trendline using ordinary least squares method
                    )
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [31]:
import pandas as pd
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

    # Display the column names to identify the correct column for gasoline prices
    print(f'Column count: {len(df_full.columns)}')
    display(df_full.columns[:12].tolist())

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Column count: 16
['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']
In [32]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
    fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                       title='Histogram of U.S. Regular Gasoline Prices',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'}
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [33]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
    fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols'  # Add a trendline using ordinary least squares method (linear regression)
                    )
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [34]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019]
    
    # Create the scatter plot
    fig = px.scatter(df_filtered, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices (2019 Onward)',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols')
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Notebook cell skipped: No module named 'statsmodels'
In [35]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Add a 'Year' column to the DataFrame
    df_full['Year'] = df_full['Date'].dt.year

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year
    fig = px.box(df_full, x='Year', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Colorful Boxplot of U.S. Regular Gasoline Prices by Year',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'Year': 'Year'},
                 color='Year'  # This makes the plot colorful
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [36]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Add a 'Year' column and a 'Quarter' column to the DataFrame
    df_full['Year'] = df_full['Date'].dt.year
    df_full['Quarter'] = df_full['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_full['QuarterStr'] = df_full['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_full, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [37]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2010 onward and avoid SettingWithCopyWarning by using copy()
    df_filtered = df_full[df_full['Date'].dt.year >= 2010].copy()
    
    # Properly use .loc to avoid SettingWithCopyWarning
    df_filtered['Year'] = df_filtered['Date'].dt.year
    df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2010 Onward)',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [38]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward and avoid SettingWithCopyWarning by using copy()
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Properly use .loc to avoid SettingWithCopyWarning
    df_filtered['Year'] = df_filtered['Date'].dt.year
    df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2019 Onward)',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [39]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Extract the year from the 'Date' column
    df_full['Year'] = df_full['Date'].dt.year
    
    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
    # Color the histogram bars by the 'Year' to visualize how distributions change over time
    fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='Year',
                       title='Histogram of U.S. Regular Gasoline Prices by Year',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'Year': 'Year'},
                       barmode='overlay',  # Overlay the histograms to see how distributions compare across years
                       histnorm='percent',  # Normalize the histogram to show percentage rather than count
                       nbins=30  # Adjust the number of bins for finer granularity or clarity
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [40]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Create a 'YearQuarter' column for coloring
    df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)

    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
    fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
                       title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'YearQuarter': 'Year-Quarter'},
                       barmode='overlay',  # Overlay the histograms to compare across quarters
                       histnorm='percent',  # Normalize to show percentage
                       nbins=20  # Number of bins for granularity
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [41]:
import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Create a 'YearQuarter' column for coloring
    df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)

    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
    fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
                       title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'YearQuarter': 'Year-Quarter'},
                       barmode='overlay',  # Overlay the histograms to compare across quarters
                       histnorm='percent',  # Normalize to show percentage
                       nbins=20,  # Number of bins for granularity
                       )
    
    # Update y-axis title to reflect what the percentages represent
    fig.update_layout(yaxis_title='Percentage of Total Observations')

    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
In [42]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Drop the specified column
    df_full.drop(columns=['U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'], inplace=True)
    
    # Drop rows with any NaN values in the target column
    df_full.dropna(subset=['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], inplace=True)
    
    # Reset the index after the dropna operation
    df_full.reset_index(drop=True, inplace=True)
    
    # Feature Engineering: Use previous prices as features to predict the next price
    df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
    
    # Drop the first row which now has an NaN value after the shift operation
    df_full.dropna(inplace=True)
    
    # Define features and target variable
    X = df_full[['PrevPrice']]  # Features from previous prices
    y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']  # Target
    
    # Initialize the Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    
    # Train the model on the entire dataset
    model.fit(X, y)
    
    # Predict on the training set itself since we don't have a separate test set
    predictions = model.predict(X)
    
    # Calculate and print the mean absolute error
    mae = mean_absolute_error(y, predictions)
    print("Mean Absolute Error on the entire dataset:", mae)
    
    # Optionally, visualize the actual vs predicted prices
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 6))
    plt.scatter(y, predictions, alpha=0.3)
    plt.plot(y, y, color='red')  # Perfect predictions line
    plt.title('Actual vs Predicted Prices')
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')
Mean Absolute Error on the entire dataset: 0.06432608148404984
No description has been provided for this image
In [43]:
# Continue forecasting from the trained model and working dataframe
# Find the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]

# Initialize a list to keep track of our predictions and input features
predictions = []
input_feature = last_known_price

# Predict the next three months
for _ in range(3):
    # Predict the next price
    next_month_price = model.predict([[input_feature]])[0]
    
    # Store the prediction
    predictions.append(next_month_price)
    
    # Update the input_feature to the newly predicted price
    input_feature = next_month_price

# Display the predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)

display(predicted_prices.to_frame('Predicted Price'))
Predicted Price
2024-04-30 3.58903
2024-05-31 3.65322
2024-06-30 3.64383
In [44]:
import plotly.graph_objects as go
import pandas as pd

# Plot the forecast against the working dataframe and generated predictions

# Create a DataFrame for the predicted data with appropriate dates
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_data = pd.DataFrame({'Date': prediction_dates, 'Predicted Prices': [3.58903, 3.65322, 3.64383]})

# Create the figure for plotting
fig = go.Figure()

# Add the historical data
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices', line=dict(color='blue')))

# Add the predicted data
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
                         mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))

# Update layout for a more interactive experience
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Enable zoom and pan
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True), type="date"))

# Show the figure
fig.show()
In [45]:
# Use the prepared working dataframe for feature construction

# Ensure the feature used for prediction is correctly set
X = df_full[['PrevPrice']]  # Use the lagged price feature for model input

# Generate predictions for the entire dataset
predictions = model.predict(X)

# Now, ensure 'y' is the target variable for the entire dataset
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Ensure y and predictions are aligned; they should be if predictions were made for the entire dataset
residuals = y - predictions

# Now, the residuals can be visualized the residuals as previously described without encountering shape mismatch issues
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(y.index, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals of Predictions')
plt.xlabel('Index')
plt.ylabel('Residuals')
plt.show()
No description has been provided for this image
In [46]:
# Evaluate the RandomForestRegressor model with the prepared feature set
import pandas as pd
import matplotlib.pyplot as plt

feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(len(X.columns)).plot(kind='barh', colormap='winter')
plt.title('Feature Importances')
plt.show()
No description has been provided for this image
In [47]:
import plotly.graph_objects as go

# Combine historical data with future predictions for plotting
# Concatenate historical and predicted data for a continuous timeline
full_dates = pd.concat([df_full['Date'], predicted_data['Date']])
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], predicted_data['Predicted Prices']])

fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices'))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
                         mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))

# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

fig.show()
In [48]:
# Starting from the last known price in the dataset
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]

# Initialize a list to store predictions
monthly_predictions = [last_known_price]  # Starting with the last known price

# Predict the next 12 months
for _ in range(12):
    # The model expects a DataFrame, so we create one with the current input feature
    current_feature = pd.DataFrame([monthly_predictions[-1]], columns=['PrevPrice'])
    
    # Predict the next price
    next_month_price = model.predict(current_feature)[0]
    
    # Append the prediction for next month
    monthly_predictions.append(next_month_price)

# Remove the first element (last known price) to get only future predictions
future_predictions = monthly_predictions[1:]

# Create a date range for these predictions
last_date = df_full['Date'].iloc[-1]
prediction_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=12, freq='ME')

# Plot the predictions
plt.figure(figsize=(10, 6))
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', marker='o', linestyle='--', color='red')
plt.title('Predicted Gasoline Prices for the Next Year')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
No description has been provided for this image
In [49]:
import pandas as pd
import matplotlib.pyplot as plt

# Forecast forward from the last known date in the historical dataframe
# and 'model' is the trained Random Forest model

# Initialize predictions list with the last known price to start the prediction chain
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

# Generate predictions for the next 12 months
for _ in range(12):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')

# Combine historical dates and predicted dates
full_dates = pd.concat([df_full['Date'], pd.Series(prediction_dates)])

# Combine historical prices and predicted prices
# The last known price is included in both historical and predicted prices for continuity
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], pd.Series(future_predictions)])

# Plotting
plt.figure(figsize=(14, 7))
plt.plot(df_full['Date'], df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], label='Historical Prices', color='blue')
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', linestyle='--', marker='o', color='red')
plt.title('Historical and Predicted Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()
No description has been provided for this image
In [50]:
import plotly.graph_objects as go
import pandas as pd

# Generate forward predictions from the historical dataframe and trained model

# Generate predictions for the next 12 months, starting from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

for _ in range(12):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')

# Plotting with Plotly for interactive exploration
fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices'))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
                         mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))

# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Show the figure
fig.show()
In [51]:
import plotly.graph_objects as go
import pandas as pd

# Extend the trained model beyond the historical dataframe

# Start predictions from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

# Predict for the next 5 years (60 months)
for _ in range(60):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to have only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 5 years of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=60, freq='ME')

# Creating the Plotly figure
fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices', line=dict(color='blue')))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
                         mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))

# Update the layout for better visualization and interactivity
fig.update_layout(title='Historical and Predicted Gasoline Prices for the Next 5 Years',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Enable zoom and pan features
fig.update_layout(xaxis_rangeslider_visible=True)

# Show the figure
fig.show()
In [52]:
# Find the earliest date in the historical data
earliest_date = df_full['Date'].min()

print(f"The model is trained on data starting from: {earliest_date}")
The model is trained on data starting from: 2007-02-15 00:00:00
In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Use the complete dataset sorted by date
# Check the earliest date just for confirmation
print(f"Earliest date in the dataset: {df_full['Date'].min()}")

# Feature Engineering: Create 'PrevPrice' based on 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full.dropna(inplace=True)  # Drop the first row which now lacks 'PrevPrice'

# Splitting the data into features (X) and target (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")

# Now the model is trained from the earliest occurrence of price in the dataset
Earliest date in the dataset: 2007-02-15 00:00:00
Mean Absolute Error: 0.16113744715447187
In [54]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Work from the loaded dataset sorted by date, starting from 1990

# Check to ensure the start date is indeed from 1990
print(f"Dataset start date: {df_full['Date'].min()}")

# Prepare the feature 'PrevPrice'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full = df_full.dropna()  # Removing the first row which now has a NaN value for 'PrevPrice'

# Define the features (X) and target variable (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the entire dataset
model.fit(X, y)

# Forecast the next 12 months
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = []

for _ in range(12):
    # Using the last known price to predict the next one
    input_feature = pd.DataFrame([last_known_price], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)
    last_known_price = next_price  # Update the last known price for the next prediction

# Output the predictions
prediction_dates = pd.date_range(start=df_full['Date'].max() + pd.Timedelta(days=1), periods=12, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)
display(predicted_prices.to_frame('Predicted Price'))
Dataset start date: 2007-03-15 00:00:00
Predicted Price
2024-03-31 3.59018
2024-04-30 3.67058
2024-05-31 3.69817
2024-06-30 3.58821
2024-07-31 3.67058
2024-08-31 3.69817
2024-09-30 3.58821
2024-10-31 3.67058
2024-11-30 3.69817
2024-12-31 3.58821
2025-01-31 3.67058
2025-02-28 3.69817
In [55]:
print(f"Earliest date: {df_full['Date'].min()}")
display(X.head(5))
Earliest date: 2007-04-15 00:00:00
PrevPrice
197 2.563
198 2.845
199 3.146
200 3.056
201 2.965
In [56]:
print(f"Earliest date: {df_full['Date'].min()}")
Earliest date: 2007-04-15 00:00:00
In [57]:
display(X.head(5))
PrevPrice
197 2.563
198 2.845
199 3.146
200 3.056
201 2.965
In [62]:
# This line restricts the dataset to start from 2019
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
In [59]:
print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")
Date range: 2007-04-15 00:00:00 to 2024-03-15 00:00:00
In [63]:
from IPython.display import display

if 'df_full' in globals():
    display(df_full.head(3))
    display(df_full.tail(3))
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) PrevPrice
197 2007-04-15 2.891 2.831 3.013 2.845 2.787 2.965 2.953 2.881 3.093 3.052 2.996 3.155 2.834 2.847 2.563
198 2007-05-15 3.187 3.157 3.248 3.146 3.119 3.202 3.241 3.199 3.321 3.331 3.300 3.389 2.796 2.818 2.845
199 2007-06-15 3.102 3.067 3.173 3.056 3.024 3.122 3.161 3.118 3.247 3.263 3.227 3.330 2.808 2.826 3.146
Date U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon) U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) PrevPrice
398 2024-01-15 3.197 3.045 3.495 3.075 2.957 3.331 3.619 3.391 3.945 3.969 3.764 4.206 3.854 3.854 3.134
399 2024-02-15 3.328 3.188 3.605 3.212 3.102 3.446 3.729 3.511 4.042 4.077 3.888 4.296 4.044 4.044 3.075
400 2024-03-15 3.542 3.402 3.815 3.426 3.318 3.657 3.938 3.719 4.252 4.286 4.098 4.506 4.022 4.022 3.212
In [ ]:
import importlib
import subprocess
import sys

import pandas as pd
import numpy as np

try:
    from statsmodels.formula.api import ols
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'statsmodels'])
    from statsmodels.formula.api import ols

# Define the path to the Excel file
xls_path = resolve_dataset_path()

# Load the data from the specified sheet with the data starting on row 2
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

# Ensure the 'Date' column is datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Convert 'Date' to a numerical format for regression analysis (ordinal)
df['Date_ordinal'] = df['Date'].apply(lambda x: x.toordinal() if pd.notna(x) else np.nan)
df = df.dropna(subset=['Date_ordinal', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'])

# Fit the linear regression model
model = ols("Q('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)') ~ Date_ordinal", data=df).fit()

# Print the summary of the model results
print(model.summary().as_text().split('Notes:')[0])

# Extract the coefficients to formulate the regression equation
intercept, slope = model.params
print(f"Regression Equation: Price = {intercept:.4f} + {slope:.4f} * Date_ordinal")
In [74]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Select the U.S. Regular All Formulations Retail Gasoline Prices column
data = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Handle missing values by forward filling then backward filling to ensure no gaps
data_filled = data.ffill().bfill()

# Decompose the series to understand its components
decomposition = seasonal_decompose(data_filled, model='additive', period=12)
fig = decomposition.plot()
plt.show()

# Fit a SARIMA model
# Note: p, d, q values and seasonal P, D, Q, M should be selected based on ACF, PACF plots and domain understanding
model = SARIMAX(data_filled, order=(1,1,1), seasonal_order=(1,1,1,12))
results = model.fit(disp=False)

# Check diagnostics to ensure no assumptions are violated
results.plot_diagnostics(figsize=(15, 12))
plt.show()

# Forecast for the next 12 months
forecast = results.get_forecast(steps=12)
mean_forecast = forecast.predicted_mean
confidence_intervals = forecast.conf_int()

# Dates for the forecast
forecast_dates = pd.date_range(data_filled.index[-1], periods=12, freq='ME')

# Create a plot with historical data and forecast
fig = go.Figure()

# Historical data
fig.add_trace(go.Scatter(x=data_filled.index, y=data_filled, mode='lines', name='Historical Prices'))

# Forecast data
fig.add_trace(go.Scatter(x=forecast_dates, y=mean_forecast, mode='lines', name='Forecast'))

# Fill area between confidence intervals
fig.add_trace(go.Scatter(x=np.concatenate([forecast_dates, forecast_dates[::-1]]),
                         y=np.concatenate([confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1][::-1]]),
                         fill='toself', fillcolor='rgba(0,100,80,0.2)', line=dict(color='rgba(255,255,255,0)'),
                         name='Confidence Interval'))

# Update layout
fig.update_layout(title='Forecast of U.S. Regular Gasoline Prices', xaxis_title='Date', yaxis_title='Price (Dollars per Gallon)', template='plotly_white')
fig.show()
No description has been provided for this image
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
No description has been provided for this image
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
In [71]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt

# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Convert dates into numerical format for regression
df['Date_ordinal'] = df.index.map(dt.datetime.toordinal)
data_filled = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].ffill().bfill()

# Prepare X and y for regression
X = df['Date_ordinal'].values.reshape(-1, 1)
y = data_filled.values

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Print the coefficients of the model
print("Intercept (b0):", model.intercept_)
print("Slope (b1) for Date:", model.coef_[0])

# Plotting the results with formatted dates
plt.figure(figsize=(12, 6))
plt.scatter(df.index, y, color='blue', label='Actual Prices')
plt.plot(df.index, model.predict(X), color='red', linewidth=2, label='Fitted Line')
plt.title('Linear Regression on Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)')

# Format the x-axis to improve readability
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())  # Use AutoDateLocator without additional parameters
plt.gca().xaxis.set_major_formatter(mdates.ConciseDateFormatter(mdates.AutoDateLocator()))  # Use concise formatter for cleaner labels

plt.gcf().autofmt_xdate()  # Automatically rotate date labels for better readability
plt.legend()
plt.show()
Intercept (b0): -155.08130894484586
Slope (b1) for Date: 0.0002146200118198708
No description has been provided for this image
In [73]:
import subprocess
import sys

import pandas as pd
import matplotlib.pyplot as plt

try:
    import seaborn as sns
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
    import seaborn as sns

# File path from the original code
xls_path = resolve_dataset_path()

# Load the dataset
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

# Select the relevant columns used in the analysis
relevant_columns = [
    'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]

# Selecting only the relevant columns
df_clean = df[relevant_columns]

# Calculating the correlation matrix
correlation_matrix = df_clean.corr()

# Displaying the correlation matrix
display(correlation_matrix.round(3))

# Visualization of the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Fuel Price Variables')
plt.show()
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) U.S. No 2 Diesel Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon) 1.000 1.000 0.995 0.988 0.982 0.941 0.916
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon) 1.000 1.000 0.993 0.985 0.980 0.936 0.913
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon) 0.995 0.993 1.000 0.998 0.986 0.954 0.919
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon) 0.988 0.985 0.998 1.000 0.984 0.947 0.924
U.S. No 2 Diesel Retail Prices (Dollars per Gallon) 0.982 0.980 0.986 0.984 1.000 1.000 1.000
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon) 0.941 0.936 0.954 0.947 1.000 1.000 1.000
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon) 0.916 0.913 0.919 0.924 1.000 1.000 1.000
No description has been provided for this image
In [ ]: