import warnings
import pandas as pd
from pathlib import Path
from IPython.display import display

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 140)

def resolve_dataset_path(filename='PET_PRI_GND_DCUS_NUS_M.xls'):
    candidates = [
        Path.cwd() / filename,
        Path.cwd() / 'Capstone Project' / filename,
        Path.home() / 'OneDrive' / 'Desktop' / 'GGU' / 'Classes Spring 2024 Trimester' / 'Data 190 Capstone' / 'Capstone Project' / filename,
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate

    matches = list(Path.cwd().glob(f'**/{filename}'))
    if matches:
        return matches[0]

    raise FileNotFoundError(f'Could not locate {filename} from {Path.cwd()}')

print('Notebook display cleanup loaded.')

Notebook display cleanup loaded.

!pip install xlrd statsmodels seaborn scikit-learn plotly matplotlib pandas numpy

import pandas as pd
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

# Read the .xls file
try:
    df_full = pd.read_excel(xls_path, sheet_name=0, engine=None)
    df_head = df_full.head(3)
    df_tail = df_full.tail(3)
    display(df_head)
    display(df_tail)
except Exception as e:
    print(f'Notebook cell skipped: {e}')

Using dataset: PET_PRI_GND_DCUS_NUS_M.xls

from IPython.display import display

if 'df_full' in globals():
    display(df_full.head(3))
    display(df_full.tail(3))

#succesfully target the data
import pandas as pd

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

# Attempt to read the sheet named 'Data 1' which contains the actual data
try:
    df_full = pd.read_excel(xls_path, sheet_name='Data 1')
    df_head = df_full.head()
    display(df_head)
except Exception as e:
    print(f'Notebook cell skipped: {e}')

Using dataset: PET_PRI_GND_DCUS_NUS_M.xls

import pandas as pd

# Define the path
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1', skipping rows to get to the actual data
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')  # Coerce for non datetime values 

    # Filter the DataFrame to only include entries from January 2019 onwards
    start_date = '2019-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()  # Make a copy 
    
    # Drop rows where the date is NaT (not a time), which might have been created due to 'coerce'
    df_filtered.dropna(subset=['Date'], inplace=True)

    display(df_filtered.head(5))
except Exception as e:
    print(f'Notebook cell skipped: {e}')

from IPython.display import display
display(df_full.head(3))
display(df_full.tail(3))

import pandas as pd
from IPython.display import display

# Define the path
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1', skipping rows to get to the actual data
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')  # Coerce for non-datetime values

    # Find the first non-null value and its corresponding date
    first_valid_date = df_full.loc[df_full.notna().any(axis=1), 'Date'].min()

    # Convert to string format
    start_date = first_valid_date.strftime('%Y-%m-%d')

    # Filter the DataFrame to only include entries from the first valid date onwards
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()

    # Check the column names in the DataFrame
    print(f'Column count: {len(df_filtered.columns)}')
    display(df_filtered.columns[:12].tolist())

    # Define the columns of interest
    columns_of_interest = [
        'Date',
        'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
    ]

    if all(col in df_filtered.columns for col in columns_of_interest):
        df_clean = df_filtered.dropna(subset=columns_of_interest)
        display(df_clean.head(5))
    else:
        print('Some expected columns were not found in the dataset.')

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Column count: 16

['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']

# Display the first few column names 
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())

Column count: 16

['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']

# Check the range of dates available
print(f"Date range: {df_clean['Date'].min()} to {df_clean['Date'].max()}")
display(df_clean[['Date', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']].dropna().head(5))

Date range: 2007-02-15 00:00:00 to 2008-11-15 00:00:00

print('Missing values:', df_clean['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].isna().sum())

Missing values: 0

# Display the available column names
print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())

# Define the target columns for plotting
columns_to_plot_corrected = [
    'Regular_All_Formulations',  # Exact DataFrame column name
    # Additional columns can be added here
]

# Keep only columns that exist in df_clean
columns_to_plot_existing = [col for col in columns_to_plot_corrected if col in df_clean.columns]

# Set 'Date' as the index when needed
if 'Date' not in df_clean.index.names:
    df_clean.set_index('Date', inplace=True)

# Plot the available target columns
if columns_to_plot_existing:
    df_clean[columns_to_plot_existing].plot(figsize=(14, 7), marker='o', title='U.S. Gasoline Prices Over Time')
    plt.xlabel('Date')
    plt.ylabel('Price (Dollars per Gallon)')
    plt.grid(True)
    plt.legend()
    plt.show()
else:
    print("None of the intended columns to plot were found in the DataFrame.")

import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column,  # Keep full column name in legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2019-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400,  # Wider chart for better visualization
    hovermode='x unified'  # Unified hover mode for easier comparison
)

# Display the figure
fig.show()

import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column.split(' ')[1],  # Simplify name for legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400  # Wider chart for better visualization
)

# Display the figure
fig.show()

print(f'Index name: {df_clean.index.name}')

Index name: Date

import plotly.graph_objects as go

# Define the columns the analysis plots and their corresponding colors
columns_to_plot = [
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
]
colors = ['#FFFF00',  # Yellow
          '#FF0000',  # Red
          '#0000FF',  # Blue
          '#008000']  # Green

# Creating the Plotly figure
fig = go.Figure()

# Adding a line plot for each fuel type
for column, color in zip(columns_to_plot, colors):
    fig.add_trace(go.Scatter(x=df_filtered.index, y=df_filtered[column], name=column,  # Keep full column name in legend
                             mode='lines+markers', line=dict(color=color)))

# Updating the layout
fig.update_layout(
    title='Interactive U.S. Gasoline and Diesel Retail Prices Over Time (2010-2023)',
    xaxis_title='Date',
    yaxis_title='Price (Dollars per Gallon)',
    legend_title='Fuel Type',
    template='plotly_white',
    width=1400,  # Wider chart for better visualization
    hovermode='x unified'  # Unified hover mode for easier comparison
)

# Display the figure
fig.show()

import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = [
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)'
    ]
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each fuel type
    for column in columns_to_plot:
        fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], name=column, mode='lines+markers'))
    
    # Update the layout of the figure
    fig.update_layout(
        title='Interactive U.S. Gasoline and Diesel Retail Prices Since 2010',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        template='plotly_white',
        width=1400
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

from IPython.display import display
display(df_clean.head(5))
print(df_clean.info())

<class 'pandas.DataFrame'>
DatetimeIndex: 22 entries, 2007-02-15 to 2008-11-15
Data columns (total 15 columns):
 #   Column                                                                           Non-Null Count  Dtype  
---  ------                                                                           --------------  -----  
 0   U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)     22 non-null     float64
 1   U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)         22 non-null     float64
 2   U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)         22 non-null     float64
 3   U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)        22 non-null     float64
 4   U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 5   U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 6   U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)       22 non-null     float64
 7   U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)           22 non-null     float64
 8   U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)           22 non-null     float64
 9   U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)        22 non-null     float64
 10  U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 11  U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)            22 non-null     float64
 12  U.S. No 2 Diesel Retail Prices (Dollars per Gallon)                              22 non-null     float64
 13  U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)  22 non-null     float64
 14  U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)      22 non-null     float64
dtypes: float64(15)
memory usage: 2.8 KB
None

print(df_full.head())  # Print the first few rows of the DataFrame
print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")

        Date  U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0 1990-08-15                                                NaN                              
1 1990-09-15                                                NaN                              
2 1990-10-15                                                NaN                              
3 1990-11-15                                                NaN                              
4 1990-12-15                                                NaN                              

   U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                          
1                                                NaN                          
2                                                NaN                          
3                                                NaN                          
4                                                NaN                          

   U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                          
1                                                NaN                          
2                                                NaN                          
3                                                NaN                          
4                                                NaN                          

   U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                           
1                                              1.258                           
2                                              1.335                           
3                                              1.324                           
4                                                NaN                           

   U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                              1.258                       
2                                              1.335                       
3                                              1.324                       
4                                                NaN                       

   U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                                NaN                       
2                                                NaN                       
3                                                NaN                       
4                                                NaN                       

   U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                            
1                                                NaN                            
2                                                NaN                            
3                                                NaN                            
4                                                NaN                            

   U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                        
1                                                NaN                        
2                                                NaN                        
3                                                NaN                        
4                                                NaN                        

   U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                        
1                                                NaN                        
2                                                NaN                        
3                                                NaN                        
4                                                NaN                        

   U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                           
1                                                NaN                           
2                                                NaN                           
3                                                NaN                           
4                                                NaN                           

   U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)  \
0                                                NaN                       
1                                                NaN                       
2                                                NaN                       
3                                                NaN                       
4                                                NaN                       

   U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)  U.S. No 2 Diesel Retail Prices (Dollars per Gallon)  \
0                                                NaN                                                                    NaN     
1                                                NaN                                                                    NaN     
2                                                NaN                                                                    NaN     
3                                                NaN                                                                    NaN     
4                                                NaN                                                                    NaN     

   U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)  \
0                                                NaN                                 
1                                                NaN                                 
2                                                NaN                                 
3                                                NaN                                 
4                                                NaN                                 

   U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)  
0                                                NaN                            
1                                                NaN                            
2                                                NaN                            
3                                                NaN                            
4                                                NaN                            
Date range: 1990-08-15 00:00:00 to 2024-03-15 00:00:00

import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = df_filtered.columns
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each column
    for column in columns_to_plot:
        if column != 'Date':  # Exclude 'Date' column from the legend
            fig.add_trace(go.Scatter(x=df_filtered['Date'].dt.date, y=df_filtered[column], name=column, mode='lines+markers'))
    
    # Update the layout of the figure
    fig.update_layout(
        title='Retail Gasoline Prices Over Time',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        legend_title='Fuel Type',
        template='plotly_white',
        width=1400
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

print(f'Column count: {len(df_clean.columns)}')
display(df_clean.columns[:12].tolist())

Column count: 15

['U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)']

import pandas as pd
import plotly.graph_objects as go

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()
    
    # Define the columns to plot
    columns_to_plot = df_filtered.columns
    
    # Create a Plotly figure
    fig = go.Figure()
    
    # Add a line plot for each column with fills
    for column in columns_to_plot:
        if column != 'Date':  # Exclude 'Date' column from the legend
            fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column))
            fig.add_trace(go.Scatter(x=df_filtered['Date'], y=df_filtered[column], mode='lines', name=column + '_fill',
                                     fill='tonexty'))  # Add fill to the line
    
    # Update the layout of the figure
    fig.update_layout(
        title='Retail Gasoline Prices Over Time',
        xaxis_title='Date',
        yaxis_title='Price (Dollars per Gallon)',
        legend_title='Fuel Type',
        template='plotly_white',
        width=1400,
        hovermode='x unified'  # Unified hover mode for easier comparison
    )
    
    # Display the figure
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import numpy as np

# Work with the cleaned dataframe
# Select only the numeric columns
numeric_columns = df_clean.select_dtypes(include=[np.number])

# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()

# Define threshold for considering outliers
threshold = 3

# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)

# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))

# Check the data types of the columns
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))

# Importing necessary library
import numpy as np

# Extracting only the numeric columns for Z-score calculation
numeric_columns = df_clean.select_dtypes(include=np.number)

# Calculate Z-score for each numeric column in the DataFrame
z_scores = (numeric_columns - numeric_columns.mean()) / numeric_columns.std()

# Define threshold for considering outliers
threshold = 3

# Identify outliers based on Z-score
outliers = (z_scores > threshold) | (z_scores < -threshold)

# Print outliers
outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))

outlier_counts = outliers.sum().sort_values(ascending=False)
display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))
display(df_clean.dtypes.to_frame('dtype'))
missing_counts = df_clean.isnull().sum()
display(missing_counts[missing_counts > 0].to_frame('Missing Count'))

import pandas as pd
import numpy as np
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)

    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')

    # Filter the DataFrame to include data since January 2010
    start_date = '2010-01-01'
    df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()

    # Handle missing values by forward filling
    df_clean = df_filtered.ffill()

    columns_to_plot = [
        'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
        'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
    ]

    def detect_outliers_zscore(data, threshold=3):
        z_scores = np.abs((data - data.mean()) / data.std())
        return z_scores > threshold

    outliers = df_clean[columns_to_plot].apply(detect_outliers_zscore)
    outlier_counts = outliers.sum().sort_values(ascending=False)
    display(outlier_counts[outlier_counts > 0].head(12).to_frame('Outlier Count'))

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Using dataset: PET_PRI_GND_DCUS_NUS_M.xls

import pandas as pd
import numpy as np
from IPython.display import display

# Restrict summary statistics to numeric columns so datetime fields do not break variance calculations
numeric_df = df_clean.select_dtypes(include=[np.number])

# Calculate measures of central tendency
central_tendency = numeric_df.describe().loc[['mean', '50%']]

# Calculate measures of dispersion
standard_deviation = numeric_df.std()
variance = numeric_df.var()
range_values = numeric_df.max() - numeric_df.min()

# Create a DataFrame to store the results
statistics_df = pd.DataFrame({
    'Mean': central_tendency.loc['mean'],
    'Median': central_tendency.loc['50%'],
    'Standard Deviation': standard_deviation,
    'Variance': variance,
    'Range': range_values
})

# Display the statistics DataFrame
display(statistics_df.round(3))

#Determing correlation of independent variables


import pandas as pd

# Run the analysis on the cleaned dataframe with the selected independent variables
relevant_columns = [
    'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]

# Selecting only the relevant columns
relevant_data = df_clean[relevant_columns]

# Calculating the correlation matrix
correlation_matrix = relevant_data.corr()

# Displaying the correlation matrix
display(correlation_matrix.round(3))

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
    fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols'  # Add a trendline using ordinary least squares method
                    )
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Notebook cell skipped: No module named 'statsmodels'

import pandas as pd
from IPython.display import display

# Define the path to the Excel file
xls_path = resolve_dataset_path()
print(f'Using dataset: {xls_path}')

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

    # Display the column names to identify the correct column for gasoline prices
    print(f'Column count: {len(df_full.columns)}')
    display(df_full.columns[:12].tolist())

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Using dataset: PET_PRI_GND_DCUS_NUS_M.xls
Column count: 16

['Date',
 'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
 'U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)']

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
    fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                       title='Histogram of U.S. Regular Gasoline Prices',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'}
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date'
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    
    # Convert the 'Date' column to datetime format
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Create a scatter plot of U.S. Regular All Formulations Retail Gasoline Prices over time
    fig = px.scatter(df_full, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices Over Time',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols'  # Add a trendline using ordinary least squares method (linear regression)
                    )
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Notebook cell skipped: No module named 'statsmodels'

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019]
    
    # Create the scatter plot
    fig = px.scatter(df_filtered, x='Date', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                     title='Scatter Plot of U.S. Regular Gasoline Prices (2019 Onward)',
                     labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)'},
                     trendline='ols')
    
    # Display the scatter plot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Notebook cell skipped: No module named 'statsmodels'

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Add a 'Year' column to the DataFrame
    df_full['Year'] = df_full['Date'].dt.year

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year
    fig = px.box(df_full, x='Year', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Colorful Boxplot of U.S. Regular Gasoline Prices by Year',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'Year': 'Year'},
                 color='Year'  # This makes the plot colorful
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Add a 'Year' column and a 'Quarter' column to the DataFrame
    df_full['Year'] = df_full['Date'].dt.year
    df_full['Quarter'] = df_full['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_full['QuarterStr'] = df_full['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_full, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2010 onward and avoid SettingWithCopyWarning by using copy()
    df_filtered = df_full[df_full['Date'].dt.year >= 2010].copy()
    
    # Properly use .loc to avoid SettingWithCopyWarning
    df_filtered['Year'] = df_filtered['Date'].dt.year
    df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2010 Onward)',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward and avoid SettingWithCopyWarning by using copy()
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Properly use .loc to avoid SettingWithCopyWarning
    df_filtered['Year'] = df_filtered['Date'].dt.year
    df_filtered.loc[:, 'Quarter'] = df_filtered['Date'].dt.to_period('Q')
    
    # Convert 'Quarter' to string for compatibility
    df_filtered.loc[:, 'QuarterStr'] = df_filtered['Quarter'].astype(str)

    # Create a colorful boxplot of U.S. Regular All Formulations Retail Gasoline Prices, grouped by Year and Quarter
    fig = px.box(df_filtered, x='QuarterStr', y='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
                 title='Boxplot of U.S. Regular Gasoline Prices by Quarter (From 2019 Onward)',
                 labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                         'QuarterStr': 'Quarter'},
                 color='QuarterStr'  # This makes the plot colorful and distinguishes each quarter
                )
    
    # Display the boxplot
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Extract the year from the 'Date' column
    df_full['Year'] = df_full['Date'].dt.year
    
    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices
    # Color the histogram bars by the 'Year' to visualize how distributions change over time
    fig = px.histogram(df_full, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='Year',
                       title='Histogram of U.S. Regular Gasoline Prices by Year',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'Year': 'Year'},
                       barmode='overlay',  # Overlay the histograms to see how distributions compare across years
                       histnorm='percent',  # Normalize the histogram to show percentage rather than count
                       nbins=30  # Adjust the number of bins for finer granularity or clarity
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Create a 'YearQuarter' column for coloring
    df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)

    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
    fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
                       title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'YearQuarter': 'Year-Quarter'},
                       barmode='overlay',  # Overlay the histograms to compare across quarters
                       histnorm='percent',  # Normalize to show percentage
                       nbins=20  # Number of bins for granularity
                      )
    
    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
import plotly.express as px

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Filter for data from 2019 onward
    df_filtered = df_full[df_full['Date'].dt.year >= 2019].copy()
    
    # Create a 'YearQuarter' column for coloring
    df_filtered['YearQuarter'] = df_filtered['Date'].dt.to_period('Q').astype(str)

    # Create a histogram of U.S. Regular All Formulations Retail Gasoline Prices, colored by 'YearQuarter'
    fig = px.histogram(df_filtered, x='U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)', color='YearQuarter',
                       title='Quarterly Histogram of U.S. Regular Gasoline Prices (2019 Onward)',
                       labels={'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)': 'Price (Dollars per Gallon)',
                               'YearQuarter': 'Year-Quarter'},
                       barmode='overlay',  # Overlay the histograms to compare across quarters
                       histnorm='percent',  # Normalize to show percentage
                       nbins=20,  # Number of bins for granularity
                       )
    
    # Update y-axis title to reflect what the percentages represent
    fig.update_layout(yaxis_title='Percentage of Total Observations')

    # Display the histogram
    fig.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the path to the Excel file
xls_path = resolve_dataset_path()

try:
    # Read the sheet named 'Data 1' from the Excel file, skipping the first two rows
    df_full = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
    
    # Rename the first column to 'Date' and convert to datetime
    df_full.rename(columns={df_full.columns[0]: 'Date'}, inplace=True)
    df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
    
    # Drop the specified column
    df_full.drop(columns=['U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'], inplace=True)
    
    # Drop rows with any NaN values in the target column
    df_full.dropna(subset=['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], inplace=True)
    
    # Reset the index after the dropna operation
    df_full.reset_index(drop=True, inplace=True)
    
    # Feature Engineering: Use previous prices as features to predict the next price
    df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
    
    # Drop the first row which now has an NaN value after the shift operation
    df_full.dropna(inplace=True)
    
    # Define features and target variable
    X = df_full[['PrevPrice']]  # Features from previous prices
    y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']  # Target
    
    # Initialize the Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    
    # Train the model on the entire dataset
    model.fit(X, y)
    
    # Predict on the training set itself since we don't have a separate test set
    predictions = model.predict(X)
    
    # Calculate and print the mean absolute error
    mae = mean_absolute_error(y, predictions)
    print("Mean Absolute Error on the entire dataset:", mae)
    
    # Optionally, visualize the actual vs predicted prices
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 6))
    plt.scatter(y, predictions, alpha=0.3)
    plt.plot(y, y, color='red')  # Perfect predictions line
    plt.title('Actual vs Predicted Prices')
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.show()

except Exception as e:
    print(f'Notebook cell skipped: {e}')

Mean Absolute Error on the entire dataset: 0.06432608148404984

# Continue forecasting from the trained model and working dataframe
# Find the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]

# Initialize a list to keep track of our predictions and input features
predictions = []
input_feature = last_known_price

# Predict the next three months
for _ in range(3):
    # Predict the next price
    next_month_price = model.predict([[input_feature]])[0]
    
    # Store the prediction
    predictions.append(next_month_price)
    
    # Update the input_feature to the newly predicted price
    input_feature = next_month_price

# Display the predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)

display(predicted_prices.to_frame('Predicted Price'))

import plotly.graph_objects as go
import pandas as pd

# Plot the forecast against the working dataframe and generated predictions

# Create a DataFrame for the predicted data with appropriate dates
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=3, freq='ME')
predicted_data = pd.DataFrame({'Date': prediction_dates, 'Predicted Prices': [3.58903, 3.65322, 3.64383]})

# Create the figure for plotting
fig = go.Figure()

# Add the historical data
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices', line=dict(color='blue')))

# Add the predicted data
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
                         mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))

# Update layout for a more interactive experience
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Enable zoom and pan
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True), type="date"))

# Show the figure
fig.show()

# Use the prepared working dataframe for feature construction

# Ensure the feature used for prediction is correctly set
X = df_full[['PrevPrice']]  # Use the lagged price feature for model input

# Generate predictions for the entire dataset
predictions = model.predict(X)

# Now, ensure 'y' is the target variable for the entire dataset
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Ensure y and predictions are aligned; they should be if predictions were made for the entire dataset
residuals = y - predictions

# Now, the residuals can be visualized the residuals as previously described without encountering shape mismatch issues
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(y.index, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residuals of Predictions')
plt.xlabel('Index')
plt.ylabel('Residuals')
plt.show()

# Evaluate the RandomForestRegressor model with the prepared feature set
import pandas as pd
import matplotlib.pyplot as plt

feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(len(X.columns)).plot(kind='barh', colormap='winter')
plt.title('Feature Importances')
plt.show()

import plotly.graph_objects as go

# Combine historical data with future predictions for plotting
# Concatenate historical and predicted data for a continuous timeline
full_dates = pd.concat([df_full['Date'], predicted_data['Date']])
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], predicted_data['Predicted Prices']])

fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices'))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=predicted_data['Date'], y=predicted_data['Predicted Prices'],
                         mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))

# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

fig.show()

# Starting from the last known price in the dataset
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]

# Initialize a list to store predictions
monthly_predictions = [last_known_price]  # Starting with the last known price

# Predict the next 12 months
for _ in range(12):
    # The model expects a DataFrame, so we create one with the current input feature
    current_feature = pd.DataFrame([monthly_predictions[-1]], columns=['PrevPrice'])
    
    # Predict the next price
    next_month_price = model.predict(current_feature)[0]
    
    # Append the prediction for next month
    monthly_predictions.append(next_month_price)

# Remove the first element (last known price) to get only future predictions
future_predictions = monthly_predictions[1:]

# Create a date range for these predictions
last_date = df_full['Date'].iloc[-1]
prediction_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=12, freq='ME')

# Plot the predictions
plt.figure(figsize=(10, 6))
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', marker='o', linestyle='--', color='red')
plt.title('Predicted Gasoline Prices for the Next Year')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

# Forecast forward from the last known date in the historical dataframe
# and 'model' is the trained Random Forest model

# Initialize predictions list with the last known price to start the prediction chain
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

# Generate predictions for the next 12 months
for _ in range(12):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')

# Combine historical dates and predicted dates
full_dates = pd.concat([df_full['Date'], pd.Series(prediction_dates)])

# Combine historical prices and predicted prices
# The last known price is included in both historical and predicted prices for continuity
full_prices = pd.concat([df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], pd.Series(future_predictions)])

# Plotting
plt.figure(figsize=(14, 7))
plt.plot(df_full['Date'], df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'], label='Historical Prices', color='blue')
plt.plot(prediction_dates, future_predictions, label='Predicted Prices', linestyle='--', marker='o', color='red')
plt.title('Historical and Predicted Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('Price (Dollars per Gallon)')
plt.legend()
plt.show()

import plotly.graph_objects as go
import pandas as pd

# Generate forward predictions from the historical dataframe and trained model

# Generate predictions for the next 12 months, starting from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

for _ in range(12):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to keep only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 12 months of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=12, freq='ME')

# Plotting with Plotly for interactive exploration
fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices'))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
                         mode='lines+markers', name='Predicted Prices', line=dict(dash='dot')))

# Update layout for a more detailed view
fig.update_layout(title='Historical and Predicted Gasoline Prices',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Show the figure
fig.show()

import plotly.graph_objects as go
import pandas as pd

# Extend the trained model beyond the historical dataframe

# Start predictions from the last known price
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = [last_known_price]

# Predict for the next 5 years (60 months)
for _ in range(60):
    input_feature = pd.DataFrame([predictions[-1]], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)

# Exclude the first element (last known price) to have only future predictions
future_predictions = predictions[1:]

# Create a date range for the next 5 years of predictions
prediction_dates = pd.date_range(start=df_full['Date'].iloc[-1] + pd.DateOffset(months=1), periods=60, freq='ME')

# Creating the Plotly figure
fig = go.Figure()

# Add historical data to the plot
fig.add_trace(go.Scatter(x=df_full['Date'], y=df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'],
                         mode='lines', name='Historical Prices', line=dict(color='blue')))

# Add predicted data to the plot
fig.add_trace(go.Scatter(x=prediction_dates, y=future_predictions,
                         mode='lines+markers', name='Predicted Prices', line=dict(color='red', dash='dash')))

# Update the layout for better visualization and interactivity
fig.update_layout(title='Historical and Predicted Gasoline Prices for the Next 5 Years',
                  xaxis_title='Date',
                  yaxis_title='Price (Dollars per Gallon)',
                  template='plotly_white')

# Enable zoom and pan features
fig.update_layout(xaxis_rangeslider_visible=True)

# Show the figure
fig.show()

# Find the earliest date in the historical data
earliest_date = df_full['Date'].min()

print(f"The model is trained on data starting from: {earliest_date}")

The model is trained on data starting from: 2007-02-15 00:00:00

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Use the complete dataset sorted by date
# Check the earliest date just for confirmation
print(f"Earliest date in the dataset: {df_full['Date'].min()}")

# Feature Engineering: Create 'PrevPrice' based on 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full.dropna(inplace=True)  # Drop the first row which now lacks 'PrevPrice'

# Splitting the data into features (X) and target (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")

# Now the model is trained from the earliest occurrence of price in the dataset

Earliest date in the dataset: 2007-02-15 00:00:00
Mean Absolute Error: 0.16113744715447187

from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Work from the loaded dataset sorted by date, starting from 1990

# Check to ensure the start date is indeed from 1990
print(f"Dataset start date: {df_full['Date'].min()}")

# Prepare the feature 'PrevPrice'
df_full['PrevPrice'] = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].shift(1)
df_full = df_full.dropna()  # Removing the first row which now has a NaN value for 'PrevPrice'

# Define the features (X) and target variable (y)
X = df_full[['PrevPrice']]
y = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the entire dataset
model.fit(X, y)

# Forecast the next 12 months
last_known_price = df_full['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].iloc[-1]
predictions = []

for _ in range(12):
    # Using the last known price to predict the next one
    input_feature = pd.DataFrame([last_known_price], columns=['PrevPrice'])
    next_price = model.predict(input_feature)[0]
    predictions.append(next_price)
    last_known_price = next_price  # Update the last known price for the next prediction

# Output the predictions
prediction_dates = pd.date_range(start=df_full['Date'].max() + pd.Timedelta(days=1), periods=12, freq='ME')
predicted_prices = pd.Series(predictions, index=prediction_dates)
display(predicted_prices.to_frame('Predicted Price'))

Dataset start date: 2007-03-15 00:00:00

print(f"Earliest date: {df_full['Date'].min()}")
display(X.head(5))

Earliest date: 2007-04-15 00:00:00

print(f"Earliest date: {df_full['Date'].min()}")

Earliest date: 2007-04-15 00:00:00

display(X.head(5))

# This line restricts the dataset to start from 2019
df_filtered = df_full[df_full['Date'] >= pd.to_datetime(start_date)].copy()

print(f"Date range: {df_full['Date'].min()} to {df_full['Date'].max()}")

Date range: 2007-04-15 00:00:00 to 2024-03-15 00:00:00

from IPython.display import display

if 'df_full' in globals():
    display(df_full.head(3))
    display(df_full.tail(3))

import importlib
import subprocess
import sys

import pandas as pd
import numpy as np

try:
    from statsmodels.formula.api import ols
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'statsmodels'])
    from statsmodels.formula.api import ols

# Define the path to the Excel file
xls_path = resolve_dataset_path()

# Load the data from the specified sheet with the data starting on row 2
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

# Ensure the 'Date' column is datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Convert 'Date' to a numerical format for regression analysis (ordinal)
df['Date_ordinal'] = df['Date'].apply(lambda x: x.toordinal() if pd.notna(x) else np.nan)
df = df.dropna(subset=['Date_ordinal', 'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'])

# Fit the linear regression model
model = ols("Q('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)') ~ Date_ordinal", data=df).fit()

# Print the summary of the model results
print(model.summary().as_text().split('Notes:')[0])

# Extract the coefficients to formulate the regression equation
intercept, slope = model.params
print(f"Regression Equation: Price = {intercept:.4f} + {slope:.4f} * Date_ordinal")

import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Select the U.S. Regular All Formulations Retail Gasoline Prices column
data = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)']

# Handle missing values by forward filling then backward filling to ensure no gaps
data_filled = data.ffill().bfill()

# Decompose the series to understand its components
decomposition = seasonal_decompose(data_filled, model='additive', period=12)
fig = decomposition.plot()
plt.show()

# Fit a SARIMA model
# Note: p, d, q values and seasonal P, D, Q, M should be selected based on ACF, PACF plots and domain understanding
model = SARIMAX(data_filled, order=(1,1,1), seasonal_order=(1,1,1,12))
results = model.fit(disp=False)

# Check diagnostics to ensure no assumptions are violated
results.plot_diagnostics(figsize=(15, 12))
plt.show()

# Forecast for the next 12 months
forecast = results.get_forecast(steps=12)
mean_forecast = forecast.predicted_mean
confidence_intervals = forecast.conf_int()

# Dates for the forecast
forecast_dates = pd.date_range(data_filled.index[-1], periods=12, freq='ME')

# Create a plot with historical data and forecast
fig = go.Figure()

# Historical data
fig.add_trace(go.Scatter(x=data_filled.index, y=data_filled, mode='lines', name='Historical Prices'))

# Forecast data
fig.add_trace(go.Scatter(x=forecast_dates, y=mean_forecast, mode='lines', name='Forecast'))

# Fill area between confidence intervals
fig.add_trace(go.Scatter(x=np.concatenate([forecast_dates, forecast_dates[::-1]]),
                         y=np.concatenate([confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1][::-1]]),
                         fill='toself', fillcolor='rgba(0,100,80,0.2)', line=dict(color='rgba(255,255,255,0)'),
                         name='Confidence Interval'))

# Update layout
fig.update_layout(title='Forecast of U.S. Regular Gasoline Prices', xaxis_title='Date', yaxis_title='Price (Dollars per Gallon)', template='plotly_white')
fig.show()

C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)

C:\Users\pc\OneDrive\Desktop\Python\CodexPrime\venv\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt

# Load the dataset
xls_path = resolve_dataset_path()
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Convert dates into numerical format for regression
df['Date_ordinal'] = df.index.map(dt.datetime.toordinal)
data_filled = df['U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)'].ffill().bfill()

# Prepare X and y for regression
X = df['Date_ordinal'].values.reshape(-1, 1)
y = data_filled.values

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Print the coefficients of the model
print("Intercept (b0):", model.intercept_)
print("Slope (b1) for Date:", model.coef_[0])

# Plotting the results with formatted dates
plt.figure(figsize=(12, 6))
plt.scatter(df.index, y, color='blue', label='Actual Prices')
plt.plot(df.index, model.predict(X), color='red', linewidth=2, label='Fitted Line')
plt.title('Linear Regression on Gasoline Prices')
plt.xlabel('Date')
plt.ylabel('U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)')

# Format the x-axis to improve readability
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())  # Use AutoDateLocator without additional parameters
plt.gca().xaxis.set_major_formatter(mdates.ConciseDateFormatter(mdates.AutoDateLocator()))  # Use concise formatter for cleaner labels

plt.gcf().autofmt_xdate()  # Automatically rotate date labels for better readability
plt.legend()
plt.show()

Intercept (b0): -155.08130894484586
Slope (b1) for Date: 0.0002146200118198708

import subprocess
import sys

import pandas as pd
import matplotlib.pyplot as plt

try:
    import seaborn as sns
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
    import seaborn as sns

# File path from the original code
xls_path = resolve_dataset_path()

# Load the dataset
df = pd.read_excel(xls_path, sheet_name='Data 1', skiprows=2)

# Select the relevant columns used in the analysis
relevant_columns = [
    'U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)',
    'U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)'
]

# Selecting only the relevant columns
df_clean = df[relevant_columns]

# Calculating the correlation matrix
correlation_matrix = df_clean.corr()

# Displaying the correlation matrix
display(correlation_matrix.round(3))

# Visualization of the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Fuel Price Variables')
plt.show()

	Back to Contents	Data 1: U.S. Gasoline and Diesel Retail Prices	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6	Unnamed: 7	Unnamed: 8	Unnamed: 9	Unnamed: 10	Unnamed: 11	Unnamed: 12	Unnamed: 13	Unnamed: 14	Unnamed: 15
0	Sourcekey	EMM_EPM0_PTE_NUS_DPG	EMM_EPM0U_PTE_NUS_DPG	EMM_EPM0R_PTE_NUS_DPG	EMM_EPMR_PTE_NUS_DPG	EMM_EPMRU_PTE_NUS_DPG	EMM_EPMRR_PTE_NUS_DPG	EMM_EPMM_PTE_NUS_DPG	EMM_EPMMU_PTE_NUS_DPG	EMM_EPMMR_PTE_NUS_DPG	EMM_EPMP_PTE_NUS_DPG	EMM_EPMPU_PTE_NUS_DPG	EMM_EPMPR_PTE_NUS_DPG	EMD_EPD2D_PTE_NUS_DPG	EMD_EPD2DXL0_PTE_NUS_DPG	EMD_EPD2DM10_PTE_NUS_DPG
1	Date	U.S. All Grades All Formulations Retail Gasoli...	U.S. All Grades Conventional Retail Gasoline P...	U.S. All Grades Reformulated Retail Gasoline P...	U.S. Regular All Formulations Retail Gasoline ...	U.S. Regular Conventional Retail Gasoline Pric...	U.S. Regular Reformulated Retail Gasoline Pric...	U.S. Midgrade All Formulations Retail Gasoline...	U.S. Midgrade Conventional Retail Gasoline Pri...	U.S. Midgrade Reformulated Retail Gasoline Pri...	U.S. Premium All Formulations Retail Gasoline ...	U.S. Premium Conventional Retail Gasoline Pric...	U.S. Premium Reformulated Retail Gasoline Pric...	U.S. No 2 Diesel Retail Prices (Dollars per Ga...	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) R...	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retai...
2	1990-08-15 00:00:00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1990-09-15 00:00:00	NaN	NaN	NaN	1.258	1.258	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1990-10-15 00:00:00	NaN	NaN	NaN	1.335	1.335	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Date	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
341	2019-01-15	2.338	2.214	2.575	2.248	2.145	2.464	2.658	2.498	2.884	2.904	2.768	3.060	2.980	2.980	NaN
342	2019-02-15	2.393	2.286	2.603	2.309	2.223	2.495	2.682	2.531	2.898	2.936	2.814	3.077	2.997	2.997	NaN
343	2019-03-15	2.594	2.503	2.774	2.516	2.443	2.673	2.854	2.720	3.045	3.107	3.006	3.223	3.076	3.076	NaN
344	2019-04-15	2.881	2.755	3.131	2.798	2.694	3.023	3.165	2.980	3.434	3.417	3.264	3.597	3.121	3.121	NaN
345	2019-05-15	2.946	2.796	3.243	2.859	2.731	3.136	3.250	3.051	3.542	3.505	3.328	3.713	3.161	3.161	NaN

	Date	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
401	2024-01-15	3.197	3.045	3.495	3.075	2.957	3.331	3.619	3.391	3.945	3.969	3.764	4.206	3.854	3.854	NaN
402	2024-02-15	3.328	3.188	3.605	3.212	3.102	3.446	3.729	3.511	4.042	4.077	3.888	4.296	4.044	4.044	NaN
403	2024-03-15	3.542	3.402	3.815	3.426	3.318	3.657	3.938	3.719	4.252	4.286	4.098	4.506	4.022	4.022	NaN

	Date	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
198	2007-02-15	2.323	2.276	2.418	2.278	2.235	2.369	2.385	2.325	2.500	2.479	2.431	2.567	2.488	2.513	2.435
199	2007-03-15	2.609	2.546	2.738	2.563	2.503	2.689	2.670	2.593	2.820	2.766	2.706	2.877	2.667	2.680	2.630
200	2007-04-15	2.891	2.831	3.013	2.845	2.787	2.965	2.953	2.881	3.093	3.052	2.996	3.155	2.834	2.847	2.790
201	2007-05-15	3.187	3.157	3.248	3.146	3.119	3.202	3.241	3.199	3.321	3.331	3.300	3.389	2.796	2.818	2.726
202	2007-06-15	3.102	3.067	3.173	3.056	3.024	3.122	3.161	3.118	3.247	3.263	3.227	3.330	2.808	2.826	2.746

	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
Date
2007-02-15	2.323	2.276	2.418	2.278	2.235	2.369	2.385	2.325	2.500	2.479	2.431	2.567	2.488	2.513	2.435
2007-03-15	2.609	2.546	2.738	2.563	2.503	2.689	2.670	2.593	2.820	2.766	2.706	2.877	2.667	2.680	2.630
2007-04-15	2.891	2.831	3.013	2.845	2.787	2.965	2.953	2.881	3.093	3.052	2.996	3.155	2.834	2.847	2.790
2007-05-15	3.187	3.157	3.248	3.146	3.119	3.202	3.241	3.199	3.321	3.331	3.300	3.389	2.796	2.818	2.726
2007-06-15	3.102	3.067	3.173	3.056	3.024	3.122	3.161	3.118	3.247	3.263	3.227	3.330	2.808	2.826	2.746

	Unnamed: 0	Unnamed: 1	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6
0	NaN	Workbook Contents	NaN	NaN	NaN	NaN	NaN
1	NaN	U.S. Gasoline and Diesel Retail Prices	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Unnamed: 0	Unnamed: 1	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6
12	NaN	Source:	Energy Information Administration	NaN	NaN	NaN	NaN
13	NaN	For Help, Contact:	infoctr@eia.gov	NaN	NaN	NaN	NaN
14	NaN	NaN	(202) 586-8800	NaN	NaN	4/1/2024 5:32:13 PM	NaN

	Date	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
0	1990-08-15	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1990-09-15	NaN	NaN	NaN	1.258	1.258	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1990-10-15	NaN	NaN	NaN	1.335	1.335	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	dtype
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	float64
...	...
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	float64
U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	float64
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	float64
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)	float64

	Mean	Median	Standard Deviation	Variance	Range
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	3.074	2.948	0.615	0.378	3.160
U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)	2.979	2.866	0.608	0.369	3.050
U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)	3.265	3.145	0.638	0.407	3.380
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	2.986	2.860	0.620	0.384	3.165
U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)	2.902	2.808	0.616	0.379	3.083
...	...	...	...	...	...
U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)	3.396	3.348	0.616	0.380	3.268
U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)	3.633	3.584	0.689	0.475	3.643
U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	3.403	3.253	0.766	0.587	3.756
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	3.404	3.253	0.766	0.587	3.756
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)	NaN	NaN	NaN	NaN	NaN

	U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)
U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)	1.000	0.999	0.980	0.957	0.943	0.943	NaN
U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)	0.999	1.000	0.975	0.949	0.938	0.938	NaN
U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)	0.980	0.975	1.000	0.995	0.958	0.958	NaN
U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)	0.957	0.949	0.995	1.000	0.953	0.953	NaN
U.S. No 2 Diesel Retail Prices (Dollars per Gallon)	0.943	0.938	0.958	0.953	1.000	1.000	NaN
U.S. No 2 Diesel Ultra Low Sulfur (0-15 ppm) Retail Prices (Dollars per Gallon)	0.943	0.938	0.958	0.953	1.000	1.000	NaN
U.S. No 2 Diesel Low Sulfur (15-500 ppm) Retail Prices (Dollars per Gallon)	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Predicted Price
2024-03-31	3.59018
2024-04-30	3.67058
2024-05-31	3.69817
2024-06-30	3.58821
2024-07-31	3.67058
2024-08-31	3.69817
2024-09-30	3.58821
2024-10-31	3.67058
2024-11-30	3.69817
2024-12-31	3.58821
2025-01-31	3.67058
2025-02-28	3.69817