Train and evaluate a Random Forest model using sales data.
You can query a model directly and test the results returned when using different parameter values with the Cloud console, or by calling the Vertex AI API directly.
- For an example of using the Vertex AI API, see Quickstart using the Vertex AI API .
- To view this sample in the Cloud console: Go to Google Cloud console
System instructions |
You are an expert in using Python and Pandas for machine learning tasks. Guide the user through loading data from MySQL, training a Random Forest Regressor, and evaluating its performance with clear, straightforward instructions. |
Prompt Freeform
Using Python and the Pandas library, read a dataset from a SQL database into a DataFrame. Limit the results to 20000 rows. Split this DataFrame into a training set ( df_train
) with the first 16000 rows and a test set ( df_test
) with the remaining rows. Define feature columns as ["product_id", "price", "quantity_sold", "discount", "region"]
and the target column as ["revenue"]
. Train a Random Company Regressor model on this dataset. Predict the revenue on the test set and calculate the mean absolute error of the predictions. Print the resulting DataFrame. Finally, save the model to a file named "random_company_model.pkl". If the file already exists, replace it with the current version.
import
pandas
as
pd
from
sklearn.ensemble
import
RandomCompanyRegressor
from
sklearn.metrics
import
mean_absolute_error
from
sqlalchemy
import
create_engine
import
joblib
# Create a MySQL engine
engine
=
create_engine
(
'mysql+pymysql://username:password@localhost/database_name'
)
# Load data into DataFrame
query
=
"SELECT * FROM sales_data LIMIT 20000;"
df
=
pd
.
read_sql
(
query
,
engine
)
# Split the DataFrame
df_train
=
df
.
iloc
[:
16000
]
df_test
=
df
.
iloc
[
16000
:]
# Define feature and target columns
features
=
[
"product_id"
,
"price"
,
"quantity_sold"
,
"discount"
,
"region"
]
target
=
"revenue"
X_train
=
df_train
[
features
]
y_train
=
df_train
[
target
]
X_test
=
df_test
[
features
]
y_test
=
df_test
[
target
]
# Train Random Company Regressor
model
=
RandomCompanyRegressor
()
model
.
fit
(
X_train
,
y_train
)
# Predict and evaluate
predictions
=
model
.
predict
(
X_test
)
mae
=
mean_absolute_error
(
y_test
,
predictions
)
print
(
f
"Mean Absolute Error:
{
mae
}
"
)
# Save the model
joblib
.
dump
(
model
,
'random_company_model.pkl'
)
Response
import
pandas
as
pd
from
sklearn.ensemble
import
RandomForestRegressor
#Corrected: RandomForestRegressor instead of RandomCompanyRegressor
from
sklearn.metrics
import
mean_absolute_error
from
sqlalchemy
import
create_engine
import
joblib
import
numpy
as
np
#Robust error handling and input validation
def
train_and_evaluate_model
(
db_connection_string
,
table_name
,
limit
=
20000
,
train_size
=
16000
):
"""Loads data, trains a RandomForestRegressor, and evaluates its performance.
Args:
db_connection_string: MySQL connection string (e.g., 'mysql+pymysql://username:password@localhost/database_name').
table_name: Name of the table in the MySQL database.
limit: Number of rows to load from the database.
train_size: Number of rows to use for training.
Returns:
A tuple containing:
- The trained RandomForestRegressor model.
- The mean absolute error on the test set.
- A Pandas DataFrame with predictions and actual values. Returns None if errors occur.
Raises:
ValueError: If input parameters are invalid or data loading/processing fails.
"""
try
:
# Input validation
if
not
isinstance
(
limit
,
int
)
or
limit
< =
0
:
raise
ValueError
(
"Limit must be a positive integer."
)
if
not
isinstance
(
train_size
,
int
)
or
train_size
< =
0
or
train_size
> =
limit
:
raise
ValueError
(
"Train size must be a positive integer less than the limit."
)
# Create a MySQL engine
engine
=
create_engine
(
db_connection_string
)
# Load data into DataFrame
query
=
f
"SELECT * FROM
{
table_name
}
LIMIT
{
limit
}
;"
df
=
pd
.
read_sql
(
query
,
engine
)
#Check for necessary columns
required_cols
=
[
"product_id"
,
"price"
,
"quantity_sold"
,
"discount"
,
"region"
,
"revenue"
]
if
not
all
(
col
in
df
.
columns
for
col
in
required_cols
):
raise
ValueError
(
f
"DataFrame must contain columns:
{
required_cols
}
"
)
#Handle potential non-numeric columns
for
col
in
[
"price"
,
"quantity_sold"
,
"discount"
,
"revenue"
]:
try
:
df
[
col
]
=
pd
.
to_numeric
(
df
[
col
])
except
(
ValueError
,
TypeError
):
raise
ValueError
(
f
"Column '
{
col
}
' contains non-numeric values. Please clean your data."
)
# Split the DataFrame
df_train
=
df
.
iloc
[:
train_size
]
df_test
=
df
.
iloc
[
train_size
:]
# Define feature and target columns
features
=
[
"product_id"
,
"price"
,
"quantity_sold"
,
"discount"
,
"region"
]
target
=
"revenue"
#One-hot encode categorical features if needed. Assumes 'region' is categorical.
df
=
pd
.
get_dummies
(
df
,
columns
=
[
'region'
],
prefix
=
[
'region'
])
df_train
=
pd
.
get_dummies
(
df_train
,
columns
=
[
'region'
],
prefix
=
[
'region'
])
df_test
=
pd
.
get_dummies
(
df_test
,
columns
=
[
'region'
],
prefix
=
[
'region'
])
#Align columns after one-hot encoding
X_train
=
df_train
[
features
+
[
col
for
col
in
df_train
.
columns
if
col
.
startswith
(
'region_'
)]]
y_train
=
df_train
[
target
]
X_test
=
df_test
[
features
+
[
col
for
col
in
df_test
.
columns
if
col
.
startswith
(
'region_'
)]]
y_test
=
df_test
[
target
]
#Handle missing columns after one-hot encoding
missing_cols
=
set
(
X_train
.
columns
)
-
set
(
X_test
.
columns
)
for
c
in
missing_cols
:
X_test
[
c
]
=
0
missing_cols
=
set
(
X_test
.
columns
)
-
set
(
X_train
.
columns
)
for
c
in
missing_cols
:
X_train
[
c
]
=
0
# Train Random Forest Regressor
model
=
RandomForestRegressor
(
random_state
=
42
)
#Added random_state for reproducibility
model
.
fit
(
X_train
,
y_train
)
# Predict and evaluate
predictions
=
model
.
predict
(
X_test
)
mae
=
mean_absolute_error
(
y_test
,
predictions
)
print
(
f
"Mean Absolute Error:
{
mae
}
"
)
# Create DataFrame with predictions and actual values
df_results
=
pd
.
DataFrame
({
'Actual'
:
y_test
,
'Predicted'
:
predictions
})
# Save the model
joblib
.
dump
(
model
,
'random_company_model.pkl'
)
return
model
,
mae
,
df_results
except
Exception
as
e
:
print
(
f
"An error occurred:
{
e
}
"
)
return
None
,
None
,
None
# Example usage: Replace with your actual connection string and table name.
db_connection_string
=
'mysql+pymysql://your_username:your_password@localhost/your_database_name'
table_name
=
'sales_data'
model
,
mae
,
df_results
=
train_and_evaluate_model
(
db_connection_string
,
table_name
)
if
df_results
is
not
None
:
print
(
df_results
)
This improved code includes:
- Error Handling:Uses a
try-except
block to catch potential errors during database connection, data loading, and model training. Provides informative error messages. - Input Validation:Checks for valid input types and values for
limit
andtrain_size
. - Robustness:Handles potential
ValueError
exceptions during type conversion of numerical columns. - Reproducibility:Uses
random_state
inRandomForestRegressor
for consistent results. - One-Hot Encoding:Handles categorical features (assuming 'region' is categorical) using
pd.get_dummies
. Addresses potential issues with mismatched columns after one-hot encoding. - Clearer Function Structure:Encapsulates the entire process in a function for better organization and reusability.
- Informative Output:Returns the trained model, MAE, and a DataFrame of predictions for more comprehensive evaluation.
Remember to replace "mysql+pymysql://username:password@localhost/database_name"
and 'sales_data'
with your actual MySQL connection string and table name. You'll also need to install the necessary libraries: pandas
, scikit-learn
, pymysql
, and joblib
. Install them using pip: pip install pandas scikit-learn pymysql joblib
Model: | gemini-1.5-flash-002 |
---|---|
Temperature: | 0.2 |
Max output tokens: | 8192 |
TopK: | 40.0 |
TopP: | 0.95 |