# Troubleshooting: if you encounter async/sync issues, add this before
# creating the Dask client:
# import dask.base
# dask.base._ensure_not_async = lambda client: client.get
import time

import numpy as np
from dask.distributed import Client
from dask_ml.datasets import make_classification
from dask_ml.model_selection import IncrementalSearchCV
from sklearn.linear_model import SGDClassifier

# IncrementalSearchCV relies on the distributed scheduler, so start a local client
client = Client(processes=False)
# 200M samples x 20 features x 8 bytes (float64) ~ 32 GB in total
# The data is generated lazily in 500k-row chunks, so it never sits in RAM
# all at once
X, y = make_classification(n_samples=200_000_000, n_features=20,
                           chunks=500_000, random_state=0)
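
# Optional sanity check: X is a lazy dask.array, so printing it shows the
# shape, dtype and chunk layout rather than the values; each 500_000 x 20
# float64 chunk is ~80 MB, computed on demand during fitting
print(X)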
# SGDClassifier supports partial_fit, which IncrementalSearchCV requires
# elasticnet combines L1 (sparsity) and L2 (stability) regularisation
model = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)
# 3000 x 3000 x 2 = 18 million possible combinations; IncrementalSearchCV
# samples candidates from this space rather than trying them all
params = {
    'alpha': np.logspace(-3, 2, num=3000),    # regularisation strength
    'l1_ratio': np.linspace(0, 1, num=3000),  # L1/L2 balance (0 = L2, 1 = L1)
    'average': [True, False],                 # average SGD weights or not
}
# decay_rate=None keeps all candidates active throughout training
search = IncrementalSearchCV(model, params, random_state=0, decay_rate=None)
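
# During fit, each sampled candidate is trained chunk by chunk via partial_fit
# and scored on a held-out slice of the data; with decay_rate=None no candidate
# is dropped early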
start_time = time.time()
search.fit(X, y, classes=[0, 1])
elapsed_time = time.time() - start_time
print("Best parameters found: ", search.best_params_)
print("Best score: ", search.best_score_)
print("Best estimator: ", search.best_estimator_)
print(f"Time taken: {elapsed_time:.2f} seconds")