Example realdata

%load_ext autoreload
%autoreload 2

import torch
import geospaNN
import numpy as np
import time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as img 
import geopandas as gpd
from shapely.geometry import Point
from scipy import spatial, interpolate
import warnings
warnings.filterwarnings("ignore")

url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_nation_20m.zip"
us = gpd.read_file(url).explode()
us = us.loc[us.geometry.apply(lambda x: x.exterior.bounds[2])<-60]

df_covariates = pd.read_csv('./data/covariate0605.csv')
df_pm25 = pd.read_csv('./data/pm25_0605.csv')
df_pm25 = df_pm25.loc[df_pm25.Latitude < 50]

x_min,y_min,x_max,y_max = np.array([np.min(df_covariates['long']), np.min(df_covariates['lat']),
    np.max(df_covariates['long']), np.max(df_covariates['lat'])])
arr1 = np.mgrid[x_min:x_max:101j, y_min:y_max:101j]

# extract the x and y coordinates as flat arrays
arr1x = np.ravel(arr1[0])
arr1y = np.ravel(arr1[1])
# using the X and Y columns, build a dataframe, then the geodataframe
df = pd.DataFrame({'X':arr1x, 'Y':arr1y})
df['coords'] = list(zip(df['X'], df['Y']))
df['coords'] = df['coords'].apply(Point)

gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(x=df.X, y=df.Y),crs = us.crs)
inUS = gdf['geometry'].apply(lambda s: s.within(us.geometry.unary_union))

lonlat_pm25=df_pm25.values[:,[1,2]]
near = df_covariates.values[:,[1,2]]
tree = spatial.KDTree(list(zip(near[:,0].ravel(), near[:,1].ravel())))
idx = tree.query(lonlat_pm25)[1]
df_pm25_mean = df_pm25.assign(neighbor = idx).groupby('neighbor')['PM25'].mean()
idx_new = df_pm25_mean.index.values
pm25 = df_pm25_mean.values
z = pm25[:,None]

lon = df_covariates.values[:,1]
lat = df_covariates.values[:,2]

f = interpolate.Rbf(lon[idx_new], lat[idx_new], z, function = 'inverse')
x_test = gdf.loc[inUS,:].X
y_test = gdf.loc[inUS,:].Y
z_test = f(x_test, y_test)

plt.clf()
fig, ax = plt.subplots(figsize=(9, 5))
c = ax.scatter(x = x_test, y = y_test, s = 10, c = z_test, marker = 's', alpha = 0.7)
ax.plot(np.array(df_pm25['Longitude']), np.array(df_pm25['Latitude']), 'o', c = 'orange', markersize = 4)
ax.set_title('')
fig.colorbar(c, ax=ax)
plt.show()

<Figure size 640x480 with 0 Axes>

png

lon = df_covariates.values[:,1]
lat = df_covariates.values[:,2]
covariates = df_covariates.values[:,3:]
normalized_lon = (lon-min(lon))/(max(lon)-min(lon))
normalized_lat = (lat-min(lat))/(max(lat)-min(lat))
normalized_x_test = (x_test-min(lon))/(max(lon)-min(lon))
normalized_y_test = (y_test-min(lat))/(max(lat)-min(lat))

s_obs = np.vstack((normalized_lon[idx_new],normalized_lat[idx_new])).T
X = covariates[idx_new,:]
normalized_X = X
for i in range(X.shape[1]):
    normalized_X[:,i] = (X[:,i]-min(X[:,i]))/(max(X[:,i])-min(X[:,i]))

X = normalized_X
Y = z.reshape(-1)
coord = s_obs
#columns = ['precipitation', 'temperature', 'air pressure', 'relative humidity', 'U-wind', 'V-wind',
#           'PM 2.5', 'longitude', 'latitude']
#df = pd.DataFrame(data=data, index=range(data.shape[0]), columns=columns)
#df.to_csv('./data/Normalized_PM2.5_20190605.csv')

data_PM25 = pd.read_csv("./data/Normalized_PM2.5_20190605.csv")
data_PM25

	Unnamed: 0	precipitation	temperature	air pressure	relative humidity	U-wind	V-wind	PM 2.5	longitude	latitude
0	0	0.008044	0.362296	0.887664	0.774197	0.868530	0.781498	5.020834	0.980311	0.906268
1	1	0.005516	0.355305	0.882153	0.751742	0.864206	0.770715	3.837500	0.983093	0.889762
2	2	0.000000	0.335323	0.928359	0.714189	0.697080	0.813224	2.041666	0.974238	0.814722
3	3	0.000000	0.338579	0.954218	0.690767	0.625266	0.868161	3.669444	0.976951	0.798275
4	4	0.002528	0.293827	0.893599	0.685830	0.688808	0.842395	1.020833	0.945193	0.800337
...	...	...	...	...	...	...	...	...	...	...
600	600	0.393932	0.901079	0.972022	0.910279	0.642436	0.469410	5.168750	0.469337	0.101640
601	601	0.000689	0.810329	0.897414	0.539392	0.553265	0.464926	6.041666	0.436702	0.098054
602	602	0.294415	0.882501	0.972022	0.822590	0.642248	0.469353	8.704166	0.468665	0.090378
603	603	0.011492	0.811830	0.965240	0.719028	0.676463	0.512429	8.725000	0.460179	0.035680
604	604	0.256952	0.775792	0.975837	0.802168	0.696516	0.509137	8.213636	0.470595	0.032926

605 rows × 10 columns

X = torch.from_numpy(data_PM25[['precipitation', 'temperature', 'air pressure', 'relative humidity', 'U-wind', 'V-wind']].to_numpy()).float()
Y = torch.from_numpy(data_PM25[['PM 2.5']].to_numpy().reshape(-1)).float()
coord = torch.from_numpy(data_PM25[['longitude', 'latitude']].to_numpy()).float()

p = X.shape[1]

n = X.shape[0]
nn = 20
batch_size = 50

X, Y, coord, _ = geospaNN.spatial_order(X, Y, coord, method = 'max-min')
data = geospaNN.make_graph(X, Y, coord, nn)

torch.manual_seed(2024)
np.random.seed(0)
data_train, data_val, data_test = geospaNN.split_data(X, Y, coord, neighbor_size = 20, 
                                                   test_proportion = 0.5)

start_time = time.time()
mlp_nn = torch.nn.Sequential(
    torch.nn.Linear(p, 50),
    torch.nn.ReLU(),
    torch.nn.Linear(50, 20),
    torch.nn.ReLU(),
    torch.nn.Linear(20, 1),
)
nn_model = geospaNN.nn_train(mlp_nn, lr =  0.01, min_delta = 0.001)
training_log = nn_model.train(data_train, data_val, data_test)

Epoch 00031: reducing learning rate of group 0 to 5.0000e-03.

theta0 = geospaNN.theta_update(torch.tensor([1, 1.5, 0.01]), mlp_nn(data_train.x).squeeze() - data_train.y, data_train.pos, neighbor_size = 20)
mlp_nngls = torch.nn.Sequential(
    torch.nn.Linear(p, 100),
    torch.nn.ReLU(),
    torch.nn.Linear(100, 50),
    torch.nn.ReLU(),
    torch.nn.Linear(50, 20),
    torch.nn.ReLU(),
    torch.nn.Linear(20, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1),
)
model = geospaNN.nngls(p=p, neighbor_size=nn, coord_dimensions=2, mlp=mlp_nngls, theta=torch.tensor(theta0))
nngls_model = geospaNN.nngls_train(model, lr =  0.01, min_delta = 0.001)
training_log = nngls_model.train(data_train, data_val, data_test,
                                 Update_init = 20, Update_step = 10)
end_time = time.time()

Theta updated from
[1.   1.5  0.01]
Epoch 00020: reducing learning rate of group 0 to 5.0000e-03.
Theta updated from
[10.29591688 10.3716713   0.20216037]
to
[11.32370202  8.14642528  0.15268137]
Epoch 00029: reducing learning rate of group 0 to 2.5000e-03.
Theta updated from
[11.32370202  8.14642528  0.15268137]
to
[10.57182313  8.82062549  0.15966506]
INFO: Early stopping
End at epoch32

print(f"\rRunning time: {end_time - start_time} seconds")

Running time: 7.1364500522613525 seconds

[test_predict, test_U, test_L] = model.predict(data_train, data_test, CI = True)
plt.clf()
plt.scatter(test_predict.detach().numpy(), data_test.y.detach().numpy(), s = 1, label = 'Truth vs prediction')
plt.scatter(data_test.y.detach().numpy(), data_test.y.detach().numpy(), s = 1, label = 'reference')
plt.xlabel("Prediction")
plt.ylabel("Truth")
plt.legend()
plt.show()

png

f_pred = interpolate.CloughTocher2DInterpolator(list(zip(data_test.pos.detach().numpy()[:,0], 
                                                data_test.pos.detach().numpy()[:,1])),
                                                test_predict.detach().numpy())
f_U = interpolate.CloughTocher2DInterpolator(list(zip(data_test.pos.detach().numpy()[:,0], 
                                                data_test.pos.detach().numpy()[:,1])),
                                                test_U.detach().numpy())
f_L = interpolate.CloughTocher2DInterpolator(list(zip(data_test.pos.detach().numpy()[:,0], 
                                                data_test.pos.detach().numpy()[:,1])),
                                                test_L.detach().numpy())
f_true = interpolate.CloughTocher2DInterpolator(list(zip(data_test.pos.detach().numpy()[:,0], 
                                                data_test.pos.detach().numpy()[:,1])),
                                                data_test.y.detach().numpy())


titles = np.array([['Prediction', 'Truth'], ['Lower confidence bound', 'Upper confidence bound']])
f_vec = np.array([[f_pred, f_true], [f_L, f_U]])

fig, ax = plt.subplots(2, 2, figsize=(20, 10))
for i in range(2):
    for j in range(2):
        im = ax[i,j].scatter(x = normalized_x_test, y = normalized_y_test, s = 9, 
                             c = f_vec[i,j](normalized_x_test, normalized_y_test), marker = 's', alpha = 0.7, 
                             vmin=0, vmax=20)
        ax[i,j].title.set_text(titles[i,j])
        fig.colorbar(im)
    #cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
ax[0,1].plot(data_test.pos.detach().numpy()[:,0], 
           data_test.pos.detach().numpy()[:,1], 'o', c = 'orange', markersize = 4)
plt.show()

png

variable_names = ['Precipitation accumulation', 'Air temperature', 'Pressure', 'Relative humidity', 'U-wind', 'V-wind']
geospaNN.plot_PDP(model, X, variable_names)

Precipitation accumulation Air Temperature Pressure

Relative Humidity U-wind V-wind