Skip to content

Commit 206e37a

Browse files
committed
general update pre paper
1 parent 88d1f6e commit 206e37a

15 files changed

+241
-664
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
batchfile.slurm
33
crosstest_batchfile.slurm
44
tunning_batchfile.slurm
5-
__pycache__/
5+
__pycache__/
6+
plotresults.py

.gitignore~

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.DS_Store
2+
batchfile.slurm
3+
crosstest_batchfile.slurm
4+
tunning_batchfile.slurm
5+
__pycache__/

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ See the paper [arXiv:2204.xxxxx](https://arxiv.org/abs/2204.xxxxx) for more deta
1111
<img src="visualize_graph_10.png" width="500">
1212

1313

14-
## Scripts
14+
## Codes
1515

1616
Here is a brief description of the codes included:
1717

@@ -51,13 +51,18 @@ The libraries required for training the models and compute some statistics are:
5151
* `scipy`
5252
* `sklearn`
5353
* `optuna` (only for optimization in `hyperparams_optimization.py`)
54+
* [`Pylians`](https://pylians3.readthedocs.io/en/master/) (only for computing power spectra in `ps_test.py`)
5455

5556

5657
## Usage
5758

59+
The codes implemented here are designed to train GNNs for two tasks. The desired task is chosen in `hyperparameters.py` with the `outmode` flag:
60+
1. Infer cosmological parameters from galaxy catalogues. Set `outmode = "cosmo"`.
61+
2. Predict the power spectrum from galaxy catalogues. Set `outmode = "ps"`.
62+
5863
Here is some advice on how to use the scripts described above:
5964
1. To perform a search of the optimal hyperparameters, run `hyperparams_optimization.py`.
60-
2. To train a model with a given set of parameters defined in `hyperparameters.py`, run `main.py`. The hyperparameters currently present in `hyperparameters.py` correspond to the best optimal values for each suite when all galactic features are employed (see the paper).
65+
2. To train a model with a given set of parameters defined in `hyperparameters.py`, run `main.py`. The hyperparameters currently present in `hyperparameters.py` correspond to the optimal values for each suite when all galactic features are employed (see the paper). Modify them according to the task.
6166
3. Once a model is trained, run `crosstest.py` to test it on the simulation suite used for training and to cross-test it on the other one included in CAMELS (IllustrisTNG and SIMBA).
6267

6368

Source/constants.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#----------------------------------------------------------------------
22
# List of constants and some common functions
33
# Author: Pablo Villanueva Domingo
4-
# Last update: 10/11/21
4+
# Last update: 4/22
55
#----------------------------------------------------------------------
66

77
import numpy as np
@@ -39,15 +39,12 @@
3939
# Batch size
4040
batch_size = 25
4141

42-
43-
4442
# Number of k bins in the power spectrum
4543
ps_size = 79
4644

4745
#--- FUNCTIONS ---#
4846

49-
5047
# Choose color depending on the CAMELS simulation suite
5148
def colorsuite(suite):
5249
if suite=="IllustrisTNG": return "purple"
53-
elif suite=="SIMBA": return "deepskyblue"
50+
elif suite=="SIMBA": return "dodgerblue"

Source/load_data.py

+50-136
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
#----------------------------------------------------
2+
# Routine for loading the CAMELS galaxy catalogues
3+
# Author: Pablo Villanueva Domingo
4+
# Last update: 4/22
5+
#----------------------------------------------------
6+
17
import h5py
28
from torch_geometric.data import Data, DataLoader
39
from Source.constants import *
@@ -6,20 +12,25 @@
612

713
Nstar_th = 20 # Minimum number of stellar particles required to consider a galaxy
814

15+
# Normalize CAMELS parameters
916
def normalize_params(params):
    """Rescale CAMELS parameters with fixed min-max bounds.

    Each of the six parameters is mapped linearly so that its lower bound
    becomes 0 and its upper bound becomes 1.

    Args:
        params: numpy array whose last axis holds the six parameters, in the
            same order as the bounds below.

    Returns:
        numpy array with the same shape as ``params``, computed as
        ``(params - minimum) / (maximum - minimum)``.
    """
    # Fixed lower/upper bounds, one entry per parameter.
    # NOTE(review): presumably ordered as (Omega_m, sigma_8, A_SN1, A_AGN1,
    # A_SN2, A_AGN2), matching the columns of the parameter file - confirm.
    minimum = np.array([0.1, 0.6, 0.25, 0.25, 0.5, 0.5])
    maximum = np.array([0.5, 1.0, 4.00, 4.00, 2.0, 2.0])

    return (params - minimum) / (maximum - minimum)
1522

23+
# Normalize power spectrum
1624
def normalize_ps(ps):
    """Standardize power spectra to zero mean and unit variance along axis 0.

    Args:
        ps: numpy array; the mean and standard deviation are taken over
            axis 0 and broadcast back over ``ps``.

    Returns:
        numpy array with the same shape as ``ps``, computed as
        ``(ps - mean) / std``.
    """
    mean, std = ps.mean(axis=0), ps.std(axis=0)

    # A constant column has zero standard deviation, and 0/0 would yield NaN.
    # Use a unit scale there so that such columns are mapped to 0 instead.
    std = np.where(std == 0, 1.0, std)

    return (ps - mean) / std
2028

29+
# Compute KDTree and get edges and edge features
2130
def get_edges(pos, r_link, use_loops):
2231

32+
# 1. Get edges
33+
2334
# Create the KDTree and look for pairs within a distance r_link
2435
# Boxsize normalize to 1
2536
kd_tree = SS.KDTree(pos, leafsize=16, boxsize=1.0001)
@@ -37,35 +48,36 @@ def get_edges(pos, r_link, use_loops):
3748
edge_index = edge_index.reshape((2,-1))
3849
num_pairs = edge_index.shape[1]
3950

40-
# Edge attributes
51+
# 2. Get edge attributes
52+
4153
row, col = edge_index
4254
diff = pos[row]-pos[col]
4355

44-
# Correct boundaries in distances
56+
# Take into account periodic boundary conditions, correcting the distances
4557
for i, pos_i in enumerate(diff):
46-
#outbound=False
4758
for j, coord in enumerate(pos_i):
4859
if coord > r_link:
49-
#outbound=True
5060
diff[i,j] -= 1. # Boxsize normalize to 1
5161
elif -coord > r_link:
52-
#outbound=True
5362
diff[i,j] += 1. # Boxsize normalize to 1
54-
#if outbound: numbounds+=1
5563

64+
# Get translationally and rotationally invariant features
65+
# Distance
5666
dist = np.linalg.norm(diff, axis=1)
67+
# Centroid of galaxy catalogue
5768
centroid = np.mean(pos,axis=0)
69+
# Unit vectors of node, neighbor and difference vector
5870
unitrow = (pos[row]-centroid)/np.linalg.norm((pos[row]-centroid), axis=1).reshape(-1,1)
5971
unitcol = (pos[col]-centroid)/np.linalg.norm((pos[col]-centroid), axis=1).reshape(-1,1)
6072
unitdiff = diff/dist.reshape(-1,1)
73+
# Dot products between unit vectors
6174
cos1 = np.array([np.dot(unitrow[i,:].T,unitcol[i,:]) for i in range(num_pairs)])
6275
cos2 = np.array([np.dot(unitrow[i,:].T,unitdiff[i,:]) for i in range(num_pairs)])
63-
64-
#print(edge_index.shape, cos1.shape, cos2.shape, dist.shape)
76+
# Normalize distance by linking radius
6577
dist /= r_link
66-
edge_attr = np.concatenate([dist.reshape(-1,1), cos1.reshape(-1,1), cos2.reshape(-1,1)], axis=1)
6778

68-
#print(pos.shape, edge_index.shape, edge_attr.shape)
79+
# Concatenate to get all edge attributes
80+
edge_attr = np.concatenate([dist.reshape(-1,1), cos1.reshape(-1,1), cos2.reshape(-1,1)], axis=1)
6981

7082
# Add loops
7183
if use_loops:
@@ -78,90 +90,54 @@ def get_edges(pos, r_link, use_loops):
7890
edge_attr = np.append(edge_attr, atrloops, 0)
7991
edge_index = edge_index.astype(int)
8092

81-
#print(pos.shape, edge_index.shape, edge_attr.shape)
82-
83-
84-
85-
#print(edge_index.shape, edge_attr.shape)
86-
87-
88-
"""
89-
diff = (pos[row]-pos[col])/r_link
90-
91-
#print(diff.shape, edge_index.shape, pos.shape)
92-
#numbounds = 0
93-
94-
# Correct boundaries in distances
95-
for i, pos_i in enumerate(diff):
96-
#outbound=False
97-
for j, coord in enumerate(pos_i):
98-
if coord > 1.:
99-
#outbound=True
100-
diff[i,j] -= 1./r_link # Boxsize normalize to 1
101-
elif -coord > 1.:
102-
#outbound=True
103-
diff[i,j] += 1./r_link # Boxsize normalize to 1
104-
#if outbound: numbounds+=1
105-
106-
edge_attr = np.concatenate([diff, np.linalg.norm(diff, axis=1, keepdims=True)], axis=1)
107-
#print(edge_attr[:,3].min(), edge_attr[:,3].max())
108-
#print(diff.shape[0], numbounds)
109-
"""
110-
11193
return edge_index, edge_attr
11294

113-
######################################################################################
114-
# This routine reads the galaxies from a simulation and
115-
# root ------> folder containing all simulations with their galaxy catalogues
116-
# sim -------> 'IllustrisTNG' or 'SIMBA'
117-
# suite -----> 'LH' or 'CV'
118-
# number ----> number of the simulation
119-
# snapnum ---> snapshot number (choose depending of the desired redshift)
120-
# BoxSize ---> size of the simulation box in Mpc/h
121-
# Nstar_th -----> galaxies need to contain at least Nstar_th stars
122-
# k ---------> number of neighbors
123-
# param_file -> file with the value of the cosmological + astrophysical parameters
124-
def sim_graph(simnumber,param_file,hparams):
12595

96+
# Routine to create a cosmic graph from a galaxy catalogue
97+
# simnumber: number of simulation
98+
# param_file: file with the value of the cosmological + astrophysical parameters
99+
# hparams: hyperparameters class
100+
def sim_graph(simnumber, param_file, hparams):
101+
102+
# Get some hyperparameters
126103
simsuite,simset,r_link,only_positions,outmode,pred_params = hparams.simsuite,hparams.simset,hparams.r_link,hparams.only_positions,hparams.outmode,hparams.pred_params
127104

128-
# get the name of the galaxy catalogue
105+
# Name of the galaxy catalogue
129106
simpath = simpathroot + simsuite + "/"+simset+"_"
130107
catalogue = simpath + str(simnumber)+"/fof_subhalo_tab_0"+hparams.snap+".hdf5"
131108

132-
# read the catalogue
109+
# Read the catalogue
133110
f = h5py.File(catalogue, 'r')
134111
pos = f['/Subhalo/SubhaloPos'][:]/boxsize
135112
Mstar = f['/Subhalo/SubhaloMassType'][:,4] #Msun/h
136-
SubhaloVel = f["Subhalo/SubhaloVel"][:]
137113
Rstar = f["Subhalo/SubhaloHalfmassRadType"][:,4]
138114
Metal = f["Subhalo/SubhaloStarMetallicity"][:]
139115
Vmax = f["Subhalo/SubhaloVmax"][:]
140116
Nstar = f['/Subhalo/SubhaloLenType'][:,4] #number of stars
141117
f.close()
142118

143-
# some simulations are slightly outside the box
119+
# Some positions lie slightly outside the box; wrap them back inside
144120
pos[np.where(pos<0.0)]+=1.0
145121
pos[np.where(pos>1.0)]-=1.0
146122

147-
# select only galaxies with more than 10 star particles
123+
# Select only galaxies with more than Nstar_th star particles
148124
indexes = np.where(Nstar>Nstar_th)[0]
149125
pos = pos[indexes]
150126
Mstar = Mstar[indexes]
151-
SubhaloVel = SubhaloVel[indexes]
152127
Rstar = Rstar[indexes]
153128
Metal = Metal[indexes]
154129
Vmax = Vmax[indexes]
155130

156131
# Get the output to be predicted by the GNN, either the cosmo parameters or the power spectrum
157132
if outmode=="cosmo":
158-
# read the value of the cosmological & astrophysical parameters
133+
# Read the value of the cosmological & astrophysical parameters
159134
paramsfile = np.loadtxt(param_file, dtype=str)
160135
params = np.array(paramsfile[simnumber,1:-1],dtype=np.float32)
161136
params = normalize_params(params)
162-
params = params[:pred_params]
137+
params = params[:pred_params] # Consider only the first parameters, up to pred_params
163138
y = np.reshape(params, (1,params.shape[0]))
164139

140+
# Read the power spectra
165141
elif outmode=="ps":
166142

167143
ps = np.load(param_file)
@@ -170,97 +146,39 @@ def sim_graph(simnumber,param_file,hparams):
170146
#ps = normalize_ps(ps)
171147
y = np.reshape(ps, (1,ps_size))
172148

173-
174-
"""
175-
# compute the number of pairs
176-
nodes = pos.shape[0]
177-
u = np.zeros((1,2), dtype=np.float32)
178-
u[0,0] = np.log10(np.sum(Mstar))
179-
u[0,1] = np.log10(nodes)
180-
"""
149+
# Number of galaxies as global feature
181150
u = np.log10(pos.shape[0]).reshape(1,1)
182151

183152
Mstar = np.log10(1.+ Mstar)
184-
#SubhaloVel = np.log10(1.+SubhaloVel)
185-
SubhaloVel/=100.
186153
Rstar = np.log10(1.+ Rstar)
187154
Metal = np.log10(1.+ Metal)
188155
Vmax = np.log10(1. + Vmax)
156+
157+
# Node features
189158
tab = np.column_stack((Mstar, Rstar, Metal, Vmax))
190-
#tab = Vmax.reshape(-1,1)
159+
#tab = Vmax.reshape(-1,1) # For using only Vmax
160+
x = torch.tensor(tab, dtype=torch.float32)
191161

162+
# Use loops only when node features are considered (no loops if only positions are used)
192163
if only_positions:
193-
#u = np.zeros((1,2), dtype=np.float32) # not used
194-
tab = np.zeros_like(pos[:,:1]) # not really used
164+
tab = np.zeros_like(pos[:,:1]) # Node features not really used
195165
use_loops = False
196166
else:
197-
use_loops = True#"""
167+
use_loops = True
198168

199-
#use_loops = False
200-
201-
x = torch.tensor(tab, dtype=torch.float32)
202-
203-
#use_loops = False
169+
# Get edges and edge features
204170
edge_index, edge_attr = get_edges(pos, r_link, use_loops)
205-
#edge_index = get_edges(pos, r_link)
206-
#edge_index = None
207171

208-
# get the graph
172+
# Construct the graph
209173
graph = Data(x=x,
210174
y=torch.tensor(y, dtype=torch.float32),
211175
u=torch.tensor(u, dtype=torch.float32),
212176
edge_index=torch.tensor(edge_index, dtype=torch.long),
213177
edge_attr=torch.tensor(edge_attr, dtype=torch.float32))
214178

215179
return graph
216-
######################################################################################
217-
"""
218-
######################################################################################
219-
# This routine creates the dataset for the considered mode
220-
# mode -------------> 'train', 'valid', 'test' or 'all'
221-
# seed -------------> random seed to split simulations among train/valid/test
222-
# sims -------------> total number of simulations
223-
# root -------------> folder containing all simulations with their galaxy catalogues
224-
# sim --------------> 'IllustrisTNG' or 'SIMBA'
225-
# suite ------------> 'LH' or 'CV'
226-
# number -----------> number of the simulation
227-
# snapnum ----------> snapshot number (choose depending of the desired redshift)
228-
# BoxSize ----------> size of the simulation box in Mpc/h
229-
# Nstar_th --> galaxies need to contain at least Nstar_th stars
230-
# k ----------------> number of neighbors
231-
# param_file -------> file with the value of the cosmo & astro parameters
232-
# batch_size -------> batch size
233-
# num_workers ------> number of workers to load the data
234-
# shuffle ----------> whether randomly shuffle the data in the data loader
235-
def create_dataset(mode, seed, sims, root, sim, suite, snapnum, BoxSize,
236-
Nstar_th, k, param_file, batch_size, num_workers=1,
237-
shuffle=True):
238-
239-
240-
241-
# get the offset and size of the considered mode
242-
if mode=='train': offset, size = int(0.0*sims), int(0.8*sims)
243-
elif mode=='valid': offset, size = int(0.8*sims), int(0.1*sims)
244-
elif mode=='test': offset, size = int(0.9*sims), int(0.1*sims)
245-
elif mode=='all': offset, size = int(0.0*sims), int(1.0*sims)
246-
else: raise Exception('wrong mode!')
247-
248-
# randomly shuffle the simulations. Instead of 0 1 2 3...999 have a
249-
# random permutation. E.g. 5 9 0 29...342
250-
np.random.seed(seed)
251-
numbers = np.arange(sims) #shuffle sims not maps
252-
np.random.shuffle(numbers)
253-
numbers = numbers[offset:offset+size] #select indexes of mode
254-
255-
# get the dataset
256-
dataset = []
257-
for i in numbers:
258-
dataset.append(sim_graph(root,sim,suite,i,snapnum,BoxSize,
259-
Nstar_th,k,param_file))
260180

261-
return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
262-
num_workers=num_workers)
263-
"""
181+
264182
# Split training and validation sets
265183
def split_datasets(dataset):
266184

@@ -283,13 +201,9 @@ def split_datasets(dataset):
283201
######################################################################################
284202

285203
# Main routine to load data and create the dataset
286-
# simsuite: simulation suite, either "IllustrisTNG" or "SIMBA"
287-
# simset: set of simulations:
288-
# CV: Use simulations with fiducial cosmological and astrophysical parameters, but different random seeds (27 simulations total)
289-
# LH: Use simulations over latin-hypercube, varying over cosmological and astrophysical parameters, and different random seeds (1000 simulations total)
290-
# n_sims: number of simulations, maximum 27 for CV and 1000 for LH
291204
def create_dataset(hparams):
292205

206+
# Target file depending on the task: inferring cosmo parameters or predicting power spectrum
293207
if hparams.outmode == "cosmo":
294208
param_file = "/projects/QUIJOTE/CAMELS/Sims/CosmoAstroSeed_params_"+hparams.simsuite+".txt"
295209
elif hparams.outmode == "ps":
@@ -311,8 +225,8 @@ def create_dataset(hparams):
311225
# Add other snapshots from other redshifts
312226
# Snapshot redshift
313227
# 004: z=3, 010: z=2, 014: z=1.5, 018: z=1, 024: z=0.5, 033: z=0
314-
for snap in [24,18,14,10]:
315-
#for snap in [18,10]:
228+
#for snap in [24,18,14,10]:
229+
for snap in [18,10]:
316230

317231
hparams.snap = str(snap)
318232

0 commit comments

Comments
 (0)