1
+ #----------------------------------------------------
2
+ # Routine for loading the CAMELS galaxy catalogues
3
+ # Author: Pablo Villanueva Domingo
4
+ # Last update: 4/22
5
+ #----------------------------------------------------
6
+
1
7
import h5py
2
8
from torch_geometric .data import Data , DataLoader
3
9
from Source .constants import *
6
12
7
13
Nstar_th = 20 # Minimum number of stellar particles required to consider a galaxy
8
14
15
+ # Normalize CAMELS parameters
9
16
def normalize_params(params, minimum=None, maximum=None):
    """Normalize CAMELS cosmological + astrophysical parameters to [0, 1].

    Applies a linear (min-max) rescaling of each parameter. By default the
    bounds are the CAMELS latin-hypercube ranges for the six parameters
    (Omega_m, sigma_8, A_SN1, A_AGN1, A_SN2, A_AGN2); they can be overridden
    for other parameter sets.

    Args:
        params: array of parameter values (last axis of length 6 by default).
        minimum: lower bounds per parameter; defaults to the CAMELS LH minima.
        maximum: upper bounds per parameter; defaults to the CAMELS LH maxima.

    Returns:
        Array of the same shape with each parameter mapped to [0, 1]
        (values outside the bounds map outside that interval).
    """
    if minimum is None:
        minimum = np.array([0.1, 0.6, 0.25, 0.25, 0.5, 0.5])
    if maximum is None:
        maximum = np.array([0.5, 1.0, 4.00, 4.00, 2.0, 2.0])
    params = (params - minimum) / (maximum - minimum)
    return params
15
22
23
+ # Normalize power spectrum
16
24
def normalize_ps(ps):
    """Standardize power spectra to zero mean and unit variance per bin.

    The mean and standard deviation are computed across the sample axis
    (axis 0), i.e. independently for every wavenumber bin.

    Args:
        ps: 2D array of power spectra, shape (n_samples, n_bins).

    Returns:
        Array of the same shape with each bin standardized.
    """
    bin_mean = ps.mean(axis=0)
    bin_std = ps.std(axis=0)
    return (ps - bin_mean) / bin_std
20
28
29
+ # Compute KDTree and get edges and edge features
21
30
def get_edges (pos , r_link , use_loops ):
22
31
32
+ # 1. Get edges
33
+
23
34
# Create the KDTree and look for pairs within a distance r_link
24
35
# Boxsize normalize to 1
25
36
kd_tree = SS .KDTree (pos , leafsize = 16 , boxsize = 1.0001 )
@@ -37,35 +48,36 @@ def get_edges(pos, r_link, use_loops):
37
48
edge_index = edge_index .reshape ((2 ,- 1 ))
38
49
num_pairs = edge_index .shape [1 ]
39
50
40
- # Edge attributes
51
+ # 2. Get edge attributes
52
+
41
53
row , col = edge_index
42
54
diff = pos [row ]- pos [col ]
43
55
44
- # Correct boundaries in distances
56
+ # Take into account periodic boundary conditions, correcting the distances
45
57
for i , pos_i in enumerate (diff ):
46
- #outbound=False
47
58
for j , coord in enumerate (pos_i ):
48
59
if coord > r_link :
49
- #outbound=True
50
60
diff [i ,j ] -= 1. # Boxsize normalize to 1
51
61
elif - coord > r_link :
52
- #outbound=True
53
62
diff [i ,j ] += 1. # Boxsize normalize to 1
54
- #if outbound: numbounds+=1
55
63
64
+ # Get translational and rotational invariant features
65
+ # Distance
56
66
dist = np .linalg .norm (diff , axis = 1 )
67
+ # Centroid of galaxy catalogue
57
68
centroid = np .mean (pos ,axis = 0 )
69
+ # Unit vectors of node, neighbor and difference vector
58
70
unitrow = (pos [row ]- centroid )/ np .linalg .norm ((pos [row ]- centroid ), axis = 1 ).reshape (- 1 ,1 )
59
71
unitcol = (pos [col ]- centroid )/ np .linalg .norm ((pos [col ]- centroid ), axis = 1 ).reshape (- 1 ,1 )
60
72
unitdiff = diff / dist .reshape (- 1 ,1 )
73
+ # Dot products between unit vectors
61
74
cos1 = np .array ([np .dot (unitrow [i ,:].T ,unitcol [i ,:]) for i in range (num_pairs )])
62
75
cos2 = np .array ([np .dot (unitrow [i ,:].T ,unitdiff [i ,:]) for i in range (num_pairs )])
63
-
64
- #print(edge_index.shape, cos1.shape, cos2.shape, dist.shape)
76
+ # Normalize distance by linking radius
65
77
dist /= r_link
66
- edge_attr = np .concatenate ([dist .reshape (- 1 ,1 ), cos1 .reshape (- 1 ,1 ), cos2 .reshape (- 1 ,1 )], axis = 1 )
67
78
68
- #print(pos.shape, edge_index.shape, edge_attr.shape)
79
+ # Concatenate to get all edge attributes
80
+ edge_attr = np .concatenate ([dist .reshape (- 1 ,1 ), cos1 .reshape (- 1 ,1 ), cos2 .reshape (- 1 ,1 )], axis = 1 )
69
81
70
82
# Add loops
71
83
if use_loops :
@@ -78,90 +90,54 @@ def get_edges(pos, r_link, use_loops):
78
90
edge_attr = np .append (edge_attr , atrloops , 0 )
79
91
edge_index = edge_index .astype (int )
80
92
81
- #print(pos.shape, edge_index.shape, edge_attr.shape)
82
-
83
-
84
-
85
- #print(edge_index.shape, edge_attr.shape)
86
-
87
-
88
- """
89
- diff = (pos[row]-pos[col])/r_link
90
-
91
- #print(diff.shape, edge_index.shape, pos.shape)
92
- #numbounds = 0
93
-
94
- # Correct boundaries in distances
95
- for i, pos_i in enumerate(diff):
96
- #outbound=False
97
- for j, coord in enumerate(pos_i):
98
- if coord > 1.:
99
- #outbound=True
100
- diff[i,j] -= 1./r_link # Boxsize normalize to 1
101
- elif -coord > 1.:
102
- #outbound=True
103
- diff[i,j] += 1./r_link # Boxsize normalize to 1
104
- #if outbound: numbounds+=1
105
-
106
- edge_attr = np.concatenate([diff, np.linalg.norm(diff, axis=1, keepdims=True)], axis=1)
107
- #print(edge_attr[:,3].min(), edge_attr[:,3].max())
108
- #print(diff.shape[0], numbounds)
109
- """
110
-
111
93
return edge_index , edge_attr
112
94
113
- ######################################################################################
114
- # This routine reads the galaxies from a simulation and
115
- # root ------> folder containing all simulations with their galaxy catalogues
116
- # sim -------> 'IllustrisTNG' or 'SIMBA'
117
- # suite -----> 'LH' or 'CV'
118
- # number ----> number of the simulation
119
- # snapnum ---> snapshot number (choose depending of the desired redshift)
120
- # BoxSize ---> size of the simulation box in Mpc/h
121
- # Nstar_th -----> galaxies need to contain at least Nstar_th stars
122
- # k ---------> number of neighbors
123
- # param_file -> file with the value of the cosmological + astrophysical parameters
124
- def sim_graph (simnumber ,param_file ,hparams ):
125
95
96
+ # Routine to create a cosmic graph from a galaxy catalogue
97
+ # simnumber: number of simulation
98
+ # param_file: file with the value of the cosmological + astrophysical parameters
99
+ # hparams: hyperparameters class
100
+ def sim_graph (simnumber , param_file , hparams ):
101
+
102
+ # Get some hyperparameters
126
103
simsuite ,simset ,r_link ,only_positions ,outmode ,pred_params = hparams .simsuite ,hparams .simset ,hparams .r_link ,hparams .only_positions ,hparams .outmode ,hparams .pred_params
127
104
128
- # get the name of the galaxy catalogue
105
+ # Name of the galaxy catalogue
129
106
simpath = simpathroot + simsuite + "/" + simset + "_"
130
107
catalogue = simpath + str (simnumber )+ "/fof_subhalo_tab_0" + hparams .snap + ".hdf5"
131
108
132
- # read the catalogue
109
+ # Read the catalogue
133
110
f = h5py .File (catalogue , 'r' )
134
111
pos = f ['/Subhalo/SubhaloPos' ][:]/ boxsize
135
112
Mstar = f ['/Subhalo/SubhaloMassType' ][:,4 ] #Msun/h
136
- SubhaloVel = f ["Subhalo/SubhaloVel" ][:]
137
113
Rstar = f ["Subhalo/SubhaloHalfmassRadType" ][:,4 ]
138
114
Metal = f ["Subhalo/SubhaloStarMetallicity" ][:]
139
115
Vmax = f ["Subhalo/SubhaloVmax" ][:]
140
116
Nstar = f ['/Subhalo/SubhaloLenType' ][:,4 ] #number of stars
141
117
f .close ()
142
118
143
- # some simulations are slightly outside the box
119
+ # Some simulations are slightly outside the box, correct it
144
120
pos [np .where (pos < 0.0 )]+= 1.0
145
121
pos [np .where (pos > 1.0 )]-= 1.0
146
122
147
- # select only galaxies with more than 10 star particles
123
+ # Select only galaxies with more than Nstar_th star particles
148
124
indexes = np .where (Nstar > Nstar_th )[0 ]
149
125
pos = pos [indexes ]
150
126
Mstar = Mstar [indexes ]
151
- SubhaloVel = SubhaloVel [indexes ]
152
127
Rstar = Rstar [indexes ]
153
128
Metal = Metal [indexes ]
154
129
Vmax = Vmax [indexes ]
155
130
156
131
# Get the output to be predicted by the GNN, either the cosmo parameters or the power spectrum
157
132
if outmode == "cosmo" :
158
- # read the value of the cosmological & astrophysical parameters
133
+ # Read the value of the cosmological & astrophysical parameters
159
134
paramsfile = np .loadtxt (param_file , dtype = str )
160
135
params = np .array (paramsfile [simnumber ,1 :- 1 ],dtype = np .float32 )
161
136
params = normalize_params (params )
162
- params = params [:pred_params ]
137
+ params = params [:pred_params ] # Consider only the first parameters, up to pred_params
163
138
y = np .reshape (params , (1 ,params .shape [0 ]))
164
139
140
+ # Read the power spectra
165
141
elif outmode == "ps" :
166
142
167
143
ps = np .load (param_file )
@@ -170,97 +146,39 @@ def sim_graph(simnumber,param_file,hparams):
170
146
#ps = normalize_ps(ps)
171
147
y = np .reshape (ps , (1 ,ps_size ))
172
148
173
-
174
- """
175
- # compute the number of pairs
176
- nodes = pos.shape[0]
177
- u = np.zeros((1,2), dtype=np.float32)
178
- u[0,0] = np.log10(np.sum(Mstar))
179
- u[0,1] = np.log10(nodes)
180
- """
149
+ # Number of galaxies as global feature
181
150
u = np .log10 (pos .shape [0 ]).reshape (1 ,1 )
182
151
183
152
Mstar = np .log10 (1. + Mstar )
184
- #SubhaloVel = np.log10(1.+SubhaloVel)
185
- SubhaloVel /= 100.
186
153
Rstar = np .log10 (1. + Rstar )
187
154
Metal = np .log10 (1. + Metal )
188
155
Vmax = np .log10 (1. + Vmax )
156
+
157
+ # Node features
189
158
tab = np .column_stack ((Mstar , Rstar , Metal , Vmax ))
190
- #tab = Vmax.reshape(-1,1)
159
+ #tab = Vmax.reshape(-1,1) # For using only Vmax
160
+ x = torch .tensor (tab , dtype = torch .float32 )
191
161
162
+ # Use loops if node features are considered only
192
163
if only_positions :
193
- #u = np.zeros((1,2), dtype=np.float32) # not used
194
- tab = np .zeros_like (pos [:,:1 ]) # not really used
164
+ tab = np .zeros_like (pos [:,:1 ]) # Node features not really used
195
165
use_loops = False
196
166
else :
197
- use_loops = True #"""
167
+ use_loops = True
198
168
199
- #use_loops = False
200
-
201
- x = torch .tensor (tab , dtype = torch .float32 )
202
-
203
- #use_loops = False
169
+ # Get edges and edge features
204
170
edge_index , edge_attr = get_edges (pos , r_link , use_loops )
205
- #edge_index = get_edges(pos, r_link)
206
- #edge_index = None
207
171
208
- # get the graph
172
+ # Construct the graph
209
173
graph = Data (x = x ,
210
174
y = torch .tensor (y , dtype = torch .float32 ),
211
175
u = torch .tensor (u , dtype = torch .float32 ),
212
176
edge_index = torch .tensor (edge_index , dtype = torch .long ),
213
177
edge_attr = torch .tensor (edge_attr , dtype = torch .float32 ))
214
178
215
179
return graph
216
- ######################################################################################
217
- """
218
- ######################################################################################
219
- # This routine creates the dataset for the considered mode
220
- # mode -------------> 'train', 'valid', 'test' or 'all'
221
- # seed -------------> random seed to split simulations among train/valid/test
222
- # sims -------------> total number of simulations
223
- # root -------------> folder containing all simulations with their galaxy catalogues
224
- # sim --------------> 'IllustrisTNG' or 'SIMBA'
225
- # suite ------------> 'LH' or 'CV'
226
- # number -----------> number of the simulation
227
- # snapnum ----------> snapshot number (choose depending of the desired redshift)
228
- # BoxSize ----------> size of the simulation box in Mpc/h
229
- # Nstar_th --> galaxies need to contain at least Nstar_th stars
230
- # k ----------------> number of neighbors
231
- # param_file -------> file with the value of the cosmo & astro parameters
232
- # batch_size -------> batch size
233
- # num_workers ------> number of workers to load the data
234
- # shuffle ----------> whether randomly shuffle the data in the data loader
235
- def create_dataset(mode, seed, sims, root, sim, suite, snapnum, BoxSize,
236
- Nstar_th, k, param_file, batch_size, num_workers=1,
237
- shuffle=True):
238
-
239
-
240
-
241
- # get the offset and size of the considered mode
242
- if mode=='train': offset, size = int(0.0*sims), int(0.8*sims)
243
- elif mode=='valid': offset, size = int(0.8*sims), int(0.1*sims)
244
- elif mode=='test': offset, size = int(0.9*sims), int(0.1*sims)
245
- elif mode=='all': offset, size = int(0.0*sims), int(1.0*sims)
246
- else: raise Exception('wrong mode!')
247
-
248
- # randomly shuffle the simulations. Instead of 0 1 2 3...999 have a
249
- # random permutation. E.g. 5 9 0 29...342
250
- np.random.seed(seed)
251
- numbers = np.arange(sims) #shuffle sims not maps
252
- np.random.shuffle(numbers)
253
- numbers = numbers[offset:offset+size] #select indexes of mode
254
-
255
- # get the dataset
256
- dataset = []
257
- for i in numbers:
258
- dataset.append(sim_graph(root,sim,suite,i,snapnum,BoxSize,
259
- Nstar_th,k,param_file))
260
180
261
- return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
262
- num_workers=num_workers)
263
- """
181
+
264
182
# Split training and validation sets
265
183
def split_datasets (dataset ):
266
184
@@ -283,13 +201,9 @@ def split_datasets(dataset):
283
201
######################################################################################
284
202
285
203
# Main routine to load data and create the dataset
286
- # simsuite: simulation suite, either "IllustrisTNG" or "SIMBA"
287
- # simset: set of simulations:
288
- # CV: Use simulations with fiducial cosmological and astrophysical parameters, but different random seeds (27 simulations total)
289
- # LH: Use simulations over latin-hypercube, varying over cosmological and astrophysical parameters, and different random seeds (1000 simulations total)
290
- # n_sims: number of simulations, maximum 27 for CV and 1000 for LH
291
204
def create_dataset (hparams ):
292
205
206
+ # Target file depending on the task: inferring cosmo parameters or predicting power spectrum
293
207
if hparams .outmode == "cosmo" :
294
208
param_file = "/projects/QUIJOTE/CAMELS/Sims/CosmoAstroSeed_params_" + hparams .simsuite + ".txt"
295
209
elif hparams .outmode == "ps" :
@@ -311,8 +225,8 @@ def create_dataset(hparams):
311
225
# Add other snapshots from other redshifts
312
226
# Snapshot redshift
313
227
# 004: z=3, 010: z=2, 014: z=1.5, 018: z=1, 024: z=0.5, 033: z=0
314
- for snap in [24 ,18 ,14 ,10 ]:
315
- # for snap in [18,10]:
228
+ # for snap in [24,18,14,10]:
229
+ for snap in [18 ,10 ]:
316
230
317
231
hparams .snap = str (snap )
318
232
0 commit comments