Skip to content

Commit

Permalink
WIP: optimise RAM usage #2
Browse files: browse the repository at this point in the history
  • Loading branch information
Jan Jurgen Griesfeller committed Aug 24, 2023
1 parent 7e3d518 commit 71668d7
Showing 1 changed file with 34 additions and 11 deletions.
45 changes: 34 additions & 11 deletions pyaerocom/io/read_airnow.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,15 +353,34 @@ def _read_files(self, files, vars_to_retrieve):
arrs = []
# 1 for pandas, 0 for Python
read_flag = 1
unique_stat_ids = None
for i in tqdm(range(len(files))):
fp = files[i]
# print(fp)
if read_flag == 1:
filedata = self._read_file(fp)
for i, filevar in enumerate(file_vars_to_retrieve):
try:
arrs.append(filedata[filedata["variable"] == filevar].values)
except:
pass
# try:
arrs.append(filedata[filedata["variable"] == filevar].values)
if unique_stat_ids is None:
unique_stat_ids = np.unique(
(arrs[-1][:, self.FILE_COL_NAMES.index("station_id")]).astype(str)
)
else:
try:
unique_stat_ids = np.union1d(
unique_stat_ids,
np.unique(
(arrs[-1][:, self.FILE_COL_NAMES.index("station_id")]).astype(
str
)
),
)
except (ValueError, TypeError):
print(arrs[-1][:, self.FILE_COL_NAMES.index("station_id")])
raise DataRetrievalError(
f"file {fp}: error in creating unique stationlist"
)
else:
filedata = self.read_file(fp, vars_to_retrieve=vars_to_retrieve)
for i, var in enumerate(vars_to_retrieve):
Expand All @@ -383,9 +402,9 @@ def _read_files(self, files, vars_to_retrieve):
# arrs.append(vardata)
if len(arrs) == 0:
raise DataRetrievalError("None of the input variables could be found in input list")
return self._filedata_to_statlist(arrs, vars_to_retrieve)
return self._filedata_to_statlist(arrs, vars_to_retrieve, unique_stat_ids=unique_stat_ids)

def _filedata_to_statlist(self, arrs, vars_to_retrieve):
def _filedata_to_statlist(self, arrs, vars_to_retrieve, unique_stat_ids=None):
"""
Convert loaded filedata into list of StationData objects
Expand Down Expand Up @@ -426,11 +445,15 @@ def _filedata_to_statlist(self, arrs, vars_to_retrieve):
subset = data[mask]
dtime_subset = dtime[mask]
# not all stations seem to provide the station id as a string...
try:
statlist = np.unique(subset[:, statcol])
except TypeError:
tmp_str = [subset[:, statcol][x] for x in range(len(subset[:, statcol]))]
statlist = np.unique(tmp_str)
if unique_stat_ids is None:
try:
statlist = np.unique((subset[:, statcol]).astype(str))
except TypeError:
raise DataRetrievalError("error in creating an unique station list")
# tmp_str = [subset[:, statcol][x] for x in range(len(subset[:, statcol]))]
# statlist = np.unique(tmp_str)
else:
statlist = unique_stat_ids
for stat_id in tqdm(statlist, desc=var):
if not stat_id in stat_ids:
continue
Expand Down

0 comments on commit 71668d7

Please sign in to comment.