Fix statemonitor #279

Merged (8 commits, Mar 31, 2022)
11 changes: 8 additions & 3 deletions brian2cuda/templates/common_group.cu
@@ -109,6 +109,8 @@ __launch_bounds__(1024, {{sm_multiplier}})
_run_kernel_{{codeobj_name}}(
int _N,
int THREADS_PER_BLOCK,
{% block extra_kernel_parameters %}
{% endblock %}
///// KERNEL_PARAMETERS /////
%KERNEL_PARAMETERS%
)
@@ -117,8 +119,11 @@ _run_kernel_{{codeobj_name}}(

int tid = threadIdx.x;
int bid = blockIdx.x;

{% block indices %}
int _idx = bid * THREADS_PER_BLOCK + tid;
int _vectorisation_idx = _idx;
{% endblock %}

///// KERNEL_CONSTANTS /////
%KERNEL_CONSTANTS%
@@ -131,12 +136,10 @@ _run_kernel_{{codeobj_name}}(
{% block additional_variables %}
{% endblock %}

{% block num_thread_check %}
if(_idx >= _N)
if(_vectorisation_idx >= _N)
{
return;
}
{% endblock %}

{% block kernel_maincode %}

@@ -300,6 +303,8 @@ void _run_{{codeobj_name}}()
_run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
_N,
num_threads,
{% block extra_host_parameters %}
{% endblock %}
///// HOST_PARAMETERS /////
%HOST_PARAMETERS%
);
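The three new blocks (extra_kernel_parameters, extra_host_parameters, indices) let a derived template pass extra arguments to the kernel and redefine the thread-to-element mapping without duplicating the whole kernel skeleton. A minimal sketch of the override pattern (illustration only, not part of this diff; the statemonitor.cu template below is the actual use):

{# Sketch: a hypothetical derived template using the new common_group.cu blocks #}
{% extends 'common_group.cu' %}

{% block extra_kernel_parameters %}
int current_iteration,                  {# extra kernel argument #}
{% endblock %}

{% block extra_host_parameters %}
current_iteration - start_offset,       {# matching value at the launch site #}
{% endblock %}

{% block indices %}
{# map each thread to the element it should process instead of a plain linear index #}
int _vectorisation_idx = bid * THREADS_PER_BLOCK + tid;
int _idx = {{_indices}}[_vectorisation_idx];
{% endblock %}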
14 changes: 12 additions & 2 deletions brian2cuda/templates/objects.cu
@@ -353,10 +353,16 @@ void _write_arrays()
using namespace brian;

{% for var, varname in array_specs | dictsort(by='value') %}
{% if not (var in dynamic_array_specs or var in dynamic_array_2d_specs or var in static_array_specs) %}
{% if not (var in dynamic_array_specs
or var in dynamic_array_2d_specs
or var in static_array_specs
) %}
{# Don't copy StateMonitor's N variables, which are modified on host only #}
{% if not (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 'N') %}
CUDA_SAFE_CALL(
cudaMemcpy({{varname}}, dev{{varname}}, sizeof({{c_data_type(var.dtype)}})*_num_{{varname}}, cudaMemcpyDeviceToHost)
);
{% endif %}
ofstream outfile_{{varname}};
outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out);
if(outfile_{{varname}}.is_open())
@@ -371,7 +377,11 @@ void _write_arrays()
{% endfor %}

{% for var, varname in dynamic_array_specs | dictsort(by='value') %}
{% if not var in multisynaptic_idx_vars and not var.name in ['delay', '_synaptic_pre', '_synaptic_post'] %}
{# TODO: pass isinstance to Jinja template to make it available here #}
{% if not (var in multisynaptic_idx_vars
or var.name in ['delay', '_synaptic_pre', '_synaptic_post']
or (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 't')
) %}
{{varname}} = dev{{varname}};
{% endif %}
ofstream outfile_{{varname}};
142 changes: 53 additions & 89 deletions brian2cuda/templates/statemonitor.cu
@@ -1,60 +1,50 @@
{# USES_VARIABLES { t, N } #}
{# USES_VARIABLES { t, _indices, N } #}
{# WRITES_TO_READ_ONLY_VARIABLES { t, N } #}
{% extends 'common_group.cu' %}

{% block define_N %}
{% endblock %}

{# remove this once we have properly defined num_threads, num_blocks here... #}
{% block occupancy %}
{% endblock %}
{% block update_occupancy %}
{% endblock %}
{% block kernel_info %}
{% endblock %}

{% block prepare_kernel_inner %}
{# We are using block modify_kernel_dimensions for additional kernel preparation #}
{% block modify_kernel_dimensions %}
{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
{% set _recorded = get_array_name(var, access_data=False) %}
addresses_monitor_{{_recorded}}.clear();
{% endfor %}
for(int i = 0; i < _num__array_{{owner.name}}__indices; i++)
{
{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
{{_recorded}}[i].resize(_numt + num_iterations - current_iteration);
{% set _recorded = get_array_name(var, access_data=False) %}
{{_recorded}}[i].resize(_numt_host + num_iterations - current_iteration);
addresses_monitor_{{_recorded}}.push_back(thrust::raw_pointer_cast(&{{_recorded}}[i][0]));
{% endfor %}
}
// Print a warning when the monitor is not going to work (#50)
if (_num__array_{{owner.name}}__indices > 1024)
{
printf("ERROR in {{owner.name}}: Too many neurons recorded. Due to a bug (brian-team/brian2cuda#50), "
"currently only as many neurons can be recorded as threads can be called from a single block!\n");
}
{% endblock prepare_kernel_inner %}
{% endblock modify_kernel_dimensions %}

{% block host_maincode %}
// TODO: this pushes a new value to the device each time step? Looks
// inefficient, can we keep the t values on the host instead? Do we need them
// on the device?
dev_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);
// NOTE: We are using _N as the number of recorded indices here (the relevant size for
// parallelization). This is different from `StateMonitor.N` in Python, which refers to
// the number of recorded time steps (while `StateMonitor.n_indices` gives the number of
// recorded indices).
const int _N = _num_indices;

// We are using an extra variable because HOST_CONSTANTS uses the device vector, which
// is not used (TODO: Fix this in HOST_CONSTANTS instead of this hack here...)
const int _numt_host = _dynamic_array_{{owner.name}}_t.size();

// We push t only on host and don't make a device->host copy in write_arrays()
_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);

// Update size variables for Python side indexing to work
// (Note: Need to update device variable which will be copied to host in write_arrays())
// TODO: This is one cudaMemcpy per time step, this should be done only once in the last
// time step, fix when fixing the statemonitor (currently only works for <=1024 threads)
_array_{{owner.name}}_N[0] += 1;
CUDA_SAFE_CALL(
cudaMemcpy(dev_array_{{owner.name}}_N, _array_{{owner.name}}_N, sizeof(int32_t),
cudaMemcpyHostToDevice)
);

int num_iterations = {{owner.clock.name}}.i_end;
int current_iteration = {{owner.clock.name}}.timestep[0];
static int start_offset = current_iteration - _numt;
static int start_offset = current_iteration - _numt_host;
{% endblock host_maincode %}

{% block kernel_call %}

{% block extra_kernel_call %}
// If the StateMonitor is run outside the MagicNetwork, we need to resize it.
// Happens e.g. when StateMonitor.record_single_timestep() is called.
if(current_iteration >= num_iterations)
@@ -63,75 +53,49 @@ if(current_iteration >= num_iterations)
{
{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
{{_recorded}}[i].resize(_numt + 1);
{{_recorded}}[i].resize(_numt_host + 1);
addresses_monitor_{{_recorded}}[i] = thrust::raw_pointer_cast(&{{_recorded}}[i][0]);
{% endfor %}
}
}

if (_num__array_{{owner.name}}__indices > 0)
// TODO we get invalid launch configuration if this is 0, which happens e.g. for StateMonitor(..., variables=[])
if (_num__array_{{owner.name}}__indices > 0)
{
_run_kernel_{{codeobj_name}}<<<1, _num__array_{{owner.name}}__indices>>>(
_num__array_{{owner.name}}__indices,
dev_array_{{owner.name}}__indices,
current_iteration - start_offset,
{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
{% endfor %}
///// HOST_PARAMETERS /////
%HOST_PARAMETERS%
);
{% endblock extra_kernel_call %}


CUDA_CHECK_ERROR("_run_kernel_{{codeobj_name}}");
{% block extra_kernel_call_post %}
{# Close conditional from block extra_kernel_call #}
}
{% endblock kernel_call %}

{% block kernel %}
__global__ void
{% if launch_bounds %}
__launch_bounds__(1024, {{sm_multiplier}})
{% endif %}
_run_kernel_{{codeobj_name}}(
int _num_indices,
int32_t* indices,
int current_iteration,
{% for varname, var in _recorded_variables | dictsort %}
{{c_data_type(var.dtype)}}** monitor_{{varname}},
{% endfor %}
///// KERNEL_PARAMETERS /////
%KERNEL_PARAMETERS%
)
{
using namespace brian;
{% endblock %}

int tid = threadIdx.x;
if(tid > _num_indices)
{
return;
}
int32_t _idx = indices[tid];

///// KERNEL_CONSTANTS /////
%KERNEL_CONSTANTS%
{% block indices %}
int _vectorisation_idx = bid * THREADS_PER_BLOCK + tid;
int _idx = {{_indices}}[_vectorisation_idx];
{% endblock %}

///// kernel_lines /////
{{kernel_lines|autoindent}}

///// scalar_code /////
{{scalar_code|autoindent}}
{% block extra_vector_code %}
{% for varname, var in _recorded_variables | dictsort %}
monitor_{{varname}}[_vectorisation_idx][current_iteration] = _to_record_{{varname}};
{% endfor %}
{% endblock extra_vector_code %}

// need different scope here since scalar_code and vector_code can
// declare the same variables
{
///// vector_code /////
{{vector_code|autoindent}}

{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
monitor_{{varname}}[tid][current_iteration] = _to_record_{{varname}};
{% endfor %}
}
}
{% block extra_kernel_parameters %}
int current_iteration,
{% for varname, var in _recorded_variables | dictsort %}
{{c_data_type(var.dtype)}}** monitor_{{varname}},
{% endfor %}
{% endblock %}


{% block extra_host_parameters %}
current_iteration - start_offset,
{% for varname, var in _recorded_variables | dictsort %}
{% set _recorded = get_array_name(var, access_data=False) %}
thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
{% endfor %}
{% endblock %}
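With this rewrite, the kernel is launched with a single block of _num__array_{{owner.name}}__indices threads (see the extra_kernel_call block), and the overridden indices block makes each thread handle one recorded neuron. Roughly, the rendered indexing logic looks like this (hypothetical rendered form with simplified names, for illustration only):

// one thread per recorded neuron, launched as <<<1, number_of_recorded_indices>>>
int tid = threadIdx.x;
int bid = blockIdx.x;
int _vectorisation_idx = bid * THREADS_PER_BLOCK + tid;      // position in the list of recorded neurons
int _idx = _array_statemonitor__indices[_vectorisation_idx]; // neuron index whose state is read
if (_vectorisation_idx >= _N)  // _N is the number of recorded indices (set in host_maincode)
{
    return;
}
// the extra_vector_code block then stores the value recorded by this thread, e.g. for a variable v:
// monitor_v[_vectorisation_idx][current_iteration] = _to_record_v;

Whether consecutive threads end up with consecutive _idx values determines whether the reads from the state arrays are coalesced, which is what the new speed tests below measure.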
40 changes: 39 additions & 1 deletion brian2cuda/tests/features/speed.py
@@ -31,7 +31,9 @@
'STDPCUDARandomConnectivityHeterogeneousDelaysNarrowDistr',
'STDPCUDANoPostEffects',
'STDPEventDriven',
'MushroomBody'
'MushroomBody',
'StateMonitorBenchmarkCoalescedReads',
'StateMonitorBenchmarkUncoalescedReads',
])


@@ -831,9 +833,45 @@ def run(self):

self.timed_run(self.duration)

class StateMonitorBenchmarkBase(TimedSpeedTest):

category = "Monitor only"
tags = ["Monitors", "Neurons"]
n_label = "Num recorded neurons"
name = "StateMonitor benchmark"
n_power = [3, 4, 5, 6, 7, 8, 9, 10]
n_range = [int(10**p) for p in n_power]

# configuration options
duration = 1*second
coalesced_state_reading = None

def run(self):
warp_size = 32
num_neurons = self.n * warp_size
G = NeuronGroup(num_neurons, 'v:1')
G.v = 'i'
assert self.coalesced_state_reading is not None, "Don't use base benchmark class"
if self.coalesced_state_reading:
# record first n neurons in neurongroup (coalesced reads on state variables)
record = arange(self.n)
else:
# record n neurons in steps of 32 (warp size -> non-coalesced reads)
record = arange(0, num_neurons, warp_size)

mon = StateMonitor(G, 'v', record=record)

self.timed_run(self.duration)


class StateMonitorBenchmarkCoalescedReads(StateMonitorBenchmarkBase):
name = "StateMonitor recording from consecutive neuron indices (coalesced read)"
coalesced_state_reading = True


class StateMonitorBenchmarkUncoalescedReads(StateMonitorBenchmarkBase):
name = "StateMonitor recording from non-consecutive neuron indices (uncoalesced read)"
coalesced_state_reading = False


if __name__=='__main__':
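The two new benchmark variants differ only in which neuron indices are recorded. For a small n, the two record patterns produced in StateMonitorBenchmarkBase.run() look like this (plain NumPy, illustration only):

import numpy as np

warp_size = 32
n = 4  # number of recorded neurons (tiny, for illustration)

# coalesced: consecutive neuron indices, so neighbouring threads read neighbouring
# elements of the state array
coalesced = np.arange(n)                              # [0, 1, 2, 3]

# uncoalesced: one recorded neuron per warp-sized stride, so neighbouring threads
# read elements that are 32 entries apart
uncoalesced = np.arange(0, n * warp_size, warp_size)  # [0, 32, 64, 96]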
12 changes: 7 additions & 5 deletions brian2cuda/tests/test_monitor.py
@@ -8,14 +8,16 @@
@pytest.mark.standalone_only
def test_state_monitor_more_threads_than_single_block():
set_device("cuda_standalone", directory=None)
# Currently, statemonitor only works for <=1024 recorded variables (#201).
# This test is to remind us of the issue.
G = NeuronGroup(1025, 'v:1')
n = 2000
G = NeuronGroup(n, 'v:1')
mon = StateMonitor(G, 'v', record=True)
v_init = arange(n)
G.v = v_init

run(defaultclock.dt)
run(3 * defaultclock.dt)

assert_equal(mon.v, 0)
for t in range(3):
assert_equal(mon.v[:, t], v_init)


if __name__ == '__main__':