diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
index 0557cc34..712ef5b9 100644
--- a/brian2cuda/templates/common_group.cu
+++ b/brian2cuda/templates/common_group.cu
@@ -109,6 +109,8 @@ __launch_bounds__(1024, {{sm_multiplier}})
 _run_kernel_{{codeobj_name}}(
     int _N,
     int THREADS_PER_BLOCK,
+    {% block extra_kernel_parameters %}
+    {% endblock %}
     ///// KERNEL_PARAMETERS /////
     %KERNEL_PARAMETERS%
     )
@@ -117,8 +119,11 @@ _run_kernel_{{codeobj_name}}(
 
     int tid = threadIdx.x;
     int bid = blockIdx.x;
+
+    {% block indices %}
     int _idx = bid * THREADS_PER_BLOCK + tid;
     int _vectorisation_idx = _idx;
+    {% endblock %}
 
     ///// KERNEL_CONSTANTS /////
     %KERNEL_CONSTANTS%
@@ -131,12 +136,10 @@ _run_kernel_{{codeobj_name}}(
     {% block additional_variables %}
     {% endblock %}
 
-    {% block num_thread_check %}
-    if(_idx >= _N)
+    if(_vectorisation_idx >= _N)
     {
         return;
     }
-    {% endblock %}
 
     {% block kernel_maincode %}
 
@@ -300,6 +303,8 @@ void _run_{{codeobj_name}}()
         _run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
                 _N,
                 num_threads,
+                {% block extra_host_parameters %}
+                {% endblock %}
                 ///// HOST_PARAMETERS /////
                 %HOST_PARAMETERS%
             );
diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu
index 54b1f11a..1e422a00 100644
--- a/brian2cuda/templates/objects.cu
+++ b/brian2cuda/templates/objects.cu
@@ -353,10 +353,16 @@ void _write_arrays()
     using namespace brian;
 
     {% for var, varname in array_specs | dictsort(by='value') %}
-    {% if not (var in dynamic_array_specs or var in dynamic_array_2d_specs or var in static_array_specs) %}
+    {% if not (var in dynamic_array_specs
+               or var in dynamic_array_2d_specs
+               or var in static_array_specs
+               ) %}
+    {# Don't copy StateMonitor's N variables, which are modified on host only #}
+    {% if not (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 'N') %}
     CUDA_SAFE_CALL(
             cudaMemcpy({{varname}}, dev{{varname}}, sizeof({{c_data_type(var.dtype)}})*_num_{{varname}}, cudaMemcpyDeviceToHost)
             );
+    {% endif %}
     ofstream outfile_{{varname}};
     outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out);
     if(outfile_{{varname}}.is_open())
@@ -371,7 +377,11 @@ void _write_arrays()
     {% endfor %}
 
     {% for var, varname in dynamic_array_specs | dictsort(by='value') %}
-    {% if not var in multisynaptic_idx_vars and not var.name in ['delay', '_synaptic_pre', '_synaptic_post'] %}
+    {# TODO: pass isinstance to Jinja template to make it available here #}
+    {% if not (var in multisynaptic_idx_vars
+               or var.name in ['delay', '_synaptic_pre', '_synaptic_post']
+               or (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 't')
+               ) %}
     {{varname}} = dev{{varname}};
     {% endif %}
     ofstream outfile_{{varname}};
diff --git a/brian2cuda/templates/statemonitor.cu b/brian2cuda/templates/statemonitor.cu
index 4583ba59..bcc43561 100644
--- a/brian2cuda/templates/statemonitor.cu
+++ b/brian2cuda/templates/statemonitor.cu
@@ -1,60 +1,50 @@
-{# USES_VARIABLES { t, N } #}
+{# USES_VARIABLES { t, _indices, N } #}
 {# WRITES_TO_READ_ONLY_VARIABLES { t, N } #}
 {% extends 'common_group.cu' %}
 
 {% block define_N %}
 {% endblock %}
 
-{# remove this once we have properly defined num_threads, num_blocks here... #}
-{% block occupancy %}
-{% endblock %}
-{% block update_occupancy %}
-{% endblock %}
-{% block kernel_info %}
-{% endblock %}
-
-{% block prepare_kernel_inner %}
+{# We are using block modify_kernel_dimensions for additional kernel preparation #}
+{% block modify_kernel_dimensions %}
 {% for varname, var in _recorded_variables | dictsort %}
-{% set _recorded =  get_array_name(var, access_data=False) %}
+{% set _recorded = get_array_name(var, access_data=False) %}
 addresses_monitor_{{_recorded}}.clear();
 {% endfor %}
 for(int i = 0; i < _num__array_{{owner.name}}__indices; i++)
 {
     {% for varname, var in _recorded_variables | dictsort %}
-    {% set _recorded =  get_array_name(var, access_data=False) %}
-    {{_recorded}}[i].resize(_numt + num_iterations - current_iteration);
+    {% set _recorded = get_array_name(var, access_data=False) %}
+    {{_recorded}}[i].resize(_numt_host + num_iterations - current_iteration);
     addresses_monitor_{{_recorded}}.push_back(thrust::raw_pointer_cast(&{{_recorded}}[i][0]));
     {% endfor %}
 }
-// Print a warning when the monitor is not going to work (#50)
-if (_num__array_{{owner.name}}__indices > 1024)
-{
-    printf("ERROR in {{owner.name}}: Too many neurons recorded. Due to a bug (brian-team/brian2cuda#50), "
-           "currently only as many neurons can be recorded as threads can be called from a single block!\n");
-}
-{% endblock prepare_kernel_inner %}
+{% endblock modify_kernel_dimensions %}
 
 {% block host_maincode %}
-// TODO: this pushes a new value to the device each time step? Looks
-// inefficient, can we keep the t values on the host instead? Do we need them
-// on the device?
-dev_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);
+// NOTE: We are using _N as the number of recorded indices here (the relevant size for
+// parallelization). This is different from `StateMonitor.N` in Python, which refers to
+// the number of recorded time steps (while `StateMonitor.n_indices` gives the number of
+// recorded indices).
+const int _N = _num_indices;
+
+// We are using an extra variable because HOST_CONSTANTS uses the device vector, which
+// is not used (TODO: Fix this in HOST_CONSTANTS instead of this hack here...)
+const int _numt_host = _dynamic_array_{{owner.name}}_t.size();
+
+// We push t only on host and don't make a device->host copy in write_arrays()
+_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);
+
 // Update size variables for Python side indexing to work
-// (Note: Need to update device variable which will be copied to host in write_arrays())
-// TODO: This is one cudaMemcpy per time step, this should be done only once in the last
-// time step, fix when fixing the statemonitor (currently only works for <=1024 threads)
 _array_{{owner.name}}_N[0] += 1;
-CUDA_SAFE_CALL(
-    cudaMemcpy(dev_array_{{owner.name}}_N, _array_{{owner.name}}_N, sizeof(int32_t),
-               cudaMemcpyHostToDevice)
-    );
 
 int num_iterations = {{owner.clock.name}}.i_end;
 int current_iteration = {{owner.clock.name}}.timestep[0];
-static int start_offset = current_iteration - _numt;
+static int start_offset = current_iteration - _numt_host;
 {% endblock host_maincode %}
 
-{% block kernel_call %}
+
+{% block extra_kernel_call %}
 // If the StateMonitor is run outside the MagicNetwork, we need to resize it.
 // Happens e.g. when StateMonitor.record_single_timestep() is called.
 if(current_iteration >= num_iterations)
@@ -63,75 +53,49 @@ if(current_iteration >= num_iterations)
 {
     {% for varname, var in _recorded_variables | dictsort %}
     {% set _recorded = get_array_name(var, access_data=False) %}
-    {{_recorded}}[i].resize(_numt + 1);
+    {{_recorded}}[i].resize(_numt_host + 1);
     addresses_monitor_{{_recorded}}[i] = thrust::raw_pointer_cast(&{{_recorded}}[i][0]);
     {% endfor %}
 }
 }
 
-if (_num__array_{{owner.name}}__indices > 0) // TODO we get invalid launch configuration if this is 0, which happens e.g. for StateMonitor(..., variables=[])
+if (_num__array_{{owner.name}}__indices > 0)
 {
-    _run_kernel_{{codeobj_name}}<<<1, _num__array_{{owner.name}}__indices>>>(
-        _num__array_{{owner.name}}__indices,
-        dev_array_{{owner.name}}__indices,
-        current_iteration - start_offset,
-        {% for varname, var in _recorded_variables | dictsort %}
-        {% set _recorded = get_array_name(var, access_data=False) %}
-        thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
-        {% endfor %}
-        ///// HOST_PARAMETERS /////
-        %HOST_PARAMETERS%
-        );
+{% endblock extra_kernel_call %}
+
-    CUDA_CHECK_ERROR("_run_kernel_{{codeobj_name}}");
+{% block extra_kernel_call_post %}
+{# Close conditional from block extra_kernel_call #}
 }
-{% endblock kernel_call %}
-
-{% block kernel %}
-__global__ void
-{% if launch_bounds %}
-__launch_bounds__(1024, {{sm_multiplier}})
-{% endif %}
-_run_kernel_{{codeobj_name}}(
-    int _num_indices,
-    int32_t* indices,
-    int current_iteration,
-    {% for varname, var in _recorded_variables | dictsort %}
-    {{c_data_type(var.dtype)}}** monitor_{{varname}},
-    {% endfor %}
-    ///// KERNEL_PARAMETERS /////
-    %KERNEL_PARAMETERS%
-    )
-{
-    using namespace brian;
+
-    int tid = threadIdx.x;
-    if(tid > _num_indices)
-    {
-        return;
-    }
-    int32_t _idx = indices[tid];
-    ///// KERNEL_CONSTANTS /////
-    %KERNEL_CONSTANTS%
+{% block indices %}
+    int _vectorisation_idx = bid * THREADS_PER_BLOCK + tid;
+    int _idx = {{_indices}}[_vectorisation_idx];
+{% endblock %}
 
-    ///// kernel_lines /////
-    {{kernel_lines|autoindent}}
-    ///// scalar_code /////
-    {{scalar_code|autoindent}}
+{% block extra_vector_code %}
+    {% for varname, var in _recorded_variables | dictsort %}
+    monitor_{{varname}}[_vectorisation_idx][current_iteration] = _to_record_{{varname}};
+    {% endfor %}
+{% endblock extra_vector_code %}
 
-    // need different scope here since scalar_code and vector_code can
-    // declare the same variables
-    {
-        ///// vector_code /////
-        {{vector_code|autoindent}}
-        {% for varname, var in _recorded_variables | dictsort %}
-        {% set _recorded = get_array_name(var, access_data=False) %}
-        monitor_{{varname}}[tid][current_iteration] = _to_record_{{varname}};
-        {% endfor %}
-    }
-}
+{% block extra_kernel_parameters %}
+    int current_iteration,
+    {% for varname, var in _recorded_variables | dictsort %}
+    {{c_data_type(var.dtype)}}** monitor_{{varname}},
+    {% endfor %}
+{% endblock %}
+
+
+{% block extra_host_parameters %}
+    current_iteration - start_offset,
+    {% for varname, var in _recorded_variables | dictsort %}
+    {% set _recorded = get_array_name(var, access_data=False) %}
+    thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
+    {% endfor %}
 {% endblock %}
diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py
index b99b8f35..09f09d13 100644
--- a/brian2cuda/tests/features/speed.py
+++ b/brian2cuda/tests/features/speed.py
@@ -31,7 +31,9 @@
     'STDPCUDARandomConnectivityHeterogeneousDelaysNarrowDistr',
     'STDPCUDANoPostEffects',
     'STDPEventDriven',
-    'MushroomBody'
+    'MushroomBody',
+    'StateMonitorBenchmarkCoalescedReads',
+    'StateMonitorBenchmarkUncoalescedReads',
 ])
 
 
@@ -831,9 +833,45 @@ def run(self):
         self.timed_run(self.duration)
 
 
+class StateMonitorBenchmarkBase(TimedSpeedTest):
+    category = "Monitor only"
+    tags = ["Monitors", "Neurons"]
+    n_label = "Num recorded neurons"
+    name = "StateMonitor benchmark"
+    n_power = [3, 4, 5, 6, 7, 8, 9, 10]
+    n_range = [int(10**p) for p in n_power]
+
+    # configuration options
+    duration = 1*second
+    coalesced_state_reading = None
+
+    def run(self):
+        warp_size = 32
+        num_neurons = self.n * warp_size
+        G = NeuronGroup(num_neurons, 'v:1')
+        G.v = 'i'
+        assert self.coalesced_state_reading is not None, "Don't use base benchmark class"
+        if self.coalesced_state_reading:
+            # record first n neurons in neurongroup (coalesced reads on state variables)
+            record = arange(self.n)
+        else:
+            # record n neurons in steps of 32 (warp size -> non-coalesced reads)
+            record = arange(0, self.n, warp_size)
+
+        mon = StateMonitor(G, 'v', record=record)
+
+        self.timed_run(self.duration)
+
+
+class StateMonitorBenchmarkCoalescedReads(StateMonitorBenchmarkBase):
+    name = "StateMonitor recording from consecutive neuron indices (coalesced read)"
+    coalesced_state_reading = True
+class StateMonitorBenchmarkUncoalescedReads(StateMonitorBenchmarkBase):
+    name = "StateMonitor recording from non-consecutive neuron indices (uncoalesced read)"
+    coalesced_state_reading = False
 
 
 if __name__=='__main__':
diff --git a/brian2cuda/tests/test_monitor.py b/brian2cuda/tests/test_monitor.py
index c02da37e..dbd04d26 100644
--- a/brian2cuda/tests/test_monitor.py
+++ b/brian2cuda/tests/test_monitor.py
@@ -8,14 +8,16 @@
 @pytest.mark.standalone_only
 def test_state_monitor_more_threads_than_single_block():
     set_device("cuda_standalone", directory=None)
-    # Currently, statemonitor only works for <=1024 recorded variables (#201).
-    # This is a test is to remind us of the issue.
-    G = NeuronGroup(1025, 'v:1')
+    n = 2000
+    G = NeuronGroup(n, 'v:1')
     mon = StateMonitor(G, 'v', record=True)
+    v_init = arange(n)
+    G.v = v_init
 
-    run(defaultclock.dt)
+    run(3 * defaultclock.dt)
 
-    assert_equal(mon.v, 0)
+    for t in range(3):
+        assert_equal(mon.v[:, t], v_init)
 
 
 if __name__ == '__main__':
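
Usage note (not part of the patch): a minimal, hypothetical script illustrating the scenario that the updated test and benchmarks exercise, namely recording more than 1024 neurons with a StateMonitor under the cuda_standalone device, which the rewritten kernel parallelises over multiple blocks. It is a sketch only and assumes brian2 and brian2cuda are installed.

import numpy as np
import brian2cuda  # registers the "cuda_standalone" device
from brian2 import NeuronGroup, StateMonitor, defaultclock, run, set_device

set_device("cuda_standalone", directory=None)

n = 2000  # more recorded neurons than threads in a single CUDA block (1024)
G = NeuronGroup(n, 'v:1')
G.v = np.arange(n)
mon = StateMonitor(G, 'v', record=True)  # record all neurons at every time step

run(3 * defaultclock.dt)

# one row per recorded neuron, one column per recorded time step
print(mon.v.shape)  # -> (2000, 3)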