diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
index 0557cc34..712ef5b9 100644
--- a/brian2cuda/templates/common_group.cu
+++ b/brian2cuda/templates/common_group.cu
@@ -109,6 +109,8 @@ __launch_bounds__(1024, {{sm_multiplier}})
 _run_kernel_{{codeobj_name}}(
     int _N,
     int THREADS_PER_BLOCK,
+    {% block extra_kernel_parameters %}
+    {% endblock %}
     ///// KERNEL_PARAMETERS /////
     %KERNEL_PARAMETERS%
     )
@@ -117,8 +119,11 @@ _run_kernel_{{codeobj_name}}(
 
     int tid = threadIdx.x;
     int bid = blockIdx.x;
+
+    {% block indices %}
     int _idx = bid * THREADS_PER_BLOCK + tid;
     int _vectorisation_idx = _idx;
+    {% endblock %}
 
     ///// KERNEL_CONSTANTS /////
     %KERNEL_CONSTANTS%
@@ -131,12 +136,10 @@ _run_kernel_{{codeobj_name}}(
     {% block additional_variables %}
     {% endblock %}
 
-    {% block num_thread_check %}
-    if(_idx >= _N)
+    if(_vectorisation_idx >= _N)
     {
         return;
     }
-    {% endblock %}
 
     {% block kernel_maincode %}
 
@@ -300,6 +303,8 @@ void _run_{{codeobj_name}}()
         _run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
                 _N,
                 num_threads,
+                {% block extra_host_parameters %}
+                {% endblock %}
                 ///// HOST_PARAMETERS /////
                 %HOST_PARAMETERS%
             );
diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu
index 54b1f11a..1e422a00 100644
--- a/brian2cuda/templates/objects.cu
+++ b/brian2cuda/templates/objects.cu
@@ -353,10 +353,16 @@ void _write_arrays()
     using namespace brian;
 
     {% for var, varname in array_specs | dictsort(by='value') %}
-    {% if not (var in dynamic_array_specs or var in dynamic_array_2d_specs or var in static_array_specs) %}
+    {% if not (var in dynamic_array_specs
+               or var in dynamic_array_2d_specs
+               or var in static_array_specs
+               ) %}
+    {# Don't copy StateMonitor's N variables, which are modified on host only #}
+    {% if not (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 'N') %}
     CUDA_SAFE_CALL(
             cudaMemcpy({{varname}}, dev{{varname}}, sizeof({{c_data_type(var.dtype)}})*_num_{{varname}}, cudaMemcpyDeviceToHost)
             );
+    {% endif %}
     ofstream outfile_{{varname}};
     outfile_{{varname}}.open("{{get_array_filename(var) | replace('\\', '\\\\')}}", ios::binary | ios::out);
     if(outfile_{{varname}}.is_open())
@@ -371,7 +377,11 @@ void _write_arrays()
     {% endfor %}
 
     {% for var, varname in dynamic_array_specs | dictsort(by='value') %}
-    {% if not var in multisynaptic_idx_vars and not var.name in ['delay', '_synaptic_pre', '_synaptic_post'] %}
+    {# TODO: pass isinstance to Jinja template to make it available here #}
+    {% if not (var in multisynaptic_idx_vars
+               or var.name in ['delay', '_synaptic_pre', '_synaptic_post']
+               or (var.owner.__class__.__name__ == 'StateMonitor' and var.name == 't')
+               ) %}
     {{varname}} = dev{{varname}};
     {% endif %}
     ofstream outfile_{{varname}};
diff --git a/brian2cuda/templates/statemonitor.cu b/brian2cuda/templates/statemonitor.cu
index 4583ba59..bcc43561 100644
--- a/brian2cuda/templates/statemonitor.cu
+++ b/brian2cuda/templates/statemonitor.cu
@@ -1,60 +1,50 @@
-{# USES_VARIABLES { t, N } #}
+{# USES_VARIABLES { t, _indices, N } #}
 {# WRITES_TO_READ_ONLY_VARIABLES { t, N } #}
 {% extends 'common_group.cu' %}
 
 {% block define_N %}
 {% endblock %}
 
-{# remove this once we have properly defined num_threads, num_blocks here... #}
-{% block occupancy %}
-{% endblock %}
-{% block update_occupancy %}
-{% endblock %}
-{% block kernel_info %}
-{% endblock %}
-
-{% block prepare_kernel_inner %}
+{# We are using block modify_kernel_dimensions for additional kernel preparation #}
+{% block modify_kernel_dimensions %}
 {% for varname, var in _recorded_variables | dictsort %}
-{% set _recorded =  get_array_name(var, access_data=False) %}
+{% set _recorded = get_array_name(var, access_data=False) %}
 addresses_monitor_{{_recorded}}.clear();
 {% endfor %}
 for(int i = 0; i < _num__array_{{owner.name}}__indices; i++)
 {
     {% for varname, var in _recorded_variables | dictsort %}
-    {% set _recorded =  get_array_name(var, access_data=False) %}
-    {{_recorded}}[i].resize(_numt + num_iterations - current_iteration);
+    {% set _recorded = get_array_name(var, access_data=False) %}
+    {{_recorded}}[i].resize(_numt_host + num_iterations - current_iteration);
     addresses_monitor_{{_recorded}}.push_back(thrust::raw_pointer_cast(&{{_recorded}}[i][0]));
     {% endfor %}
 }
-// Print a warning when the monitor is not going to work (#50)
-if (_num__array_{{owner.name}}__indices > 1024)
-{
-    printf("ERROR in {{owner.name}}: Too many neurons recorded. Due to a bug (brian-team/brian2cuda#50), "
-           "currently only as many neurons can be recorded as threads can be called from a single block!\n");
-}
-{% endblock prepare_kernel_inner %}
+{% endblock modify_kernel_dimensions %}
 
 {% block host_maincode %}
-// TODO: this pushes a new value to the device each time step? Looks
-// inefficient, can we keep the t values on the host instead? Do we need them
-// on the device?
-dev_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);
+// NOTE: We are using _N as the number of recorded indices here (the relevant size for
+// parallelization). This is different from `StateMonitor.N` in Python, which refers to
+// the number of recorded time steps (while `StateMonitor.n_indices` gives the number of
+// recorded indices).
+const int _N = _num_indices;
+
+// We are using an extra variable because HOST_CONSTANTS uses the device vector, which
+// is not used (TODO: Fix this in HOST_CONSTANTS instead of this hack here...)
+const int _numt_host = _dynamic_array_{{owner.name}}_t.size();
+
+// We push t only on host and don't make a device->host copy in write_arrays()
+_dynamic_array_{{owner.name}}_t.push_back({{owner.clock.name}}.t[0]);
+
 // Update size variables for Python side indexing to work
-// (Note: Need to update device variable which will be copied to host in write_arrays())
-// TODO: This is one cudaMemcpy per time step, this should be done only once in the last
-// time step, fix when fixing the statemonitor (currently only works for <=1024 threads)
 _array_{{owner.name}}_N[0] += 1;
-CUDA_SAFE_CALL(
-    cudaMemcpy(dev_array_{{owner.name}}_N, _array_{{owner.name}}_N, sizeof(int32_t),
-               cudaMemcpyHostToDevice)
-    );
 
 int num_iterations = {{owner.clock.name}}.i_end;
 int current_iteration = {{owner.clock.name}}.timestep[0];
-static int start_offset = current_iteration - _numt;
+static int start_offset = current_iteration - _numt_host;
 {% endblock host_maincode %}
 
-{% block kernel_call %}
+
+{% block extra_kernel_call %}
 // If the StateMonitor is run outside the MagicNetwork, we need to resize it.
 // Happens e.g. when StateMonitor.record_single_timestep() is called.
 if(current_iteration >= num_iterations)
@@ -63,75 +53,49 @@ if(current_iteration >= num_iterations)
 {
     {% for varname, var in _recorded_variables | dictsort %}
     {% set _recorded = get_array_name(var, access_data=False) %}
-    {{_recorded}}[i].resize(_numt + 1);
+    {{_recorded}}[i].resize(_numt_host + 1);
     addresses_monitor_{{_recorded}}[i] = thrust::raw_pointer_cast(&{{_recorded}}[i][0]);
     {% endfor %}
 }
 }
 
-if (_num__array_{{owner.name}}__indices > 0) // TODO we get invalid launch configuration if this is 0, which happens e.g. for StateMonitor(..., variables=[])
+if (_num__array_{{owner.name}}__indices > 0)
 {
-    _run_kernel_{{codeobj_name}}<<<1, _num__array_{{owner.name}}__indices>>>(
-        _num__array_{{owner.name}}__indices,
-        dev_array_{{owner.name}}__indices,
-        current_iteration - start_offset,
-        {% for varname, var in _recorded_variables | dictsort %}
-        {% set _recorded = get_array_name(var, access_data=False) %}
-        thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
-        {% endfor %}
-        ///// HOST_PARAMETERS /////
-        %HOST_PARAMETERS%
-        );
+{% endblock extra_kernel_call %}
+
-    CUDA_CHECK_ERROR("_run_kernel_{{codeobj_name}}");
+{% block extra_kernel_call_post %}
+{# Close conditional from block extra_kernel_call #}
 }
-{% endblock kernel_call %}
-
-{% block kernel %}
-__global__ void
-{% if launch_bounds %}
-__launch_bounds__(1024, {{sm_multiplier}})
-{% endif %}
-_run_kernel_{{codeobj_name}}(
-    int _num_indices,
-    int32_t* indices,
-    int current_iteration,
-    {% for varname, var in _recorded_variables | dictsort %}
-    {{c_data_type(var.dtype)}}** monitor_{{varname}},
-    {% endfor %}
-    ///// KERNEL_PARAMETERS /////
-    %KERNEL_PARAMETERS%
-    )
-{
-    using namespace brian;
+
-    int tid = threadIdx.x;
-    if(tid > _num_indices)
-    {
-        return;
-    }
-    int32_t _idx = indices[tid];
-    ///// KERNEL_CONSTANTS /////
-    %KERNEL_CONSTANTS%
+{% block indices %}
+    int _vectorisation_idx = bid * THREADS_PER_BLOCK + tid;
+    int _idx = {{_indices}}[_vectorisation_idx];
+{% endblock %}
 
-    ///// kernel_lines /////
-    {{kernel_lines|autoindent}}
-    ///// scalar_code /////
-    {{scalar_code|autoindent}}
+{% block extra_vector_code %}
+    {% for varname, var in _recorded_variables | dictsort %}
+    monitor_{{varname}}[_vectorisation_idx][current_iteration] = _to_record_{{varname}};
+    {% endfor %}
+{% endblock extra_vector_code %}
 
-    // need different scope here since scalar_code and vector_code can
-    // declare the same variables
-    {
-        ///// vector_code /////
-        {{vector_code|autoindent}}
-        {% for varname, var in _recorded_variables | dictsort %}
-        {% set _recorded = get_array_name(var, access_data=False) %}
-        monitor_{{varname}}[tid][current_iteration] = _to_record_{{varname}};
-        {% endfor %}
-    }
-}
+{% block extra_kernel_parameters %}
+    int current_iteration,
+    {% for varname, var in _recorded_variables | dictsort %}
+    {{c_data_type(var.dtype)}}** monitor_{{varname}},
+    {% endfor %}
+{% endblock %}
+
+
+{% block extra_host_parameters %}
+    current_iteration - start_offset,
+    {% for varname, var in _recorded_variables | dictsort %}
+    {% set _recorded = get_array_name(var, access_data=False) %}
+    thrust::raw_pointer_cast(&addresses_monitor_{{_recorded}}[0]),
+    {% endfor %}
 {% endblock %}
diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py
index b99b8f35..09f09d13 100644
--- a/brian2cuda/tests/features/speed.py
+++ b/brian2cuda/tests/features/speed.py
@@ -31,7 +31,9 @@
     'STDPCUDARandomConnectivityHeterogeneousDelaysNarrowDistr',
     'STDPCUDANoPostEffects',
     'STDPEventDriven',
-    'MushroomBody'
+    'MushroomBody',
+    'StateMonitorBenchmarkCoalescedReads',
+    'StateMonitorBenchmarkUncoalescedReads',
 ])
 
 
@@ -831,9 +833,45 @@ def run(self):
         self.timed_run(self.duration)
 
 
+class StateMonitorBenchmarkBase(TimedSpeedTest):
+    category = "Monitor only"
+    tags = ["Monitors", "Neurons"]
+    n_label = "Num recorded neurons"
+    name = "StateMonitor benchmark"
+    n_power = [3, 4, 5, 6, 7, 8, 9, 10]
+    n_range = [int(10**p) for p in n_power]
+
+    # configuration options
+    duration = 1*second
+    coalesced_state_reading = None
+
+    def run(self):
+        warp_size = 32
+        num_neurons = self.n * warp_size
+        G = NeuronGroup(num_neurons, 'v:1')
+        G.v = 'i'
+        assert self.coalesced_state_reading is not None, "Don't use base benchmark class"
+        if self.coalesced_state_reading:
+            # record first n neurons in neurongroup (coalesced reads on state variables)
+            record = arange(self.n)
+        else:
+            # record n neurons in steps of 32 (warp size -> non-coalesced reads)
+            record = arange(0, self.n, warp_size)
+
+        mon = StateMonitor(G, 'v', record=record)
+
+        self.timed_run(self.duration)
+
+
+class StateMonitorBenchmarkCoalescedReads(StateMonitorBenchmarkBase):
+    name = "StateMonitor recording from consecutive neuron indices (coalesced read)"
+    coalesced_state_reading = True
+class StateMonitorBenchmarkUncoalescedReads(StateMonitorBenchmarkBase):
+    name = "StateMonitor recording from non-consecutive neuron indices (uncoalesced read)"
+    coalesced_state_reading = False
 
 
 if __name__=='__main__':
diff --git a/brian2cuda/tests/test_monitor.py b/brian2cuda/tests/test_monitor.py
index c02da37e..dbd04d26 100644
--- a/brian2cuda/tests/test_monitor.py
+++ b/brian2cuda/tests/test_monitor.py
@@ -8,14 +8,16 @@
 @pytest.mark.standalone_only
 def test_state_monitor_more_threads_than_single_block():
     set_device("cuda_standalone", directory=None)
-    # Currently, statemonitor only works for <=1024 recorded variables (#201).
-    # This is a test is to remind us of the issue.
-    G = NeuronGroup(1025, 'v:1')
+    n = 2000
+    G = NeuronGroup(n, 'v:1')
     mon = StateMonitor(G, 'v', record=True)
+    v_init = arange(n)
+    G.v = v_init
 
-    run(defaultclock.dt)
+    run(3 * defaultclock.dt)
 
-    assert_equal(mon.v, 0)
+    for t in range(3):
+        assert_equal(mon.v[:, t], v_init)
 
 
 if __name__ == '__main__':
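
Usage note (not part of the patch): a minimal, hypothetical script illustrating the scenario that the updated test and benchmarks exercise, namely recording more than 1024 neurons with a StateMonitor under the cuda_standalone device, which the rewritten kernel parallelises over multiple blocks. It is a sketch only and assumes brian2 and brian2cuda are installed.

import numpy as np
import brian2cuda  # registers the "cuda_standalone" device
from brian2 import NeuronGroup, StateMonitor, defaultclock, run, set_device

set_device("cuda_standalone", directory=None)

n = 2000  # more recorded neurons than threads in a single CUDA block (1024)
G = NeuronGroup(n, 'v:1')
G.v = np.arange(n)
mon = StateMonitor(G, 'v', record=True)  # record all neurons at every time step

run(3 * defaultclock.dt)

# one row per recorded neuron, one column per recorded time step
print(mon.v.shape)  # -> (2000, 3)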