Consider the minimal working example (MWE) below.

```
using Flux
# Network under test: an RNN layer built as RNN(1, 128) chained into Dense(128, 1).
model = Chain(RNN(1, 128), Dense(128, 1));

# Input sequence: 3 time steps, each a 1×6 Float32 matrix
# (one feature per step, batch size of 6).
x = [rand(Float32, 1, 6) for _ in 1:3]
# Target: a single 1×6 Float32 matrix.
y = rand(Float32, 1, 6)
# Many-to-one loss over a whole sequence.
# `x` is iterated step by step; `y` is compared against the output of the final step.
# Reads the global `model`. The comprehension-then-`[end]` form is kept exactly as written
# because this snippet exists to reproduce the error reported below.
function loss(x, y)
# Call Flux.reset! on the model before feeding the sequence
# (presumably clears the recurrent hidden state — confirm against the Flux docs).
Flux.reset!(model)
# xi is the input at one step
# many-to-one: we only care about the last output
# Every step's output is collected into an array, then only the last element is kept.
ŷ = [model(xi) for xi in x][end]
# Flux.mse of the last output against the target `y`.
return Flux.mse(ŷ, y)
end
# Collect the model's parameters, then differentiate the loss on (x, y) with respect
# to them. The `do` block is written out as an explicit zero-argument closure.
# NOTE: this `gradient` call is the statement that raises the MethodError reported below.
ps = Flux.params(model)
gs = gradient(() -> loss(x, y), ps)
```

The `gradient` call above raises the following error:

```
MethodError: no method matching ndims(::ChainRulesCore.NoTangent)
Closest candidates are:
ndims(::CUDA.CUSPARSE.CuSparseMatrix{Tv, Ti} where {Tv, Ti}) at /home/shuhua/.julia/packages/CUDA/YpW0k/lib/cusparse/array.jl:204
ndims(::CUDA.CuSparseDeviceVector) at /home/shuhua/.julia/packages/CUDA/YpW0k/src/device/sparse.jl:23
ndims(::AbstractChar) at char.jl:191
...
```