void mv_aTb(const raft::handle_t &handle, Matrix::Data< double > &out, const std::vector< Matrix::Data< double > * > &A, const Matrix::PartDescriptor &ADesc, const std::vector< Matrix::Data< double > * > &b, cudaStream_t *streams, int n_streams)
Performs the MNMG (multi-node, multi-GPU) calculation of the product A^T * b.