tssc-hpcg/src/ComputeDotProduct_ref.cpp


//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file ComputeDotProduct_ref.cpp

 HPCG routine
 */

#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeDotProduct_ref.hpp"
#include <cassert>

/*!
  Routine to compute the dot product of two vectors where:

  This is the reference dot-product implementation.  It _CANNOT_ be modified for the
  purposes of this benchmark.

  @param[in] n the number of vector elements (on this processor)
  @param[in] x, y the input vectors
  @param[in] result a pointer to scalar value, on exit will contain result.
  @param[out] time_allreduce the time it took to perform the communication between processes

  @return returns 0 upon success and non-zero otherwise

  @see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
{
    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);

    double local_result = 0.0;
    double* xv = x.values;
    double* yv = y.values;
    if (yv == xv)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
        for (local_int_t i = 0; i < n; i++)
            local_result += xv[i] * xv[i];
    }
    else
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
        for (local_int_t i = 0; i < n; i++)
            local_result += xv[i] * yv[i];
    }

#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums
    double t0 = mytimer();
    double global_result = 0.0;
    MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    result = global_result;
    time_allreduce += mytimer() - t0;
#else
    time_allreduce += 0.0;
    result = local_result;
#endif

    return 0;
}