Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
10
tests/opencl/vectorhypot/Makefile
Normal file
10
tests/opencl/vectorhypot/Makefile
Normal file
@@ -0,0 +1,10 @@
|
||||
PROJECT = vectorhypot
|
||||
|
||||
SRCS = main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp
|
||||
|
||||
CXXFLAGS += -I.
|
||||
|
||||
OPTS ?=
|
||||
|
||||
include ../common.mk
|
||||
|
||||
152
tests/opencl/vectorhypot/cmd_arg_reader.cpp
Normal file
152
tests/opencl/vectorhypot/cmd_arg_reader.cpp
Normal file
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
// includes, file
|
||||
#include "cmd_arg_reader.h"
|
||||
|
||||
// includes, system
|
||||
#include <vector>
|
||||
|
||||
// internal unnamed namespace
|
||||
|
||||
namespace
|
||||
{
|
||||
// types, internal (class, enum, struct, union, typedef)
|
||||
|
||||
// variables, internal
|
||||
|
||||
} // namespace {
|
||||
|
||||
// variables, exported
|
||||
|
||||
/*static*/ CmdArgReader* CmdArgReader::self;
|
||||
/*static*/ char** CmdArgReader::rargv;
|
||||
/*static*/ int CmdArgReader::rargc;
|
||||
|
||||
// functions, exported
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Public construction interface
|
||||
//! @return a handle to the class instance
|
||||
//! @param argc number of command line arguments (as given to main())
|
||||
//! @param argv command line argument string (as given to main())
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ void
|
||||
CmdArgReader::init( const int argc, const char** argv)
|
||||
{
|
||||
if ( NULL != self)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// command line arguments
|
||||
if (( 0 == argc) || ( 0 == argv))
|
||||
{
|
||||
LOGIC_EXCEPTION( "No command line arguments given.");
|
||||
}
|
||||
|
||||
self = new CmdArgReader();
|
||||
|
||||
self->createArgsMaps( argc, argv);
|
||||
|
||||
rargc = argc;
|
||||
rargv = const_cast<char**>( argv);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::CmdArgReader() :
|
||||
args(),
|
||||
unprocessed(),
|
||||
iter(),
|
||||
iter_unprocessed()
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::~CmdArgReader()
|
||||
{
|
||||
for( iter = args.begin(); iter != args.end(); ++iter)
|
||||
{
|
||||
if( *(iter->second.first) == typeid( int))
|
||||
{
|
||||
delete static_cast<int*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( bool))
|
||||
{
|
||||
delete static_cast<bool*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::string))
|
||||
{
|
||||
delete static_cast<std::string*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::vector< std::string>) )
|
||||
{
|
||||
delete static_cast< std::vector< std::string>* >( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::vector<int>) )
|
||||
{
|
||||
delete static_cast< std::vector<int>* >( iter->second.second);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Read args as token value pair into map for better processing (Even the
|
||||
//! values remain strings until the parameter values is requested by the
|
||||
//! program.)
|
||||
//! @param argc the argument count (as given to 'main')
|
||||
//! @param argv the char* array containing the command line arguments
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void
|
||||
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
|
||||
|
||||
std::string token;
|
||||
std::string val_str;
|
||||
|
||||
std::map< std::string, std::string> args;
|
||||
|
||||
std::string::size_type pos;
|
||||
std::string arg;
|
||||
for( int i=1; i<argc; ++i)
|
||||
{
|
||||
arg = argv[i];
|
||||
|
||||
// check if valid command line argument: all arguments begin with - or --
|
||||
if (arg[0] != '-')
|
||||
{
|
||||
RUNTIME_EXCEPTION("Invalid command line argument.");
|
||||
}
|
||||
|
||||
int numDashes = (arg[1] == '-' ? 2 : 1);
|
||||
|
||||
// check if only flag or if a value is given
|
||||
if ( (pos = arg.find( '=')) == std::string::npos)
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
|
||||
}
|
||||
else
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
|
||||
std::string( arg, pos+1, arg.length()-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
488
tests/opencl/vectorhypot/cmd_arg_reader.h
Normal file
488
tests/opencl/vectorhypot/cmd_arg_reader.h
Normal file
@@ -0,0 +1,488 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
#ifndef _CMDARGREADER_H_
|
||||
#define _CMDARGREADER_H_
|
||||
|
||||
// includes, system
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
|
||||
// includes, project
|
||||
#include "exception.h"
|
||||
|
||||
//! Preprocessed command line arguments
|
||||
//! @note Lazy evaluation: The arguments are converted from strings to
|
||||
//! the correct data type upon request. Converted values are stored
|
||||
//! in an additonal map so that no additional conversion is
|
||||
//! necessary. Arrays of command line arguments are stored in
|
||||
//! std::vectors
|
||||
//! @note Usage:
|
||||
//! const std::string* file =
|
||||
//! CmdArgReader::getArg< std::string>( "model")
|
||||
//! const std::vector< std::string>* files =
|
||||
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
|
||||
//! @note All command line arguments begin with '--' followed by the token;
|
||||
//! token and value are seperated by '='; example --samples=50
|
||||
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
//! (without whitespaces)
|
||||
|
||||
//! Command line argument parser
|
||||
class CmdArgReader
|
||||
{
|
||||
template<class> friend class TestCmdArgReader;
|
||||
|
||||
protected:
|
||||
|
||||
//! @param self handle to the only instance of this class
|
||||
static CmdArgReader* self;
|
||||
|
||||
public:
|
||||
|
||||
//! Public construction interface
|
||||
//! @return a handle to the class instance
|
||||
//! @param argc number of command line arguments (as given to main())
|
||||
//! @param argv command line argument string (as given to main())
|
||||
static void init( const int argc, const char** argv);
|
||||
|
||||
public:
|
||||
|
||||
//! Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument.
|
||||
//! If the argument does not exist or if it
|
||||
//! is not from type T NULL is returned
|
||||
//! @param name the name of the requested argument
|
||||
//! @note T the type of the argument requested
|
||||
template<class T>
|
||||
static inline const T* getArg( const std::string& name);
|
||||
|
||||
//! Check if a command line argument with the given name exists
|
||||
//! @return true if a command line argument with name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name name of the command line argument in question
|
||||
static inline bool existArg( const std::string& name);
|
||||
|
||||
//! Get the original / raw argc program argument
|
||||
static inline int& getRArgc();
|
||||
|
||||
//! Get the original / raw argv program argument
|
||||
static inline char**& getRArgv();
|
||||
|
||||
public:
|
||||
|
||||
//! Destructor
|
||||
~CmdArgReader();
|
||||
|
||||
protected:
|
||||
|
||||
//! Constructor, default
|
||||
CmdArgReader();
|
||||
|
||||
private:
|
||||
|
||||
// private helper functions
|
||||
|
||||
//! Get the value of the command line argument with given name
|
||||
//! @note Private helper function for 'getArg' to work on the members
|
||||
//! @return A const handle to the requested argument. If the argument
|
||||
//! does not exist or if it is not from type T a NULL pointer
|
||||
//! is returned.
|
||||
//! @param name the name of the requested argument
|
||||
//! @note T the type of the argument requested
|
||||
template<class T>
|
||||
inline const T* getArgHelper( const std::string& name);
|
||||
|
||||
//! Check if a command line argument with name \a name exists
|
||||
//! @return true if a command line argument of name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name the name of the requested argument
|
||||
inline bool existArgHelper( const std::string& name) const;
|
||||
|
||||
//! Read args as token value pair into map for better processing
|
||||
//! (Even the values remain strings until the parameter values is
|
||||
//! requested by the program.)
|
||||
//! @param argc the argument count (as given to 'main')
|
||||
//! @param argv the char* array containing the command line arguments
|
||||
void createArgsMaps( const int argc, const char** argv);
|
||||
|
||||
//! Helper for "casting" the strings from the map with the unprocessed
|
||||
//! values to the correct
|
||||
//! data type.
|
||||
//! @return true if conversion succeeded, otherwise false
|
||||
//! @param element the value as string
|
||||
//! @param val the value as type T
|
||||
template<class T>
|
||||
static inline bool convertToT( const std::string& element, T& val);
|
||||
|
||||
public:
|
||||
|
||||
// typedefs internal
|
||||
|
||||
//! container for a processed command line argument
|
||||
//! typeid is used to easily be able to decide if a re-requested token-value
|
||||
//! pair match the type of the first conversion
|
||||
typedef std::pair< const std::type_info*, void*> ValType;
|
||||
//! map of already converted values
|
||||
typedef std::map< std::string, ValType > ArgsMap;
|
||||
//! iterator for the map of already converted values
|
||||
typedef ArgsMap::iterator ArgsMapIter;
|
||||
typedef ArgsMap::const_iterator ConstArgsMapIter;
|
||||
|
||||
//! map of unprocessed (means unconverted) token-value pairs
|
||||
typedef std::map< std::string, std::string> UnpMap;
|
||||
//! iterator for the map of unprocessed (means unconverted) token-value pairs
|
||||
typedef std::map< std::string, std::string>::iterator UnpMapIter;
|
||||
|
||||
private:
|
||||
|
||||
#ifdef _WIN32
|
||||
# pragma warning( disable: 4251)
|
||||
#endif
|
||||
|
||||
//! rargc original value of argc
|
||||
static int rargc;
|
||||
|
||||
//! rargv contains command line arguments in raw format
|
||||
static char** rargv;
|
||||
|
||||
//! args Map containing the already converted token-value pairs
|
||||
ArgsMap args;
|
||||
|
||||
//! args Map containing the unprocessed / unconverted token-value pairs
|
||||
UnpMap unprocessed;
|
||||
|
||||
//! iter Iterator for the map with the already converted token-value
|
||||
//! pairs (to avoid frequent reallocation)
|
||||
ArgsMapIter iter;
|
||||
|
||||
//! iter Iterator for the map with the unconverted token-value
|
||||
//! pairs (to avoid frequent reallocation)
|
||||
UnpMapIter iter_unprocessed;
|
||||
|
||||
#ifdef _WIN32
|
||||
# pragma warning( default: 4251)
|
||||
#endif
|
||||
|
||||
private:
|
||||
|
||||
//! Constructor, copy (not implemented)
|
||||
CmdArgReader( const CmdArgReader&);
|
||||
|
||||
//! Assignment operator (not implemented)
|
||||
CmdArgReader& operator=( const CmdArgReader&);
|
||||
};
|
||||
|
||||
// variables, exported (extern)
|
||||
|
||||
// functions, inlined (inline)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line argument arrays
|
||||
//! @note This function is used each type for which no template specialization
|
||||
//! exist (which will cause errors if the type does not fulfill the std::vector
|
||||
//! interface).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::convertToT( const std::string& element, T& val)
|
||||
{
|
||||
// preallocate storage
|
||||
val.resize( std::count( element.begin(), element.end(), ',') + 1);
|
||||
|
||||
unsigned int i = 0;
|
||||
std::string::size_type pos_start = 1; // leave array prefix '['
|
||||
std::string::size_type pos_end = 0;
|
||||
|
||||
// do for all elements of the comma seperated list
|
||||
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
|
||||
{
|
||||
// convert each element by the appropriate function
|
||||
if ( ! convertToT< typename T::value_type >(
|
||||
std::string( element, pos_start, pos_end - pos_start), val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
pos_start = pos_end + 1;
|
||||
++i;
|
||||
}
|
||||
|
||||
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
|
||||
|
||||
// process last element (leave array postfix ']')
|
||||
if ( ! convertToT< typename T::value_type >( std::string( element,
|
||||
pos_start,
|
||||
element.length() - pos_start - 1),
|
||||
val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// possible to process all elements?
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type int
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<int>( const std::string& element, int& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type float
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<float>( const std::string& element, float& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type double
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<double>( const std::string& element, double& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type string
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<std::string>( const std::string& element,
|
||||
std::string& val)
|
||||
{
|
||||
val = element;
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type bool
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
|
||||
{
|
||||
// check if value is given as string-type { true | false }
|
||||
if ( "true" == element)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( "false" == element)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
// check if argument is given as integer { 0 | 1 }
|
||||
else
|
||||
{
|
||||
int tmp;
|
||||
if ( convertToT<int>( element, tmp))
|
||||
{
|
||||
if ( 1 == tmp)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( 0 == tmp)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ const T*
|
||||
CmdArgReader::getArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return self->getArgHelper<T>( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with the given name exists
|
||||
//! @return true if a command line argument with name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name name of the command line argument in question
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::existArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return false;
|
||||
}
|
||||
|
||||
return self->existArgHelper( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! @brief Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
const T*
|
||||
CmdArgReader::getArgHelper( const std::string& name)
|
||||
{
|
||||
// check if argument already processed and stored in correct type
|
||||
if ( args.end() != (iter = args.find( name)))
|
||||
{
|
||||
if ( (*(iter->second.first)) == typeid( T) )
|
||||
{
|
||||
return (T*) iter->second.second;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
T* tmp = new T;
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
|
||||
{
|
||||
// try to "cast" the string to the type requested
|
||||
if ( convertToT< T >( iter_unprocessed->second, *tmp))
|
||||
{
|
||||
// add the token element pair to map of already converted values
|
||||
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
}
|
||||
|
||||
// not used while not inserted into the map -> cleanup
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
// failed, argument not available
|
||||
return NULL;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with name \a name exists
|
||||
//! @return true if a command line argument of name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
inline bool
|
||||
CmdArgReader::existArgHelper( const std::string& name) const
|
||||
{
|
||||
bool ret_val = false;
|
||||
|
||||
// check if argument already processed and stored in correct type
|
||||
if( args.end() != args.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != unprocessed.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argc program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline int&
|
||||
CmdArgReader::getRArgc()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargc;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argv program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline char**&
|
||||
CmdArgReader::getRArgv()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargv;
|
||||
}
|
||||
|
||||
// functions, exported (extern)
|
||||
|
||||
#endif // #ifndef _CMDARGREADER_H_
|
||||
151
tests/opencl/vectorhypot/exception.h
Normal file
151
tests/opencl/vectorhypot/exception.h
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
#ifndef _EXCEPTION_H_
|
||||
#define _EXCEPTION_H_
|
||||
|
||||
// includes, system
|
||||
#include <exception>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
//! Exception wrapper.
|
||||
//! @param Std_Exception Exception out of namespace std for easy typing.
|
||||
template<class Std_Exception>
|
||||
class Exception : public Std_Exception
|
||||
{
|
||||
public:
|
||||
|
||||
//! @brief Static construction interface
|
||||
//! @return Alwayss throws ( Located_Exception<Exception>)
|
||||
//! @param file file in which the Exception occurs
|
||||
//! @param line line in which the Exception occurs
|
||||
//! @param detailed details on the code fragment causing the Exception
|
||||
static void throw_it( const char* file,
|
||||
const int line,
|
||||
const char* detailed = "-" );
|
||||
|
||||
//! Static construction interface
|
||||
//! @return Alwayss throws ( Located_Exception<Exception>)
|
||||
//! @param file file in which the Exception occurs
|
||||
//! @param line line in which the Exception occurs
|
||||
//! @param detailed details on the code fragment causing the Exception
|
||||
static void throw_it( const char* file,
|
||||
const int line,
|
||||
const std::string& detailed);
|
||||
|
||||
//! Destructor
|
||||
virtual ~Exception() throw();
|
||||
|
||||
private:
|
||||
|
||||
//! Constructor, default (private)
|
||||
Exception();
|
||||
|
||||
//! Constructor, standard
|
||||
//! @param str string returned by what()
|
||||
Exception( const std::string& str);
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Exception handler function for arbitrary exceptions
|
||||
//! @param ex exception to handle
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Exception_Typ>
|
||||
inline void
|
||||
handleException( const Exception_Typ& ex)
|
||||
{
|
||||
std::cerr << ex.what() << std::endl;
|
||||
|
||||
exit( EXIT_FAILURE);
|
||||
}
|
||||
|
||||
//! Convenience macros
|
||||
|
||||
//! Exception caused by dynamic program behavior, e.g. file does not exist
|
||||
#define RUNTIME_EXCEPTION( msg) \
|
||||
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
//! Logic exception in program, e.g. an assert failed
|
||||
#define LOGIC_EXCEPTION( msg) \
|
||||
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
//! Out of range exception
|
||||
#define RANGE_EXCEPTION( msg) \
|
||||
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Implementation
|
||||
|
||||
// includes, system
|
||||
#include <sstream>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const char* detailed)
|
||||
{
|
||||
std::stringstream s;
|
||||
|
||||
// Quiet heavy-weight but exceptions are not for
|
||||
// performance / release versions
|
||||
s << "Exception in file '" << file << "' in line " << line << "\n"
|
||||
<< "Detailed description: " << detailed << "\n";
|
||||
|
||||
throw Exception( s.str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const std::string& msg)
|
||||
{
|
||||
throw_it( file, line, msg.c_str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default (private).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception() :
|
||||
Exception("Unknown Exception.\n")
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, standard (private).
|
||||
//! String returned by what().
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception( const std::string& s) :
|
||||
Std_Exception( s)
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::~Exception() throw() { }
|
||||
|
||||
// functions, exported
|
||||
|
||||
#endif // #ifndef _EXCEPTION_H_
|
||||
|
||||
41
tests/opencl/vectorhypot/kernel.cl
Normal file
41
tests/opencl/vectorhypot/kernel.cl
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// OpenCL Kernel Function Naive Implementation for hyptenuse
|
||||
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
|
||||
{
|
||||
// get index into global data array
|
||||
size_t szGlobalOffset = get_global_id(0) + uiOffset;
|
||||
|
||||
// bound check
|
||||
if (szGlobalOffset >= uiNumElements)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Processing 4 elements per work item, so read fgA and fgB source values from GMEM
|
||||
float4 f4A = fg4A[szGlobalOffset];
|
||||
float4 f4B = fg4B[szGlobalOffset];
|
||||
float4 f4H = (float4)0.0f;
|
||||
|
||||
// Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop
|
||||
for (int i = 0; i < iInnerLoopCount; i++)
|
||||
{
|
||||
// compute the 4 hypotenuses using built-in function
|
||||
f4H.x = hypot (f4A.x, f4B.x);
|
||||
f4H.y = hypot (f4A.y, f4B.y);
|
||||
f4H.z = hypot (f4A.z, f4B.z);
|
||||
f4H.w = hypot (f4A.w, f4B.w);
|
||||
}
|
||||
|
||||
// Write 4 result values back out to GMEM
|
||||
fg4Hypot[szGlobalOffset] = f4H;
|
||||
}
|
||||
702
tests/opencl/vectorhypot/main.cc
Normal file
702
tests/opencl/vectorhypot/main.cc
Normal file
@@ -0,0 +1,702 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclCopyComputeOverlap Notes:
|
||||
//
|
||||
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
|
||||
// element by element vector hyptenuse computation using 2 input float arrays
|
||||
// and 1 output float array.
|
||||
//
|
||||
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
|
||||
// with respect to GPU computation (and with respect to host thread).
|
||||
//
|
||||
// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
|
||||
// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
|
||||
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
|
||||
//
|
||||
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
|
||||
// A) Computations with 2 command queues on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
// B) Computations with 1 command queue on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
//
|
||||
// The 2-command queue approach ought to be substantially faster
|
||||
//
|
||||
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
|
||||
// increases compute time without increasing data size (via a loop inside the kernel)
|
||||
//
|
||||
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
|
||||
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
|
||||
//
|
||||
// If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
|
||||
//
|
||||
// Single Queue with all the data and all the work
|
||||
// Ttot (serial) = 4T + 4T + 2T = 10T
|
||||
//
|
||||
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
|
||||
// Tq0 (overlap) = 2T + 2T + T ....
|
||||
// Tq1 (overlap) = .... 2T + 2T + T
|
||||
//
|
||||
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
|
||||
//
|
||||
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
|
||||
//
|
||||
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
|
||||
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
|
||||
// *********************************************************************
|
||||
|
||||
// common SDK header for standard utilities and system libs
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
#include <iostream>
|
||||
|
||||
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
|
||||
// values greater than 0.0f represent a speed-up relative to non-overlapped
|
||||
#define EXPECTED_OVERLAP 30.0f
|
||||
#define EXPECTED_OVERLAP_FERMI 45.0f
|
||||
#define PASS_FACTOR 0.60f
|
||||
#define RETRIES_ON_FAILURE 1
|
||||
|
||||
// Base sizes for parameters manipulated dynamically or on the command line
|
||||
#define BASE_WORK_ITEMS 64
|
||||
#define BASE_ARRAY_LENGTH 40000
|
||||
#define BASE_LOOP_COUNT 32
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
// Vars
|
||||
// *********************************************************************
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
|
||||
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
|
||||
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
|
||||
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
|
||||
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
|
||||
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
|
||||
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevResult; // OpenCL device result buffer
|
||||
size_t szBuffBytes; // Size of main buffers
|
||||
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
|
||||
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
const char* cSourceFile = "kernel.cl"; // OpenCL computation kernel source code
|
||||
float* Golden = NULL; // temp buffer to hold golden results for cross check
|
||||
bool bNoPrompt = false; // Command line switch to skip exit prompt
|
||||
bool bQATest = false; // Command line switch to test
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = 0;
|
||||
const char *** gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
//Locals
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
double dBuildTime; // Compile time
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
|
||||
cl_uint uiNumDevices; // Number of devices available
|
||||
int iDevCap = -1; // Capability of device
|
||||
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
|
||||
const int iTestCycles = 10; // How many times to run the external test loop
|
||||
const int iWarmupCycles = 8; // How many times to run the warmup sequence
|
||||
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
|
||||
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
|
||||
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
|
||||
bool bPassFlag = false; // Var to accumulate test pass/fail
|
||||
shrBOOL bMatch = shrFALSE; // Cross check result
|
||||
shrBOOL bTestOverlap = shrFALSE;
|
||||
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
|
||||
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
|
||||
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
|
||||
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclCopyComputeOverlap.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
// get basic command line args
|
||||
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
|
||||
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
|
||||
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
|
||||
|
||||
// Optional Command-line multiplier for vector size
|
||||
// Default val of 4 gives 10.24 million float elements per vector
|
||||
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
|
||||
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
|
||||
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
|
||||
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
|
||||
shrLog("Array sizes = %u float elements\n", uiNumElements);
|
||||
|
||||
// Optional Command-line multiplier for workgroup size (x 64 work items)
|
||||
// Default val of 4 gives szLocalWorkSize of 256.
|
||||
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
|
||||
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
|
||||
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
|
||||
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
|
||||
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
|
||||
|
||||
// Get the NVIDIA platform if available, otherwise use default
|
||||
shrLog("Get the Platform ID...\n\n");
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get OpenCL platform name and version
|
||||
char cBuffer[256];
|
||||
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("Platform Name = %s\n\n", cBuffer);
|
||||
|
||||
// Get all the devices
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
uiNumDevices = 1;
|
||||
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, cdDevices, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Set target device and check capabilities
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
shrLog(" Using Device %u, ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
/*iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
|
||||
if (iDevCap > 0) {
|
||||
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
|
||||
} else {
|
||||
shrLog("\n\n", iDevCap);
|
||||
}
|
||||
if (strstr(cBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
if (iDevCap < 12)
|
||||
{
|
||||
shrLog("Device doesn't have overlap capability. Skipping test...\n");
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// Device and Platform eligible for overlap testing
|
||||
bTestOverlap = shrTRUE;
|
||||
|
||||
// If device has overlap capability, proceed
|
||||
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
|
||||
if (iDevCap != 20)
|
||||
{
|
||||
// Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
char cDevName[1024];
|
||||
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
|
||||
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
|
||||
{
|
||||
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
// Geforce ... Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
// Create the context
|
||||
shrLog("clCreateContext...\n");
|
||||
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create 2 command-queues
|
||||
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [0]...\n");
|
||||
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [1]...\n");
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
|
||||
szBuffBytes = sizeof(cl_float) * uiNumElements;
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
|
||||
|
||||
// Allocate pinned source and result host buffers:
|
||||
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
|
||||
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
|
||||
|
||||
// Get mapped pointers to pinned input host buffers
|
||||
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
|
||||
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
|
||||
|
||||
// Alloc temp golden buffer for cross checks
|
||||
Golden = (float*)malloc(szBuffBytes);
|
||||
oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
|
||||
|
||||
#ifdef HOSTGPU
|
||||
// Read the OpenCL kernel in from source file
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
oclCheckError(cPathAndName != NULL, shrTRUE);
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
// Create the program object
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
#else
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cpProgram = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &cdDevices[uiTargetDevice], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
#endif
|
||||
// Build the program for the target device
|
||||
clFinish(cqCommandQueue[0]);
|
||||
shrDeltaT(0);
|
||||
ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
|
||||
shrLog("clBuildProgram...");
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
dBuildTime = shrDeltaT(0);
|
||||
|
||||
// Create the kernel
|
||||
ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateKernel (ckKernel[2])...\n");
|
||||
|
||||
// Offsets for 2 queues
|
||||
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
|
||||
|
||||
// Set the Argument values for the 1st kernel instance (queue 0)
|
||||
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
|
||||
|
||||
// Set the Argument values for the 2d kernel instance (queue 1)
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
|
||||
|
||||
//*******************************************
|
||||
// Warmup the driver with dual queue sequence
|
||||
//*******************************************
|
||||
|
||||
// Warmup with dual queue sequence for iTestCycles
|
||||
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
|
||||
DualQueueSequence(iWarmupCycles, uiNumElements, false);
|
||||
|
||||
// Use single queue config to adjust compute intensity
|
||||
shrLog("Adjust compute for GPU / system...\n");
|
||||
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
|
||||
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 2 command-queues
|
||||
//*******************************************
|
||||
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
|
||||
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[0] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag = (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 1 command queue
|
||||
//*******************************************
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[1] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag &= (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
|
||||
// Compare Single and Dual queue timing
|
||||
shrLog("\nResult Summary:\n");
|
||||
|
||||
// Log GPU and CPU Time for 2-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
|
||||
|
||||
// Log GPU and CPU Time for 1-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
|
||||
|
||||
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
|
||||
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
|
||||
|
||||
if( bTestOverlap ) {
|
||||
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
|
||||
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
|
||||
|
||||
// Log info to master log in standard format
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
|
||||
|
||||
bPassFlag &= bAvgOverlapOK;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
|
||||
}
|
||||
|
||||
|
||||
//*******************************************
|
||||
// Report pass/fail, cleanup and exit
|
||||
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Run 1 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
double dAvgTime = 0.0;
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the sequence iCycles times
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
}
|
||||
|
||||
// *** Assure sync to host and return average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n1-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Run 2 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Locals
|
||||
size_t szHalfBuffer = szBuffBytes / 2;
|
||||
size_t szHalfOffset = szHalfBuffer / sizeof(float);
|
||||
double dAvgTime = 0.0;
|
||||
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Set Global work size for 2 command-queues, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
|
||||
|
||||
// Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Mid Phase 0
|
||||
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 1 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 0 and write for queue 1 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 2 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 1
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 1 and the read for queue 0 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 0 (Rolls over) ***********************************
|
||||
|
||||
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// *** Sync to host and get average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n2-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Function to adjust compute task according to device capability
|
||||
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
|
||||
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
|
||||
// *********************************************************************
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
|
||||
{
|
||||
// Locals
|
||||
double dCopyTime, dComputeTime;
|
||||
int iComputedLoopCount;
|
||||
|
||||
// Change Source Data
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the copy iCycles times and measure copy time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dCopyTime = shrDeltaT(0);
|
||||
|
||||
// Run the compute iCycles times and measure compute time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dComputeTime = shrDeltaT(0);
|
||||
|
||||
// Determine number of core loop cycles proportional to copy/compute time ratio
|
||||
dComputeTime = MAX(dComputeTime, 1.0e-6);
|
||||
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
return (iComputedLoopCount);
|
||||
}
|
||||
|
||||
// Cleanup/Exit function
|
||||
// *********************************************************************
|
||||
void Cleanup (int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(Golden)free(Golden);
|
||||
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
|
||||
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
|
||||
if(cpProgram)clReleaseProgram(cpProgram);
|
||||
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
|
||||
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
|
||||
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
|
||||
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if(cmDevResult)clReleaseMemObject(cmDevResult);
|
||||
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
|
||||
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
|
||||
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
|
||||
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
|
||||
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
// Master status Pass/Fail (all tests)
|
||||
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
|
||||
// "Golden" Host processing vector hyptenuse function for comparison purposes
|
||||
// *********************************************************************
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
|
||||
{
|
||||
for (unsigned int i = 0; i < uiNumElements; i++)
|
||||
{
|
||||
float fA = pfData1[i];
|
||||
float fB = pfData2[i];
|
||||
float fC = sqrtf(fA * fA + fB * fB);
|
||||
|
||||
pfResult[i] = fC;
|
||||
}
|
||||
}
|
||||
806
tests/opencl/vectorhypot/oclUtils.cpp
Normal file
806
tests/opencl/vectorhypot/oclUtils.cpp
Normal file
@@ -0,0 +1,806 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdarg.h>
|
||||
#include "oclUtils.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platoform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
|
||||
{
|
||||
char chBuffer[1024];
|
||||
cl_uint num_platforms;
|
||||
cl_platform_id* clPlatformIDs;
|
||||
cl_int ciErrNum;
|
||||
*clSelectedPlatformID = NULL;
|
||||
|
||||
// Get OpenCL platform count
|
||||
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
|
||||
return -1000;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(num_platforms == 0)
|
||||
{
|
||||
shrLog("No OpenCL platform found!\n\n");
|
||||
return -2000;
|
||||
}
|
||||
else
|
||||
{
|
||||
// if there's a platform or more, make space for ID's
|
||||
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
|
||||
{
|
||||
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
|
||||
return -3000;
|
||||
}
|
||||
|
||||
// get platform info for each platform and trap the NVIDIA platform if found
|
||||
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
|
||||
for(cl_uint i = 0; i < num_platforms; ++i)
|
||||
{
|
||||
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
|
||||
if(ciErrNum == CL_SUCCESS)
|
||||
{
|
||||
if(strstr(chBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
*clSelectedPlatformID = clPlatformIDs[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// default to zeroeth platform if NVIDIA not found
|
||||
if(*clSelectedPlatformID == NULL)
|
||||
{
|
||||
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
|
||||
*clSelectedPlatformID = clPlatformIDs[0];
|
||||
}
|
||||
|
||||
free(clPlatformIDs);
|
||||
}
|
||||
}
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevName(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, "%s\n", device_string);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevInfo(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
bool nv_device_attibute_query = false;
|
||||
|
||||
// CL_DEVICE_NAME
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VENDOR
|
||||
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DRIVER_VERSION
|
||||
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VERSION
|
||||
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
|
||||
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
|
||||
{
|
||||
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
|
||||
// This constant isn't #defined in 1.0
|
||||
#ifndef CL_DEVICE_OPENCL_C_VERSION
|
||||
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
|
||||
#endif
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
|
||||
}
|
||||
|
||||
// CL_DEVICE_TYPE
|
||||
cl_device_type type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
|
||||
if( type & CL_DEVICE_TYPE_CPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
|
||||
if( type & CL_DEVICE_TYPE_GPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
|
||||
if( type & CL_DEVICE_TYPE_ACCELERATOR )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
|
||||
if( type & CL_DEVICE_TYPE_DEFAULT )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
|
||||
size_t workitem_dims;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_SIZES
|
||||
size_t workitem_size[3];
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
||||
size_t workgroup_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
|
||||
|
||||
// CL_DEVICE_ADDRESS_BITS
|
||||
cl_uint addr_bits;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
|
||||
|
||||
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
|
||||
cl_ulong max_mem_alloc_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_GLOBAL_MEM_SIZE
|
||||
cl_ulong mem_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
|
||||
cl_bool error_correction_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_TYPE
|
||||
cl_device_local_mem_type local_mem_type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_QUEUE_PROPERTIES
|
||||
cl_command_queue_properties queue_properties;
|
||||
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
|
||||
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
|
||||
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
|
||||
|
||||
// CL_DEVICE_IMAGE_SUPPORT
|
||||
cl_bool image_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
|
||||
|
||||
// CL_DEVICE_MAX_READ_IMAGE_ARGS
|
||||
cl_uint max_read_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
|
||||
|
||||
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
|
||||
cl_uint max_write_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
|
||||
|
||||
// CL_DEVICE_SINGLE_FP_CONFIG
|
||||
cl_device_fp_config fp_config;
|
||||
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
|
||||
fp_config & CL_FP_DENORM ? "denorms " : "",
|
||||
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
|
||||
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
|
||||
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
|
||||
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
|
||||
fp_config & CL_FP_FMA ? "fma " : "");
|
||||
|
||||
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
|
||||
size_t szMaxDims[5];
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
|
||||
|
||||
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
|
||||
if (device_string != 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(device_string);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
nv_device_attibute_query = true;
|
||||
|
||||
if (szOldPos > 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\t\t");
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
|
||||
}
|
||||
|
||||
if(nv_device_attibute_query)
|
||||
{
|
||||
cl_uint compute_capability_major, compute_capability_minor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
|
||||
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
|
||||
|
||||
cl_uint regs_per_block;
|
||||
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), ®s_per_block, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
|
||||
|
||||
cl_uint warp_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
|
||||
|
||||
cl_bool gpu_overlap;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool exec_timeout;
|
||||
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool integrated_memory;
|
||||
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
}
|
||||
|
||||
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
|
||||
cl_uint vec_width [6];
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
|
||||
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
|
||||
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
int oclGetDevCap(cl_device_id device)
|
||||
{
|
||||
char cDevString[1024];
|
||||
bool bDevAttributeQuery = false;
|
||||
int iDevArch = -1;
|
||||
|
||||
// Get device extensions, and if any then search for cl_nv_device_attribute_query
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
|
||||
if (cDevString != 0)
|
||||
{
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(cDevString);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
{
|
||||
bDevAttributeQuery = true;
|
||||
}
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
}
|
||||
|
||||
// if search succeeded, get device caps
|
||||
if(bDevAttributeQuery)
|
||||
{
|
||||
cl_int iComputeCapMajor, iComputeCapMinor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
|
||||
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
|
||||
}
|
||||
|
||||
return iDevArch;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id first = cdDevices[0];
|
||||
free(cdDevices);
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id max_flops_device = cdDevices[0];
|
||||
int max_flops = 0;
|
||||
|
||||
size_t current_device = 0;
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
max_flops = compute_units * clock_frequency;
|
||||
++current_device;
|
||||
|
||||
while( current_device < device_count )
|
||||
{
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
int flops = compute_units * clock_frequency;
|
||||
if( flops > max_flops )
|
||||
{
|
||||
max_flops = flops;
|
||||
max_flops_device = cdDevices[current_device];
|
||||
}
|
||||
++current_device;
|
||||
}
|
||||
|
||||
free(cdDevices);
|
||||
|
||||
return max_flops_device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
|
||||
{
|
||||
// locals
|
||||
FILE* pFileStream = NULL;
|
||||
size_t szSourceLength;
|
||||
|
||||
// open the OpenCL source code file
|
||||
#ifdef _WIN32 // Windows version
|
||||
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#else // Linux version
|
||||
pFileStream = fopen(cFilename, "rb");
|
||||
if(pFileStream == 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
size_t szPreambleLength = strlen(cPreamble);
|
||||
|
||||
// get the length of the source code
|
||||
fseek(pFileStream, 0, SEEK_END);
|
||||
szSourceLength = ftell(pFileStream);
|
||||
fseek(pFileStream, 0, SEEK_SET);
|
||||
|
||||
// allocate a buffer for the source code string and read it in
|
||||
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
|
||||
memcpy(cSourceString, cPreamble, szPreambleLength);
|
||||
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
|
||||
{
|
||||
fclose(pFileStream);
|
||||
free(cSourceString);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// close the file and return the total length of the combined (preamble + source) string
|
||||
fclose(pFileStream);
|
||||
if(szFinalLength != 0)
|
||||
{
|
||||
*szFinalLength = szSourceLength + szPreambleLength;
|
||||
}
|
||||
cSourceString[szSourceLength + szPreambleLength] = '\0';
|
||||
|
||||
return cSourceString;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
|
||||
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
|
||||
return (cl_device_id)-1;
|
||||
}
|
||||
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id device = cdDevices[nr];
|
||||
free(cdDevices);
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
|
||||
{
|
||||
// Grab the number of devices associated witht the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
ptx_code[i]= (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
|
||||
|
||||
// If it is associated prepare the result
|
||||
if( idx < num_devices )
|
||||
{
|
||||
*binary = ptx_code[idx];
|
||||
*length = binary_sizes[idx];
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free( devices );
|
||||
free( binary_sizes );
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
if( i != idx ) free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
|
||||
{
|
||||
// Grab the number of devices associated with the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i)
|
||||
{
|
||||
ptx_code[i] = (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while((idx < num_devices) && (devices[idx] != cdDevice))
|
||||
{
|
||||
++idx;
|
||||
}
|
||||
|
||||
// If the index is associated, log the result
|
||||
if(idx < num_devices)
|
||||
{
|
||||
|
||||
// if a separate filename is supplied, dump ptx there
|
||||
if (NULL != cPtxFileName)
|
||||
{
|
||||
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
|
||||
FILE* pFileStream = NULL;
|
||||
#ifdef _WIN32
|
||||
fopen_s(&pFileStream, cPtxFileName, "wb");
|
||||
#else
|
||||
pFileStream = fopen(cPtxFileName, "wb");
|
||||
#endif
|
||||
|
||||
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
|
||||
fclose(pFileStream);
|
||||
}
|
||||
else // log to logfile and console if no ptx file specified
|
||||
{
|
||||
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free(devices);
|
||||
free(binary_sizes);
|
||||
for(unsigned int i = 0; i < num_devices; ++i)
|
||||
{
|
||||
free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
|
||||
{
|
||||
// write out the build log and ptx, then exit
|
||||
char cBuildLog[10240];
|
||||
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
|
||||
sizeof(cBuildLog), cBuildLog, NULL );
|
||||
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
|
||||
}
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < iNumObjs; i++)
|
||||
{
|
||||
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
const char* oclErrorString(cl_int error)
|
||||
{
|
||||
static const char* errorString[] = {
|
||||
"CL_SUCCESS",
|
||||
"CL_DEVICE_NOT_FOUND",
|
||||
"CL_DEVICE_NOT_AVAILABLE",
|
||||
"CL_COMPILER_NOT_AVAILABLE",
|
||||
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||
"CL_OUT_OF_RESOURCES",
|
||||
"CL_OUT_OF_HOST_MEMORY",
|
||||
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||
"CL_MEM_COPY_OVERLAP",
|
||||
"CL_IMAGE_FORMAT_MISMATCH",
|
||||
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||
"CL_BUILD_PROGRAM_FAILURE",
|
||||
"CL_MAP_FAILURE",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"CL_INVALID_VALUE",
|
||||
"CL_INVALID_DEVICE_TYPE",
|
||||
"CL_INVALID_PLATFORM",
|
||||
"CL_INVALID_DEVICE",
|
||||
"CL_INVALID_CONTEXT",
|
||||
"CL_INVALID_QUEUE_PROPERTIES",
|
||||
"CL_INVALID_COMMAND_QUEUE",
|
||||
"CL_INVALID_HOST_PTR",
|
||||
"CL_INVALID_MEM_OBJECT",
|
||||
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||
"CL_INVALID_IMAGE_SIZE",
|
||||
"CL_INVALID_SAMPLER",
|
||||
"CL_INVALID_BINARY",
|
||||
"CL_INVALID_BUILD_OPTIONS",
|
||||
"CL_INVALID_PROGRAM",
|
||||
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||
"CL_INVALID_KERNEL_NAME",
|
||||
"CL_INVALID_KERNEL_DEFINITION",
|
||||
"CL_INVALID_KERNEL",
|
||||
"CL_INVALID_ARG_INDEX",
|
||||
"CL_INVALID_ARG_VALUE",
|
||||
"CL_INVALID_ARG_SIZE",
|
||||
"CL_INVALID_KERNEL_ARGS",
|
||||
"CL_INVALID_WORK_DIMENSION",
|
||||
"CL_INVALID_WORK_GROUP_SIZE",
|
||||
"CL_INVALID_WORK_ITEM_SIZE",
|
||||
"CL_INVALID_GLOBAL_OFFSET",
|
||||
"CL_INVALID_EVENT_WAIT_LIST",
|
||||
"CL_INVALID_EVENT",
|
||||
"CL_INVALID_OPERATION",
|
||||
"CL_INVALID_GL_OBJECT",
|
||||
"CL_INVALID_BUFFER_SIZE",
|
||||
"CL_INVALID_MIP_LEVEL",
|
||||
"CL_INVALID_GLOBAL_WORK_SIZE",
|
||||
};
|
||||
|
||||
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||
|
||||
const int index = -error;
|
||||
|
||||
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
const char* oclImageFormatString(cl_uint uiImageFormat)
|
||||
{
|
||||
// cl_channel_order
|
||||
if (uiImageFormat == CL_R)return "CL_R";
|
||||
if (uiImageFormat == CL_A)return "CL_A";
|
||||
if (uiImageFormat == CL_RG)return "CL_RG";
|
||||
if (uiImageFormat == CL_RA)return "CL_RA";
|
||||
if (uiImageFormat == CL_RGB)return "CL_RGB";
|
||||
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
|
||||
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
|
||||
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
|
||||
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
|
||||
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
|
||||
|
||||
// cl_channel_type
|
||||
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
|
||||
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
|
||||
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
|
||||
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
|
||||
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
|
||||
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
|
||||
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
|
||||
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
|
||||
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
|
||||
|
||||
// unknown constant
|
||||
return "Unknown";
|
||||
}
|
||||
198
tests/opencl/vectorhypot/oclUtils.h
Normal file
198
tests/opencl/vectorhypot/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utililties and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
// An error condition is defined by the sample/test value not equal to the reference
|
||||
if (iReference != iSample)
|
||||
{
|
||||
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||
iSample = (iSample == 0) ? -9999 : iSample;
|
||||
|
||||
// Log the error info
|
||||
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||
|
||||
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(iSample);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(iSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
238
tests/opencl/vectorhypot/shrQATest.h
Normal file
238
tests/opencl/vectorhypot/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
inline int findExeNameStart(const char *exec_name)
|
||||
{
|
||||
int exename_start = (int)strlen(exec_name);
|
||||
|
||||
while( (exename_start > 0) &&
|
||||
(exec_name[exename_start] != '\\') &&
|
||||
(exec_name[exename_start] != '/') )
|
||||
{
|
||||
exename_start--;
|
||||
}
|
||||
if (exec_name[exename_start] == '\\' ||
|
||||
exec_name[exename_start] == '/')
|
||||
{
|
||||
return exename_start+1;
|
||||
} else {
|
||||
return exename_start;
|
||||
}
|
||||
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
enum eQAstatus {
|
||||
QA_FAILED = 0,
|
||||
QA_PASSED = 1,
|
||||
QA_WAIVED = 2
|
||||
};
|
||||
|
||||
inline void __ExitInTime(int seconds)
|
||||
{
|
||||
fprintf(stdout, "> exiting in %d seconds: ", seconds);
|
||||
fflush(stdout);
|
||||
time_t t;
|
||||
int count;
|
||||
for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
|
||||
fprintf(stdout, "%d...", count);
|
||||
#ifdef WIN32
|
||||
Sleep(1000);
|
||||
#else
|
||||
sleep(1);
|
||||
#endif
|
||||
}
|
||||
fprintf(stdout,"done!\n\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
1954
tests/opencl/vectorhypot/shrUtils.cpp
Normal file
1954
tests/opencl/vectorhypot/shrUtils.cpp
Normal file
File diff suppressed because it is too large
Load Diff
642
tests/opencl/vectorhypot/shrUtils.h
Normal file
642
tests/opencl/vectorhypot/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
inline int ConvertSMVer2Cores(int major, int minor)
|
||||
{
|
||||
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
|
||||
typedef struct {
|
||||
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
|
||||
int Cores;
|
||||
} sSMtoCores;
|
||||
|
||||
sSMtoCores nGpuArchCoresPerSM[] =
|
||||
{ { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
|
||||
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
|
||||
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
|
||||
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
|
||||
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
|
||||
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
|
||||
{ 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
|
||||
{ -1, -1 }
|
||||
};
|
||||
|
||||
int index = 0;
|
||||
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||||
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
|
||||
return nGpuArchCoresPerSM[index].Cores;
|
||||
}
|
||||
index++;
|
||||
}
|
||||
printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
|
||||
return -1;
|
||||
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
enum LOGMODES
|
||||
{
|
||||
LOGCONSOLE = 1, // bit to signal "log to console"
|
||||
LOGFILE = 2, // bit to signal "log to file"
|
||||
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
|
||||
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
|
||||
MASTER = 8, // bit to signal master .csv log output
|
||||
ERRORMSG = 16, // bit to signal "pre-pend Error"
|
||||
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
|
||||
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
enum shrBOOL
|
||||
{
|
||||
shrFALSE = 0,
|
||||
shrTRUE = 1
|
||||
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
#define MAX(a, b) ((a > b) ? a : b)
|
||||
#define MIN(a, b) ((a < b) ? a : b)
|
||||
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
|
||||
#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaTime(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are seperated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list those element are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integers with a tolernance for # of byte errors
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays witha n epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param data filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param data filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user