Index: Make.atlas =================================================================== RCS file: Make.atlas diff -N Make.atlas --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.atlas 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,66 @@ +################################################################## +# (C) Copyright IBM Corporation 2008 +# +################################################################## + +# Platform + +ARCH := atlas + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR := mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package -- Atlas + +LAdir := /usr/local/atlas +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.qs22 =================================================================== RCS file: Make.qs22 diff -N Make.qs22 --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.qs22 20 Aug 2008 03:57:53 -0000 1.7 @@ -0,0 +1,74 @@ +################################################################## +# (C) Copyright IBM Corporation 2008 +# +################################################################## + +# Platform + +ARCH := qs22 + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR 
:= mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a +ACLlib := $(TOPdir)/accel/lib/libhpl_accel_ppu.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package -- Atlas + +LAdir := /usr/local/atlas +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran + +# Cell SDK + +CSdir := /opt/cell/sdk/prototype +CSinc := -I$(CSdir)/usr/include +CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) $(ACLlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCNOOPT += -DHPL_CALL_ACCEL +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.qs22_sdkblas =================================================================== RCS file: Make.qs22_sdkblas diff -N Make.qs22_sdkblas --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.qs22_sdkblas 7 Aug 2008 13:07:08 -0000 1.4 @@ -0,0 +1,78 @@ +################################################################## +# Licensed Materials - Property of IBM. +# (C) Copyright IBM Corporation 2007 +# All Rights Reserved. +# +# US Government Users Restricted Rights - +# Use, duplication or disclosure restricted by +# GSA ADP Schedule Contract with IBM Corporation. 
+ +################################################################## + +# Platform + +ARCH := qs22_sdkblas + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR := mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package + +LAdir := /usr +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib64 -lblas + +# Cell SDK + +CSdir := /opt/cell/sdk/prototype +CSinc := -I$(CSdir)/usr/include +CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.top =================================================================== RCS file: /cvsroot/hpl_qs22/Make.top,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- Make.top 10 Feb 2008 21:45:50 -0000 1.1 +++ Make.top 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # arch = UNKNOWN # @@ -51,6 +53,7 @@ ## build ############################################################### # build_src : + ( $(CD) src/accel/$(arch); $(MAKE) ) ( $(CD) src/auxil/$(arch); $(MAKE) ) ( $(CD) src/blas/$(arch); $(MAKE) ) ( $(CD) src/comm/$(arch); $(MAKE) ) @@ -78,6 +81,7 @@ - $(MKDIR) bin/$(arch) # startup_src : + - $(MAKE) -f Make.top leaf le=src/accel arch=$(arch) - $(MAKE) -f Make.top leaf le=src/auxil arch=$(arch) - $(MAKE) -f Make.top leaf le=src/blas arch=$(arch) - $(MAKE) -f Make.top leaf le=src/comm arch=$(arch) @@ -98,6 +102,7 @@ ## refresh ############################################################# # refresh_src : + - $(CP) makes/Make.accel src/accel/$(arch)/Makefile - $(CP) makes/Make.auxil src/auxil/$(arch)/Makefile - $(CP) makes/Make.blas src/blas/$(arch)/Makefile - $(CP) makes/Make.comm src/comm/$(arch)/Makefile @@ -118,6 +123,7 @@ ## clean ############################################################### # clean_src : + - ( $(CD) src/accel/$(arch); $(MAKE) clean ) - ( $(CD) src/auxil/$(arch); $(MAKE) clean ) - ( $(CD) src/blas/$(arch); $(MAKE) clean ) - ( $(CD) src/comm/$(arch); $(MAKE) clean ) @@ -138,6 +144,7 @@ ## clean_arch ########################################################## # clean_arch_src : + - $(RM) -r src/accel/$(arch) - $(RM) -r src/auxil/$(arch) - $(RM) -r src/blas/$(arch) - $(RM) -r src/comm/$(arch) @@ -165,6 +172,7 @@ ## clean_guard ######################################################### # clean_guard_src : + - ( $(CD) src/accel/$(arch); $(RM) *.grd ) - ( $(CD) src/auxil/$(arch); $(RM) *.grd ) - ( $(CD) src/blas/$(arch); $(RM) *.grd ) - ( $(CD) src/comm/$(arch); $(RM) *.grd ) Index: Makefile =================================================================== RCS file: /cvsroot/hpl_qs22/Makefile,v retrieving revision 
1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- Makefile 10 Feb 2008 21:45:50 -0000 1.1 +++ Makefile 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,12 +43,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # # SHELL = /bin/sh # arch = UNKNOWN # +export TOPdir := $(shell pwd) +# ## Targets ############################################################# # all : install @@ -70,10 +74,12 @@ # build : $(MAKE) -f Make.top build_src arch=$(arch) + $(MAKE) -C accel arch=$(arch) $(MAKE) -f Make.top build_tst arch=$(arch) # clean : $(MAKE) -f Make.top clean_src arch=$(arch) + $(MAKE) -C accel clean arch=$(arch) $(MAKE) -f Make.top clean_tst arch=$(arch) # clean_arch : Index: accel/Makefile =================================================================== RCS file: accel/Makefile diff -N accel/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/Makefile 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,25 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# All Rights Reserved.
+# --------------------------------------------------------------- + +ifeq ($(arch),qs22) + +######################################################################## +# Target +######################################################################## + +DIRS = lib + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer + +else + +all clean : + +endif Index: accel/lib/Makefile =================================================================== RCS file: accel/lib/Makefile diff -N accel/lib/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/Makefile 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,39 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +######################################################################## +# Subdirectories +######################################################################## + +DIRS = spu + +######################################################################## +# Target +######################################################################## + +TARGET_PROCESSOR = ppu64 +LIBRARY = libhpl_accel_ppu.a + +#CC_OPT_LEVEL = -g + +CPPFLAGS = -DNDEBUG +#CPPFLAGS += -DACCEL_LITTLE_ENDIAN +CPPFLAGS += -DVALIDATE_4GB_CROSSING +CPPFLAGS += -DMATRIX_4GB_CROSSING +#CPPFLAGS += -DPANEL_4GB_CROSSING + +######################################################################## +# Local Defines +######################################################################## + +SYS_LIBS += -lspe2 -lpthread -lm + +IMPORTS = spu/hpl_accel_spu-embed64.o + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer Index: 
accel/lib/hpl_accel.h =================================================================== RCS file: accel/lib/hpl_accel.h diff -N accel/lib/hpl_accel.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel.h 20 Aug 2008 03:57:53 -0000 1.13 @@ -0,0 +1,758 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _HPL_ACCEL_H_ +#define _HPL_ACCEL_H_ + +#define M_SUB (64) /* Size of sub-blocks - M_SUB x M_SUB */ + +/* ---------------------------------------------------------------- */ +/* Inline functions for addressing matrix storage of various formats*/ +/* ---------------------------------------------------------------- */ + +/* The following inline functions compute an array index for the each + * of the supported formats - column ordered, row ordered, and blocked + * (column ordered blocks, whose blocks are row ordered). + * The inputs are the row (row), the column (col), the leading dimension + * (ld). + */ + +/* ld is the number of elements from column n to column n+1 + */ +static inline unsigned int INDEX_COL(unsigned int row, unsigned int col, unsigned int ld) { + return (col*ld + row); +} + +/* ld is the number of elements from row n to row n+1 + */ +static inline unsigned int INDEX_ROW(unsigned int row, unsigned int col, unsigned int ld) { + return (row*ld + col); +} + +/* ld is the number of elements from block column n to block column n+1. + * This can also be described as the number of elements between column + * n and column n+M_SUB + */ +static inline unsigned int INDEX_BLK(unsigned int row, unsigned int col, unsigned int ld) { + return ((col / M_SUB)*ld + INDEX_ROW( row, (col % M_SUB), M_SUB )); +} + + +/* NOTE 1: + * + * The following defines can be used to configure the code for handling + * 4GB crossings. 
They include: + * + * MATRIX_4GB_CROSSING If defined then all block ordered matrices can cross a 4GB + * address boundary. However, the crossing can only occur on a + * block boundary, never within a matrix block. In addition, + * the block leading dimension must be no larger than 2^28 - 1. + * If not defined, then a matrix can not cross a 4GB + * address boundary. + * + * PANEL_4GB_CROSSING If defined then all row or column order panels (this includes + * U panels, L panels, and row buffers) may cross at most 1 4GB + * address boundary, but only on a row/column boundary. In addition, + * the leading dimension must not exceed 2^28 - 1. + * If not defined, then a panel can not cross a 4GB address boundary. + * + * VALIDATE_4GB_CROSSING If defined, then include code to validate the specified + * boundary constraints. This define is intended for debug + * purposes only. + */ + +#ifdef __PPU__ + +#include <ppu_intrinsics.h> + +/* hpl_accel_byte_swap + * ------------------- + * Convert a double from little-endian format to big-endian format. This + * function is not optimal. Recommend using hpl_accel_byte_swap_load and + * hpl_accel_byte_swap_store instead. + */ +static inline double hpl_accel_byte_swap(double d) { +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } in, out; + + in.d = d; + out.ull = __ldbrx(&in.ull); + return (out.d); +#else + return (d); +#endif +} + + +/* hpl_accel_byte_swap_load + * ------------------------ + * Load a little endian byte ordered, double word value. + */ +static inline double hpl_accel_byte_swap_load(unsigned long long *ptr) +{ +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } x; + + x.ull = __ldbrx(ptr); + return (x.d); +#else + return (*((double *)ptr)); +#endif +} + + +/* hpl_accel_byte_swap_store + * ------------------------- + * Store a double word value in little endian byte ordering.
+ */ +static inline void hpl_accel_byte_swap_store(unsigned long long *ptr, double d) +{ +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } x; + + x.d = d; + __stdbrx(ptr, x.ull); +#else + *((double *)ptr) = d; +#endif +} + + +/* hpl_accel_init + * -------------- + * Initialize the HPL accelerator. If the accelerator is successfully + * initialized, then HPL_ACCEL_INIT_SUCCESS is returned, otherwise + * HPL_ACCEL_INIT_FAIL is returned. + */ + +#define HPL_ACCEL_INIT_SUCCESS 0 +#define HPL_ACCEL_INIT_FAIL -1 + +extern int hpl_accel_init(); + +/* hpl_accel_fini + * -------------- + * Finalize the HPL accelerator. If the accelerator successfully + * finishes , then HPL_ACCEL_FINI_SUCCESS is returned, otherwise + * HPL_ACCEL_FINI_FAIL is returned. + */ +#define HPL_ACCEL_FINI_SUCCESS 0 +#define HPL_ACCEL_FINI_FAIL -1 + +extern int hpl_accel_fini(); + + +/* hpl_accel_dgemm_CL_R_B_CL + * hpl_accel_dgemm_CL_B_B_CL + * ------------------------- + * Specialized accelerated DGEMM. The DGEMM computes: + * + * [c] -= [a]*[b] + * + * If a panel is specified, then the output in placed in [panel]: + * + * [panel] = [c] - [a]*[b]; + * + * m Number of rows in [a], [c], and [panel]. + * n Number of cols in [b], [c], and [panel]. + * k Number of cols in [a] and rows in [b]. + * a Column-ordered, little-endian, matrix of m rows and k columns. + * lda Leading dimension of matrix [a]. + * b Big endian matrix of k rows and n columns. This is either row ordered, + * in the case of hpl_accel_dgemm_CL_R_B_CL, or block formatted, in the + * hpl_accel_dgemm_CL_B_B_CL. + * ldb Leading dimension of matrix [b]. For a block formatted [b] matrix, + * this is the number of doubles to advance b from block column n to + * column n+1. + * c Block-formatted, big-endian, matrix of m rows and n columns. + * The block contents are row-ordered with the individual blocks + * that are column-ordered. Blocks are 64x64. + * ldc Leading block dimension of matrix [c]. 
The number of doubles to + * to advance c from block column n to column n+1. + * blk_row Starting block matrix row offset. This offset is applied only to the + * [c] matrix. + * blk_col Starting block matrix column offset. This offset is applied to the [c] + * matrix and [b] matrix when it is block formatted (i.e. for + * hpl_accel_dgemm_CL_B_B_CL. + * panel Column ordered, little endian DGEMM result matrix of m rows and n columns. + * If NULL, the result is returned in [c]. + * ldp Leading dimension of [panel]. If [panel] is NULL, this must be 0. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * b Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * panel Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * k Must be 128 to be accelerated. + * m Optimal if a multiple of 64. Integral multiples of 64 may be accelerated. + * n Optimal if a multiple of 64. Integral multiples of 64 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * panel Optimal if cacheline aligned. Accelerated if [panel] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + * ldp Optimal if a multiple of 16. Accelerated if ldp is even. + * blk_row Must be a multiple of M_SUB in order to be SPE accelerated. + * blk_col Must be a multiple of M_SUB in order to be SPE accelerated. 
+ */ + +extern void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_dgemm_C_C_C + * ------------------------- + * Specialized accelerated DGEMM. The DGEMM computes: + * + * [c] -= [a]*[b] + * + * m Number of rows in [a] and [c]. + * n Number of cols in [b] and [c]. + * k Number of cols in [a] and rows in [b]. + * a Column-ordered, big-endian, matrix of m rows and k columns. + * lda Leading dimension of matrix [a]. + * b Column-ordered, big endian matrix of k rows and n columns. + * ldb Leading dimension of matrix [b]. + * c Column-ordered, big-endian, matrix of m rows and n columns. + * ldc Leading block dimension of matrix [c]. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * k Must be a multiple of 4 and no bigger than 64 to be accelerated. + * m Optimal if a multiple of 16. Integral multiples of 8 may be accelerated. + * n Optimal if a multiple of 4. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. 
Accelerated if ldb is even. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + */ + +extern void hpl_accel_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +/* hpl_accel_dtrsm_CL_R_B + * ---------------------- + * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation + * + * [a]*[x] = [b] + * + * where a is unit lower triangle matrix. The solution is returned in [b] unless + * [c] is non-NULL, in which the solution is returned in [c]. + * + * m Number of rows in [b], number of column in [a]. + * n Number of columns in [b]. + * a Column-ordered, little-endian, unit lower triangle matrix of + * dimension lda rows by m columns. + * lda Leading dimension of matrix [a]. + * b Row-order, big-endian, matrix of m rows and n columns. On entry + * contains the right-hand side matrix and is overwritten by the + * solution matrix [x]. + * ldb Leading dimension of matrix [b]. + * c Block-formatted, big-endian, matrix. The block contents are + * row-ordered with the individual blocks that are column-ordered. + * Blocks are 64x64. If non-NULL, the solution is returned in the + * row of blocks in [c] instead of [b]. This must point to the start + * of a matrix block. + * ldc Leading block dimension of matrix [c]. The number of doubles to + * to advance c from block column to the next block column. If [c] + * is NULL, then ldc should also be 0. + * blk_row Starting [c] block matrix row offset. If [c] is NULL, then blk_row + * must also be 0. + * blk_col Starting [c] block matrix column offset. If [c] is NULL, then blk_col + * must also be 0. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). 
+ * b Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * m Must be 128 to be accelerated. + * n Optimal if a multiple of 16. Integral multiples of 16 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. Memory throughput + * is maximized if ldb is NOT an integral multiple of 256. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current + * implementation restriction. + */ + +extern void hpl_accel_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + + +/* hpl_accel_dtrsm_CL_B + * -------------------- + * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation + * + * [a]*[x] = [b] + * + * where a is unit lower triangle matrix. The solution is returned in [b]. + * + * m Number of rows in [b], number of column in [a]. + * n Number of columns in [b]. + * a Column-ordered, little-endian, unit lower triangle matrix of + * dimension lda rows by m columns. + * lda Leading dimension of matrix [a]. + * b Block-formatted, big-endian, matrix of m rows and n columns. + * The block contents are row-ordered with the individual blocks + * that are column-ordered. Blocks are 64x64. On entry contains + * the right-hand side matrix and is overwritten by the + * solution matrix [x]. This must point to the start + * of a matrix block. + * ldb Leading dimension of matrix [b]. 
The number of doubles to + * to advance b from block column to the next block column. + * blk_row Starting [b] block matrix row offset. + * blk_col Starting [b] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * b Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * m Must be 128 to be accelerated. + * n Optimal if a multiple of 16. Integral multiples of 16 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. + * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current + * implementation restriction. + */ + +extern void hpl_accel_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_CL_to_B + * ------------------------------- + * Copy and reformat the L panel from the panel buffer pointed to by panel into matrix [a]. + * The input L panel is assumed to be column-order, little endian with a leading dimension of ldp. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks + * are assumed to be column ordered. + * + * m Number of rows of panel to copy to [a] + * n Number of columns of panel to copy to [a] + * a Block formatted matrix. a points to the location with [a] to receive the + * data being copied and reformatted from panel + * lda Leading dimension of matrix [a]. 
This contains the number of doubles to + * advance a from block column n to column n+1. + * panel Pointer to the L panel containing the data to be reformatted and copied to + * matrix [a]. The [panel] is column-ordered, little endian. + * ldp Leading dimension of the panel. This is the number of doubles between + * column n and column n+1 + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * + */ + +void hpl_ref_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_reform_matrix_CL_to_B + * ------------------------------- + * In-place reformat the matrix [a] from column-ordered, little-endian to blocked, big-endian format. The blocked + * format is 64x64, row-ordered blocks with the blocks being column ordered. The pad between the columns of + * blocks is zero filled. + * + * m Number of rows in [a]. If m is not a multiple of 64, then the additional rows needed + * to pad [a] to a multiple of 64 rows are zero'd. + * n Number of cols in [a]. + * a Column-ordered, little-endian, matrix of m rows and n columns. + * lda Leading dimension of matrix [a]. + * scratch Scratch buffer used to assist the reformatting of [a]. The scratch buffer + * must be at least 64*roundup(m,64) elements. + * size The size (number of elements) of the scratch buffer. The scratch buffer + * must be at least approximately 64*m elements. In general, better performance + * is achieved if the scratch buffer is larger and more SPEs can be deployed + * to the problem. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed.
+ * + * FUNCTIONAL RESTRICTIONS: + * n Must be an integral multiple of 64. + * a Must be quadword aligned and buffer may not straddle 4GB boundary (See Note 1). + * lda Must be even and at least roundup(m,64). + * scratch Must be quadword aligned and must not straddle 4GB boundary. + * size Must be at least 64*roundup(m,64). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * scratch Must be cacheline aligned. + * size Must be at least 4*64*m for optimal performance. + * + * Note: For 4GB crossing support, the matrix a is considered to be a block "matrix". + */ +extern void hpl_accel_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_B_to_CL + * ------------------------------- + * Copy and reformat the L panel from matrix [a] into the panel buffer pointed to by panel. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks + * are assumed to be column ordered. The output L panel is assumed to be column-order, little endian + * with a leading dimension of ldp. + * + * m Number of rows of [a] to copy to panel + * n Number of columns of [a] to copy to panel + * panel Pointer to the L panel extracted and reformatted from matrix [a]. The + * [panel] is column-ordered, little-endian. + * ldp Leading dimension of the panel. + * a Block formatted matrix. a points to the start of the panel to be reformatted + * and copied into [panel]. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * m Must be a multiple of 64.
+ * panel Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1). + * ldp Must be even and at least m. + * a Must be quadword aligned and may not straddle a 4GB boundary (See Note 1). + * lda Must be even and at least m*M_SUB. + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16. + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + */ +extern void hpl_accel_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_R_to_B + * ------------------------------- + * Copy and reformat a U panel from a row buffer pointed to by panel into matrix [a]. + * The input U panel is assumed to be row-order, big endian with a leading dimension of ldp. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. + * The blocks are assumed to be column ordered. + * + * m Number of rows of panel to copy to [a] + * n Number of columns of panel to copy to [a] + * a Block formatted matrix. a points to the location within [a] to receive the + * data being copied and reformatted from panel + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * panel Pointer to the U panel containing the data to be reformatted and copied to + * matrix [a]. The [panel] is row-ordered, big-endian. + * ldp Leading dimension of the panel. This is the number of doubles between + * row n and row n+1 + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * m None + * panel Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1). + * ldp Must be even and at least n.
+ * a Must be quadword aligned and may not straddle a 4GB boundary (See Note 1). + * lda Must be even and at least m*M_SUB. + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16. + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + */ +extern void hpl_accel_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_reform_rows_R_to_B + * hpl_accel_reform_rows_B_to_R + * ---------------------------- + * Copy and reformat a set of rows between row ordered and block ordered formats. + * hpl_accel_reform_rows_R_to_B reformats rows into blocks and hpl_accel_reform_rows_B_to_R + * reformats blocks into rows. These functions are expected to be used to gather/scatter winners + * and losers when pivoting so that rows are coalesced into large DMAs for efficient transfer. + * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, + * row ordered elements. The blocks themselves are column ordered. + * + * m Number of rows to copy. Specifies the number of entries in the blk_rows array. + * n Number of values (doubles) per row to copy. + * rows Pointer to the data rows to be reformatted and copied to/from matrix [a]. + * ldr Leading dimension of the row buffer. This is the number of doubles between + * rows of the [rows] buffer. + * a Block formatted matrix. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * blk_rows Array of row indices. blk_rows specifies starting [a] block matrix row offset + * for each of the m rows. + * blk_col Starting [a] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. 
+ * + * FUNCTIONAL RESTRICTIONS: + * rows Buffer must not straddle 4GB boundary. + * a Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * rows Optimal if cacheline aligned. Accelerated if [rows] is quadword aligned. + * ldr Optimal if a multiple of 16. Accelerated if ldr is even. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even. + */ + +extern void hpl_accel_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_accel_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +/* hpl_accel_swap_rows_B_to_B + * ---------------------------- + * Swap a set of rows in block ordered format. + * hpl_accel_swap_rows_B_to_B swaps a set of rows pairwise in a block-formatted matrix. + * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, + * row ordered elements. The blocks themselves are column ordered. + * + * m Number of rows to swap. Specifies the number of entries in the blk_rows array. + * n Number of values (doubles) per row to copy. + * a Block formatted matrix. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * blk_rows Array of row indices. blk_rows specifies starting [a] block matrix row offset + * for each of the m rows. + * blk_col Starting [a] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. 
+ * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even. + */ + +extern void hpl_accel_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +/* hpl_accel_copy_rows_R_to_R + * ---------------------------- + * Copy a set of rows in row ordered format. + * hpl_accel_copy_rows_R_to_R copies a set of rows from row-oriented matrix a to + * row-oriented matrix b. + * No endian swapping is performed on the data. + * + * m Number of rows to copy. Specifies the number of entries in the blk_rows array. + * n Number of values (doubles) per row to copy. + * a Pointer to the source data rows to be copied to row-ordered matrix b. + * lda Leading dimension of the row-ordered source matrix a. + * b Pointer to the row-ordered destination matrix. + * ldb Leading dimension of the row-ordered destination matrix b. + * rows Array of row indices. rows specifies the destination row address in row-ordered + * matrix b to receive source row from matrix a. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + */ + +extern void hpl_accel_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete); + +/* REFERENCE FUNCTIONS. + * + * These functions are non-accelerated implementations that run on the PPU. 
+ * + * They may not place the same functional and performance restrictions as the + * SPU accelerated functions. + */ + +extern int hpl_ref_init(); + +extern void hpl_ref_dgemm_CL_R_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_B_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_R(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete); + +extern void hpl_ref_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete); + +extern void hpl_ref_reform_panel_R_to_B(int m, 
int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete); + +#endif /* __PPU__ */ + +#endif /* _HPL_ACCEL_H_ */ Index: accel/lib/hpl_accel_copy.c =================================================================== RCS file: accel/lib/hpl_accel_copy.c diff -N accel/lib/hpl_accel_copy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_copy.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ + +/* Copy [m] rows of [n] doubles from row-ordered [a] into the rows of row-ordered + * [b] selected by rows[]. When lda/ldb are even and a/b are quadword aligned, + * the leading (n & ~1) doubles of every row are queued for the SPEs in one or + * more command entries; any remaining columns are copied synchronously by the + * cleanup loop at the end of this function. + */ +void hpl_accel_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete) +{ + unsigned int non_aligned; + int n0 = 0; + + /* Nothing to copy: no command would be queued below, so nothing would ever + * clear [incomplete]. Report completion immediately instead. + */ + if (m <= 0) { + if (incomplete) *incomplete = 0; + return; + } + + non_aligned = (((unsigned int)(lda | ldb) & 1) | + (((unsigned int)((uintptr_t)a) | (uintptr_t)b) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_start, m_left, m_per_cmd; + unsigned int idx; + volatile hpl_accel_copy_rows_parms_t *parms; + int i; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_start = 0; + m_left = m; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. + */ + m_per_cmd = (int)(sizeof(parms->rows) / sizeof(int)); + + while (m_left > 0) { + + parms = (volatile hpl_accel_copy_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = (m_left < m_per_cmd) ? m_left : m_per_cmd; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + + parms->a = a + m_start * lda; + parms->b = b; + + /* Only the last command of the batch carries the completion pointer. + */ + parms->incomplete = (parms->m < m_left) ? NULL : incomplete; + + for (i=0; i<parms->m; i++) parms->rows[i] = rows[m_start+i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_COPY_ROWS_R_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_start += parms->m; + m_left -= parms->m; + } + + /* Publish the advanced queue index so that the next accelerator request + * does not overwrite the command entries queued above. + */ + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above.
+ */ + if (n0 < n) { + unsigned int y1, y2, x; + double *src, *dst; + + /* For each of the rows */ + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = rows[y1]; /* New location for row y1 */ + src = a + (y1 * lda); + dst = b + (y2 * ldb); + for (x=n0; x<(unsigned int)n; x++) { + dst[x] = src[x]; + } + } + } +} + Index: accel/lib/hpl_accel_dgemm.c =================================================================== RCS file: accel/lib/hpl_accel_dgemm.c diff -N accel/lib/hpl_accel_dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_dgemm.c 20 Aug 2008 03:57:53 -0000 1.12 @@ -0,0 +1,495 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + +#include + +static void _dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp) +{ + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val, b_val, *p; + + if (panel) { + /* Write the result into the panel buffer. We first perform the compution, + * placing the result into [panel]. Then byte swap panel. 
+ */ + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_ROW(0,x,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val; + } + } + + a += lda; + for (i=1; i<(unsigned int)k; i++, a+=lda) { + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_ROW(i,x,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ -= a_val * b_val; + } + } + } +#ifdef ACCEL_LITTLE_ENDIAN + /* Byte swap panel buffer + */ + unsigned long long *p_ptr = (unsigned long long *)panel; + for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) { + for (y=0; y<(unsigned int)m; y++, p_ptr++) { + __stdbrx(p_ptr, *p_ptr); + } + } +#endif + } else { + /* Write the result into the c matrix. + */ + for (i=0; i<(unsigned int)k; i++, a+=lda) { + a_ptr = (unsigned long long *)a; + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + } +} + + +void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete) +{ + int n0; + int m0 = 0; + unsigned int cmd; + unsigned int idx; + unsigned int aligned, bc, br; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist. + * This specialist assumes: + * m is at least M_SUB + * n is at least M_SUB + * k is equal to M + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. 
A multiple of 16 for optimal DMA performance + * panel is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldp is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * blk_col is a multiple of M_SUB + * blk_row is a multiple of M_SUB + */ + bc = blk_col/M_SUB; + br = blk_row/M_SUB; + + c += (ldc * bc) + br*(M_SUB*M_SUB); + + blk_col %= M_SUB; + blk_row %= M_SUB; + + aligned = (blk_row | blk_col | + ((unsigned int)(lda | ldb | ldc | ldp) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1))); + + + if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + m0 = (m/M_SUB) * M_SUB; + n0 = (n/M_SUB) * M_SUB; + + /* Verify 4GB boundary expectation. 
+ */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_PANEL_4GB_CROSSING(b, k, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc); + VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0 / M_SUB; + parms->m = m0 / M_SUB; + parms->b_blk = 0; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + if (panel) { + parms->p = panel; + parms->ldp = ldp * sizeof(double); + cmd = HPL_ACCEL_CMD_DGEMM_PANEL; + } else { + cmd = HPL_ACCEL_CMD_DGEMM; + } + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES); + + /* Complete any remaining portion on the right side. That is when n is not a multiple + * of M_SUB. A NULL panel must stay NULL: offsetting it would hand the helper a + * bogus non-NULL pointer and make it write panel output instead of updating [c]. + */ + if (n0 < n) { + _dgemm_CL_R_B_CL(m0, n-n0, k, a, lda, b+n0, ldb, c, ldc, blk_row, blk_col+n0, ((panel) ? panel + n0*ldp : panel), ldp); + } + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + if (m0 < m) { + _dgemm_CL_R_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ?
panel + m0 : panel), ldp); + } +} + + + +static void _dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp) +{ + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val, b_val, *p; + + if (panel) { + /* Write the result into the panel buffer. We first perform the compution, + * placing the result into [panel]. Then byte swap panel. + */ + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_BLK(0,x+blk_col,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val; + } + } + + a += lda; + for (i=1; i<(unsigned int)k; i++, a+=lda) { + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_BLK(i,x+blk_col,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ -= a_val * b_val; + } + } + } +#ifdef ACCEL_LITTLE_ENDIAN + /* Byte swap panel buffer + */ + unsigned long long *p_ptr = (unsigned long long *)panel; + for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) { + for (y=0; y<(unsigned int)m; y++, p_ptr++) { + __stdbrx(p_ptr, *p_ptr); + } + } +#endif + } else { + /* Write the result into the c matrix. 
+ */ + for (i=0; i<(unsigned int)k; i++, a+=lda) { + a_ptr = (unsigned long long *)a; + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + } +} + + +void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete) +{ + int n0; + int m0 = 0; + unsigned int cmd; + unsigned int idx; + unsigned int aligned, bc, br; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist. + * This specialist assumes: + * m is at least M_SUB + * n is at least M_SUB + * k is equal to M + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * panel is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldp is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. 
+ * blk_col is a multiple of M_SUB + * blk_row is a multiple of M_SUB + */ + + bc = blk_col/M_SUB; + br = blk_row/M_SUB; + + c += (ldc * bc) + br*(M_SUB*M_SUB); + b += (ldb * bc); + + blk_col %= M_SUB; + blk_row %= M_SUB; + + aligned = (blk_row | blk_col | + ((unsigned int)(lda | ldb | ldc | ldp) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1))); + + + if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + m0 = (m/M_SUB) * M_SUB; + n0 = (n/M_SUB) * M_SUB; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_MATRIX_4GB_CROSSING(b, k, n0, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc); + VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0 / M_SUB; + parms->m = m0 / M_SUB; + parms->b_blk = -1; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + if (panel) { + parms->p = panel; + parms->ldp = ldp * sizeof(double); + cmd = HPL_ACCEL_CMD_DGEMM_PANEL; + } else { + cmd = HPL_ACCEL_CMD_DGEMM; + } + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES); + + /* Complete any remain portion on the right side. 
That is when n is not a multiple + * of M_SUB. A NULL panel must stay NULL: offsetting it would hand the helper a + * bogus non-NULL pointer and make it write panel output instead of updating [c]. + */ + if (n0 < n) { + _dgemm_CL_B_B_CL(m0, n-n0, k, a, lda, b, ldb, c, ldc, blk_row, blk_col+n0, ((panel) ? panel + n0*ldp : panel), ldp); + } + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + if (m0 < m) { + _dgemm_CL_B_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp); + } +} + + +void _dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = a[INDEX_COL(y,i,lda)]; + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_COL(y,x,ldc)] -= a_val * b[INDEX_COL(i,x,ldb)]; + } + } + } +} + +void hpl_accel_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + int m0 = 0; + int spes; + unsigned int cmd, idx, aligned; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the column-ordered dgemm SPU specialist. + * This specialist assumes: + * k is a multiple of 4 and less than or equal to 64 + * m is a multiple of 8 + * n is a multiple of 4 + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance.
+ */ + + aligned = (((unsigned int)(lda | ldb | ldc) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1))); + + if ((m >= 8) && (k <= 64) && (((k & (4-1)) | (n & (4-1))) == 0) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + m0 = (m/8) * 8; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_PANEL_4GB_CROSSING(c, n, ldc); + + idx = hpl_accel_cmd_idx; + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n; + parms->m = m0; + parms->k = k; + parms->incomplete = incomplete; + + /* Compute the number of SPES to deploy. Each SPE will need to compute + * at least one M_SUB high block. + */ + spes = (m + (M_SUB-1)) / M_SUB; + if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + cmd = HPL_ACCEL_CMD_DGEMM_C_C_C; + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, spes); + + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. 
+ */ + if (m0 < m) { + _dgemm_C_C_C(m-m0, n, k, a+m0, lda, b, ldb, c+m0, ldc); + } +} Index: accel/lib/hpl_accel_dtrsm.c =================================================================== RCS file: accel/lib/hpl_accel_dtrsm.c diff -N accel/lib/hpl_accel_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,250 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + +#include + + + +void hpl_accel_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + int spes; + int spans; + int n0 = 0; + unsigned int idx; + unsigned int aligned; + unsigned int cmd; + volatile hpl_accel_dtrsm_parms_t *parms; + + /* Do as much of the dtrsm as possible using the dtrsm SPU specialist. + * This specialist assumes: + * m is at 128. + * n is a multiple of 16. + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. 
+ */ + /* [c] is optional: a NULL [c] selects HPL_ACCEL_CMD_DTRSM below and is also + * tested by the cleanup code, so it must remain NULL. Only offset it to the + * target block when the caller actually supplied a matrix. + */ + if (c) c += (blk_row * M_SUB) + ldc*(blk_col / M_SUB); + blk_col %= M_SUB; + + aligned = (((unsigned int)(lda | ldb | ldc) & 1) | (blk_col & 15) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1))); + + + if ((m == M) && (n > 15) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + spans = n/16; + n0 = spans * 16; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_PANEL_4GB_CROSSING(b, m, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m, n0, ldc); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M); + + /* One SPE per 16-column span, capped at the number of SPEs available. + */ + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + cmd = (c == NULL) ? HPL_ACCEL_CMD_DTRSM : HPL_ACCEL_CMD_DTRSM_PANEL; + + send_cmd_to_spes(cmd, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above.
+ */ + if (n0 < n) { + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val; + double *b_next; + + a_ptr = (unsigned long long *)a; + if (c) { + /* Perform DTRSM cleanup into a block format matrix row. + */ + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(0, x+blk_col, ldc)] = b[INDEX_ROW(0, x, ldb)]; + } + /* y == 1 */ + a_ptr++; + for (i=1; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)] - b[INDEX_ROW(0, x, ldb)] * a_val; + } + } + a_ptr += (lda - m); + + /* y > 1 + */ + for (y=2; y<(unsigned int)m; y++) { + a_ptr += y; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] -= c[INDEX_BLK(y-1, x+blk_col, ldc)] * a_val; + } + } + a_ptr += (lda - m); + } + } else { + /* Perform DTRSM cleanup into [b] + */ + for (y=1; y<(unsigned int)m; y++, b+=ldb) { + a_ptr += y; + b_next = b+ldb; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + b_next[x] -= b[x] * a_val; + } + b_next += ldb; + } + a_ptr += (lda - m); + } + } + } +} + + + +void hpl_accel_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + int spes; + int spans; + int n0 = 0; + unsigned int i, x, y; + unsigned int idx; + unsigned int aligned; + volatile hpl_accel_dtrsm_parms_t *parms; + + /* Do as much of the dtrsm as possible using the dtrsm SPU specialist. + * This specialist assumes: + * m is at 128. + * n is a multiple of 16. + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). 
A multiple of 16 for optimal DMA + * performance. + */ + b += (blk_row * M_SUB) + ldb*(blk_col / M_SUB); + blk_col %= M_SUB; + + aligned = (((unsigned int)(lda | ldb) & 1) | (blk_col & 15) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b) & (16-1))); + + + if ((m == M) && (n > 15) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + spans = n/16; + n0 = spans * 16; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_MATRIX_4GB_CROSSING(b, m, n0, ldb); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_DTRSM_CL_B, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. 
+ */ + for (x=n0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i, x+blk_col, ldb)] -= b[INDEX_BLK(y-1, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } +} Index: accel/lib/hpl_accel_global.c =================================================================== RCS file: accel/lib/hpl_accel_global.c diff -N accel/lib/hpl_accel_global.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,19 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + + +/* SPE Thread Info + */ +int hpl_accel_initialized = 0; +hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; + + +/* SPE Command Queue + */ +unsigned int hpl_accel_cmd_idx = 0; +hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + Index: accel/lib/hpl_accel_global.h =================================================================== RCS file: accel/lib/hpl_accel_global.h diff -N accel/lib/hpl_accel_global.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel_spu.h" + +#ifndef _HPL_ACCEL_GLOBAL_H_ +#define _HPL_ACCEL_GLOBAL_H_ + +#define HPL_ACCEL_CMD_ENTRIES 8 /* number of command queue entries */ + + +typedef struct hpl_accel_thread_info { + spe_context_ptr_t id; + pthread_t pthread; + spe_spu_control_area_t *ctl_area; // pointer to control ps area + int in_cnt; // inbound mailbox available element count + struct hpl_accel_init_parms 
*init_parms; +} hpl_accel_thread_info_t; + + +typedef struct hpl_accel_cmd_entry { + unsigned char parms[128] __attribute__ ((aligned (128))); +} hpl_accel_cmd_entry_t; + + +extern int hpl_accel_initialized; +extern hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; +extern unsigned int hpl_accel_cmd_idx; +extern hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + +#endif /* _HPL_ACCEL_GLOBAL_H_ */ Index: accel/lib/hpl_accel_init.c =================================================================== RCS file: accel/lib/hpl_accel_init.c diff -N accel/lib/hpl_accel_init.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_init.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_global.h" +#include "hpl_accel_spu.h" + +static hpl_accel_init_parms_t init_parms[HPL_ACCEL_SPES]; + +static void *ppu_pthread_function(void *arg) { + hpl_accel_thread_info_t *info; + unsigned int entry = SPE_DEFAULT_ENTRY; + + info = (hpl_accel_thread_info_t *)arg; + + if (spe_context_run(info->id, &entry, 0, (void *)(info->init_parms), NULL, NULL) < 0) { + perror("Failed running context"); + exit (1); + } + pthread_exit(NULL); +} + +extern spe_program_handle_t hpl_accel_spu; + + +int hpl_accel_init() +{ + int i; + + if (!hpl_accel_initialized) { + + /* Create each of the SPU threads + */ + for (i=0; i +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ +void hpl_accel_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + hpl_ref_reform_panel_CL_to_B(m, n, a, lda, panel, ldp, incomplete); +} + + +void hpl_accel_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete) + +{ + unsigned int idx; + int spes; + int m_padded; + volatile hpl_accel_reform_matrix_CL_to_B_parms_t *parms; + + m_padded = ((m + M_SUB-1)/M_SUB)*M_SUB; + + /* Assert that the parameter restrictions are not violated. + * n Must be an integral multiple of 64. + * a Must be quadword aligned. + * lda Must be even and at least roundup(m,64). + * scratch Must be quadword aligned and must not straddle 4GB boundary. + * size Must be at least M_SUB*(roundup(m,M_SUB)-4), which is the bound asserted below. + */ + assert((n % M_SUB) == 0); + assert(lda >= m_padded); + assert(size >= (m_padded-4)*M_SUB); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * scratch Must be cacheline aligned. + * size Must be at least 4*64*m for optimal performance. + */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)scratch & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + + + /* Verify 4GB boundary expectation.
+ */ + VALIDATE_PANEL_4GB_CROSSING(scratch, 1, size); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda*M_SUB); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_matrix_CL_to_B_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Compute the number of SPEs to deploy: one per (m_padded-4)*M_SUB doubles + * of scratch space, capped at HPL_ACCEL_SPES. + */ + spes = size / ((m_padded-4) * M_SUB); + if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES; + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->scratch = scratch; + parms->lda = lda * sizeof(double); + parms->n = n; + parms->m = m; + parms->spes = spes; + parms->incomplete = incomplete; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B, idx, spes); +} + + + + + +void hpl_accel_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete) +{ + unsigned int idx; + volatile hpl_accel_reform_panel_parms_t *parms; + + /* Assert that the parameter restrictions are not violated. + * m Must be an integral multiple of 64. + * n Must be at least 1. + * panel Must be quadword aligned and buffer may not straddle 4GB boundary. + * ldp Must be at least m. + * a Must be quadword aligned and buffer may not straddle 4GB boundary. + * lda Must be at least m*M_SUB. + */ + assert((m % M_SUB) == 0); + assert(n > 0); + assert(ldp >= m); + assert(lda >= m*M_SUB); + + VALIDATE_PANEL_4GB_CROSSING(panel, n, ldp); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16.
+ */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + assert((ldp & 15) == 0); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->n = n; + parms->m = m; + parms->a = a; + parms->lda = lda * sizeof(double); + parms->panel = panel; + parms->ldp = ldp * sizeof(double); + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_REFORM_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL, idx, HPL_ACCEL_REFORM_SPES); +} + + +void hpl_accel_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int idx; + volatile hpl_accel_reform_panel_parms_t *parms; + + /* Assert that the parameter restrictions are not violated. + * panel Must be quadword aligned and buffer may not straddle 4GB boundary. + * ldp Must be even and at least n. + * a Must be quadword aligned and buffer may not straddle 4GB boundary. + * lda Must be at least m*M_SUB. + */ + assert((ldp & 1) == 0); + assert(ldp >= n); + assert(lda >= m*M_SUB); + + VALIDATE_PANEL_4GB_CROSSING(panel, m, ldp); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16.
+ */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + assert((ldp & 15) == 0); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer. The panel is validated above as m units of stride ldp, so the 4GB crossing count is clamped to m (not n). + */ + parms->n = n; + parms->m = m; + parms->a = a; + parms->lda = lda * sizeof(double); + parms->panel = panel; + parms->ldp = ldp * sizeof(double); + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, m); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B, idx, HPL_ACCEL_SPES); +} + + +void hpl_accel_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | + (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_left; + int rows_per_block; + int *blk_row_ptr; + double *rows_ptr; + unsigned int idx; + volatile hpl_accel_reform_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32)); + + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_left = m; + blk_row_ptr = blk_rows; + rows_ptr = rows; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single
command request. Only the final request carries the caller's completion pointer; earlier requests pass NULL. + */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = NULL; + for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_left -= rows_per_block; + rows_ptr += rows_per_block * ldr; + blk_row_ptr += rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = incomplete; + for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_row_ptr[i]; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above.
+ */ + if (n0 < n) { + unsigned int x, y, row; + int first_span, span, left; + double *src, *dst; + + blk_col += n0; + rows += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y=0; y<(unsigned int)m; y++) { + row = blk_rows[y]; + left = n; + dst = a + (row * M_SUB); + span = first_span; + left = n - first_span; + + /* For each of the destination buffer block spans + */ + src = rows; + + for (x=0; x<(unsigned int)span; x++) dst[x+blk_col] = src[x]; + while (left) { + dst += lda; + src += span; + span = (left > M_SUB) ? M_SUB : left; + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x]; + left -= span; + } + rows += ldr; + } + } +} + + +void hpl_accel_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | + (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_left; + int rows_per_block; + int *blk_row_ptr; + double *rows_ptr; + unsigned int idx; + volatile hpl_accel_reform_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32)); + + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_left = m; + blk_row_ptr = blk_rows; + rows_ptr = rows; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. 
+ */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = NULL; + for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. No completion pointer is attached to this intermediate request. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_left -= rows_per_block; + rows_ptr += rows_per_block * ldr; + blk_row_ptr += rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = incomplete; + for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_row_ptr[i]; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above.
+ */ + if (n0 < n) { + unsigned int x, y, row; + int first_span, span, left; + double *src, *dst; + + blk_col += n0; + rows += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y=0; y<(unsigned int)m; y++) { + row = (unsigned int)blk_rows[y]; + left = n; + src = a + (row * M_SUB); + span = first_span; + left = n - first_span; + + /* For each of the destination buffer block spans + */ + dst = rows; + + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x+(unsigned int)blk_col]; + while (left) { + src += lda; + dst += span; + span = (left > M_SUB) ? M_SUB : left; + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x]; + left -= span; + } + rows += ldr; + } + } +} + Index: accel/lib/hpl_accel_spu.h =================================================================== RCS file: accel/lib/hpl_accel_spu.h diff -N accel/lib/hpl_accel_spu.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_spu.h 20 Aug 2008 03:57:53 -0000 1.11 @@ -0,0 +1,417 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +/* This file contains definitions shared between the PPE and SPE + */ + +#ifndef _HPL_ACCEL_SPU_H_ +#define _HPL_ACCEL_SPU_H_ + +#include "hpl_accel.h" +#include + +#define SUB (2) /* Number of sub-blocks per block (1 dim)*/ +#define M (SUB*M_SUB) /* Size of the matrix block - M x M */ +#define SUB_SUB (SUB*SUB) /* The number of sub-blocks per block */ + + +/* SPE Commands + */ +#define HPL_ACCEL_CMD_DGEMM 0 +#define HPL_ACCEL_CMD_DTRSM 1 +#define HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B 2 +#define HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL 3 +#define HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B 4 +#define HPL_ACCEL_CMD_DGEMM_PANEL 5 +#define HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B 6 +#define HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R 7 
+#define HPL_ACCEL_CMD_FINI 8 +#define HPL_ACCEL_CMD_DTRSM_CL_B 9 +#define HPL_ACCEL_CMD_DTRSM_PANEL 10 +#define HPL_ACCEL_CMD_DGEMM_C_C_C 11 +#define HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B 12 +#define HPL_ACCEL_CMD_COPY_ROWS_R_TO_R 13 + + +#define HPL_ACCEL_CMD_MASK 0x7F + +#define HPL_ACCEL_SPES 8 /* # of SPEs to use per accelerator */ +#define HPL_ACCEL_REFORM_SPES 4 /* # of SPEs to use during some reformat */ +#define HPL_ACCEL_PARM_TAG 31 + +/* Function parameters */ + +#ifdef __SPU__ +#include + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + unsigned long long cmd_base __attribute__ ((aligned (16)));; +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + unsigned long long p __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, ldp */ + vec_uint4 dim; /* n, m, k, pad */ + vec_uint4 flags; /* b_blk, a_count, b_count, p_count */ + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + +typedef struct hpl_accel_dtrsm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, pad */ + vec_uint4 dim; /* n, m, a_count, b_count */ + vec_uint4 blk_col; + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int spes __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} 
hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a; + vector unsigned long long incomplete_blk_col; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + vector signed int m_n_lda_blk_col __attribute__ ((aligned (16))); + vector unsigned long long a_incomplete __attribute__ ((aligned (16))); + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct hpl_accel_copy_rows_parms { + vector signed int m_n_lda_ldb __attribute__ ((aligned (16))); + vector unsigned long long a_b __attribute__ ((aligned (16))); + vector unsigned long long incomplete_pad __attribute__ ((aligned (16))); + int rows[4*4]; +} hpl_accel_copy_rows_parms_t; + +#else + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + void *cmd_base __attribute__ ((aligned (16))); + void *signotify1[HPL_ACCEL_SPES] __attribute__ ((aligned (16))); +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + const double *a __attribute__ ((aligned (16))); + const double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + double *p __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int ldp; + int n __attribute__ ((aligned (16))); + int m; + int k; + int b_blk __attribute__ ((aligned (16))); + int a_count; + int b_count; + int p_count; + unsigned long long *incomplete 
__attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + + +typedef struct hpl_accel_dtrsm_parms { + const double *a __attribute__ ((aligned (16))); + double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int n __attribute__ ((aligned (16))); + int m; + int a_count; + int b_count; + unsigned int blk_col __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + double *a __attribute__ ((aligned (16))); + double *scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16)));; + int spes __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + double *a __attribute__ ((aligned (16))); + double *panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + int m, n, ldr, lda; + double *rows, *a; + unsigned long long *incomplete; + int blk_col, pad; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + int m, n, lda, blk_col; + double *a; + unsigned long long *incomplete; + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct hpl_accel_copy_rows_parms { + int m, n, lda, ldb; + double *a; + double *b; + unsigned long long *incomplete; + unsigned long long pad; + int rows[4*4]; +} hpl_accel_copy_rows_parms_t; + +#endif + + 
+/* Inline support functions. + */ +#ifdef __PPU__ + +#include +#include "hpl_accel_global.h" + + +/* init_incomplete + * --------------- + * Initialize the asynchronous completion notification variable according + * to the specified number of participants. The number of participants can + * be between 1 and 8 where each byte in the unsigned long long variable + * is a flag for each of the participants. The bytes are assigned as follows: + * + * msb lsb + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | SPE 0 | SPE 1 | SPE 2 | SPE 3 | SPE 4 | SPE 5 | SPE 6 | SPE 7 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Parameters: + * incomplete Pointer to the asynchronous completion variable. + * + * participants Number of participants that will participate in the + * command that need to acknowledge completion status. Values <= 0 store 0 and values > 8 are treated as 8, so the shift count always stays within 0..56. + * + */ +static inline void init_incomplete(unsigned long long *incomplete, int participants) +{ + if (incomplete) { + *incomplete = (participants > 0) ? (0xFFFFFFFFFFFFFFFFULL << (8*(8-((participants < 8) ? participants : 8)))) : 0ULL; + } +} + + +/* send_cmd_to_spes + * ---------------- + * Send the command with the index to the parameter buffer to the specified + * number of SPE participants. The command and index are combined into a + * 32-bit message that is placed in the inbound SPE mailbox. The 7 least + * significant bits of the message contain the command id. The 25 most + * significant bits is an offset from the cmd_base to the cacheline containing + * the command parameters. + * + * Parameters: + * idx Command buffer index that contains the parameters for + * this command. + * + * participants Number of participants that will participate in the command. + */ + +static inline void send_cmd_to_spes(unsigned int cmd, int idx, int participants) +{ + int i; + int cnt; + volatile spe_spu_control_area_t *ctl; + + /* Construct cmd message to be sent to each of the SPEs via the + * inbound mailbox.
+ */ + cmd |= (unsigned int)(idx * sizeof(struct hpl_accel_cmd_entry)); + + for (i=0; iSPU_Mbox_Stat >> 8) & 0xFF; + } + + /* Place the command into the inbound mailbox. + */ + ctl->SPU_In_Mbox = cmd; + hpl_accel_threads[i].in_cnt = cnt-1; + } +} +#endif + + +#ifdef PANEL_4GB_CROSSING +#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) { \ + int _cnt; \ + /* Calculate the number of rows/columns to the 4GB crossing and clamp \ + * the result to max. \ + */ \ + _cnt = (0x20000000 - ((unsigned int)(uintptr_t)_panel) / sizeof(double)) / _ld; \ + _count = (_cnt > _max) ? _max : _cnt; \ +} +#else /* !PANEL_4GB_CROSSING */ +#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) +#endif /* PANEL_4GB_CROSSING */ + +#define COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) \ + /* return the number of 4GB crossings in panel _p */ \ + (((uintptr_t)(_p + _n*_ld - 1) >> 32) - ((uintptr_t)_p >> 32)) + +#ifdef VALIDATE_4GB_CROSSING +#ifdef PANEL_4GB_CROSSING + +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) { \ + /* Verify that if the panel crosses a 4GB boundary. It does so only on a row \ + * boundary, and only once. \ + */ \ + if (_p) { \ + unsigned int _crossings; \ + unsigned int _bytes_til_crossing; \ + \ + _crossings = COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld); \ + switch (_crossings) { \ + case 0: \ + break; \ + case 1: \ + _bytes_til_crossing = ((uintptr_t)_p ^ (-1)) + 1; \ + if ((_bytes_til_crossing % (_ld * sizeof(double))) != 0) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary within a row/col. Parameters p=%p n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + break; \ + default: \ + fprintf(stderr, "%s %d - Panel crosses %d 4GB boundary. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _crossings, _p, _n, _ld); \ + abort(); \ + break; \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. 
Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} + +#else /* ! PANEL_4GB_CROSSING */ + +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) { \ + /* Verify that the panel does not cross a 4GB boundary */ \ + if (_p) { \ + if ( COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) != 0 ) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary unexpectedly. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} +#endif + +#ifdef MATRIX_4GB_CROSSING + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + int _i; \ + double *_start, *_end; \ + unsigned int _blks_per_col, _dbls_to_crossing; \ + \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + /* For each column of blocks */ \ + _blks_per_col = (_m + (M_SUB-1))/M_SUB; \ + for (_i=0, _start=(double *)_p; _i<_n; _i+=M_SUB) { \ + _end = _start + _ld; \ + if (((uintptr_t)(_end) >> 32) > ((uintptr_t)(_start) >> 32)) { \ + /* This column crosses a 4GB boundary. Check to see that it occurs only on a block boundary */ \ + _dbls_to_crossing = 0x20000000 - ((unsigned int)(uintptr_t)_start) / sizeof(double); \ + if (((M_SUB*M_SUB)*_blks_per_col > _dbls_to_crossing) && \ + ((_dbls_to_crossing % (M_SUB*M_SUB)) != 0)) { \ + fprintf(stderr, "%s %d - Matrix block straddles 4GB boundary. 
Parameters p=%p m=%d n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + } \ + _start = _end; \ + } \ + } \ +} + +#else /* !MATRIX_4GB_CROSSING */ + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + if ((((uintptr_t)_p) >> 32) != ((uintptr_t)(_p + _ld * ((_n+M_SUB-1)/M_SUB) + ((_m+M_SUB-1)/M_SUB)*M_SUB*M_SUB-1) >> 32)) { \ + fprintf(stderr, "%s %d - Matrix crosses 4GB boundary unexpectedly. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + } \ +} +#endif + +#else /* VALIDATE_4GB_CROSSING */ +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) +#endif /* VALIDATE_4GB_CROSSING */ + +#endif /* _HPL_ACCEL_SPU_H_ */ Index: accel/lib/hpl_accel_swap.c =================================================================== RCS file: accel/lib/hpl_accel_swap.c diff -N accel/lib/hpl_accel_swap.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_swap.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ + +void hpl_accel_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda) & 1) | + (((unsigned int)(uintptr_t)a) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_start, m_left; + int rows_per_block; + unsigned int idx; + volatile hpl_accel_swap_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_start = 0; + m_left = m; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. Each request's row indices are rebased by m_start to match its offset a pointer. + */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->a = a + INDEX_BLK(m_start,0,lda); + parms->incomplete = NULL; + + for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs.
+ */ + send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_start += rows_per_block; + m_left -= rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->a = a + INDEX_BLK(m_start,0,lda); + parms->incomplete = incomplete; + + for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above. + */ + if (n0 < n) { + unsigned int y1, y2, x; + int first_span, span, left; + double tmp, *src, *dst; + + blk_col += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + dst = a + (y1 * M_SUB); + src = a + (y2 * M_SUB); + for (x=0; x<(unsigned int)first_span; x++) + {tmp = dst[x+blk_col]; dst[x+blk_col] = src[x+blk_col]; src[x+blk_col] = tmp;} + left = n - first_span; + while (left) { + dst += lda; + src += lda; + span = (left > M_SUB) ?
M_SUB : left; + for (x=0; x<(unsigned int)span; x++) + {tmp = dst[x]; dst[x] = src[x]; src[x] = tmp;} + left -= span; + } + } + } + } +} + Index: accel/lib/hpl_ref.c =================================================================== RCS file: accel/lib/hpl_ref.c diff -N accel/lib/hpl_ref.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_ref.c 20 Aug 2008 03:57:53 -0000 1.11 @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" + +#include + +/* Reference back end initialisation: does no work and returns HPL_ACCEL_INIT_SUCCESS. */ +int hpl_ref_init() +{ + return HPL_ACCEL_INIT_SUCCESS; +} + +/* Reference c -= a*b for an m x k by k x n product: a is addressed with INDEX_COL and byte-swapped on read, b with INDEX_ROW, c with INDEX_BLK. Clears *incomplete if the pointer is non-NULL. */ +void hpl_ref_dgemm_CL_R_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + +/* Same product as hpl_ref_dgemm_CL_R_B, applied to the part of c at offset (blk_row, blk_col). With p non-NULL that part of c is copied into p (INDEX_COL), updated there and then byte-swapped, and c is only read; with p NULL, c is updated in place. */ +void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on p */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + /* Byte
swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + +/* Reference c -= a*b on the part of c at offset (blk_row, blk_col), with a addressed by INDEX_COL (byte-swapped on read) and b by INDEX_BLK at column x+blk_col. With p non-NULL the result is built in p (INDEX_COL) and byte-swapped, leaving c unmodified; otherwise c is updated in place. */ +void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on p */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + /* Byte swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + + +/* Reference c -= a*b with a addressed by INDEX_COL (byte-swapped on read) and both b and c addressed by INDEX_BLK. */ +void hpl_ref_dgemm_CL_B_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { +
for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_BLK(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + +/* Reference c -= a*b with a, b and c all addressed by INDEX_COL and no byte swapping. */ +extern void hpl_ref_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val, c_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = a[INDEX_COL(y,i,lda)]; + for (x=0; x<(unsigned int)n; x++) { + c_val = c[INDEX_COL(y,x,ldc)]; + c_val -= a_val * b[INDEX_COL(i,x,ldb)]; + c[INDEX_COL(y,x,ldc)] = c_val; + } + } + } + + if (incomplete) *incomplete = 0; +} + + +/* Forward substitution in place on each of the n columns of b (INDEX_ROW): b[i] -= b[y-1] * a[i][y-1] for every i >= y. There is no division by the diagonal, so a is presumably unit lower triangular (TODO confirm); a is byte-swapped on read. */ +void hpl_ref_dtrsm_CL_R(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + +/* Same substitution as hpl_ref_dtrsm_CL_R with b addressed by INDEX_BLK at offset (blk_row, blk_col). */ +void hpl_ref_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i+blk_row, x+blk_col, ldb)] -= b[INDEX_BLK(y-1+blk_row, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + +/* Same substitution as hpl_ref_dtrsm_CL_R: with c non-NULL each column of b (INDEX_ROW) is first copied into c (INDEX_BLK at blk_row, blk_col) and solved there, leaving b unmodified; with c NULL, b is solved in place. */ +void hpl_ref_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + if (c) { + for (x=0; x<(unsigned int)n; x++) { + + for
(i=0; i<(unsigned int)m; i++) c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)]; /* Copy the column of b into c */ + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] -= c[INDEX_BLK(y-1+blk_row, x+blk_col, ldc)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } else { + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } + if (incomplete) *incomplete = 0; +} + + + + +/* General purpose, reference, reformatting facilities. + */ +void hpl_ref_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = hpl_accel_byte_swap(panel[INDEX_COL(y,x,ldp)]); + } + } + + if (incomplete) *incomplete = 0; +} + +/* Reformats a in place one M_SUB-wide group of columns at a time: the group is gathered, byte-swapped, into scratch, copied back as M_SUB*m contiguous doubles, and the next M_SUB*(lda-m) doubles are zeroed. size is ignored, so scratch must already hold M_SUB*m doubles. */ +void hpl_ref_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, + int size __attribute__ ((unused)) , + unsigned long long *incomplete) + +{ + unsigned int i; + unsigned int x, y; + unsigned int col; + + /* Reformat the matrix [a] from column-order, little-endian to blocked, + * big-endian format.
+ */ + + /* For each column of blocks */ + for (col=0; col<(unsigned int)n; col+=M_SUB) { + /* Reformat the column of blocks into the scratch buffer */ + for (x=0; x<(unsigned int)M_SUB; x++) { + for (y=0; y<(unsigned int)m; y++) { + scratch[INDEX_ROW(y,x,M_SUB)] = hpl_accel_byte_swap(a[INDEX_COL(y,x,lda)]); + } + } + /* Copy the reformatted data back into a */ + memcpy(a, scratch, sizeof(double)*M_SUB*m); + + /* Zero the trailing block column of data */ + a += M_SUB*m; + for (i=0; i<(unsigned int)M_SUB*(lda-m); i++) *a++ = 0.0; + } + if (incomplete) *incomplete = 0; +} + + + +/* Writes the byte-swapped elements of a (INDEX_BLK) into panel (INDEX_COL), m rows by n columns. */ +void hpl_ref_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + panel[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(a[INDEX_BLK(y,x,lda)]); + } + } + if (incomplete) *incomplete = 0; +} + + +/* Copies panel (INDEX_ROW) into a (INDEX_BLK), m rows by n columns, without byte swapping. */ +void hpl_ref_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = panel[INDEX_ROW(y,x,ldp)]; + } + } + if (incomplete) *incomplete = 0; +} + +/* Scatters row y of rows (INDEX_ROW) into row blk_rows[y] of a (INDEX_BLK), n columns starting at blk_col. */ +void hpl_ref_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)] = rows[INDEX_ROW(y, x, ldr)]; + } + } + if (incomplete) *incomplete = 0; +} + + +/* Gathers row blk_rows[y] of a (INDEX_BLK), n columns starting at blk_col, into row y of rows (INDEX_ROW). */ +void hpl_ref_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) + +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + rows[INDEX_ROW(y, x, ldr)] = a[INDEX_BLK((unsigned
int)blk_rows[y], blk_col+x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} +/* Exchanges row y1 of a with row blk_rows[y1] (INDEX_BLK) over n columns starting at blk_col, for y1 = 0..m-1 in order; entries with blk_rows[y1] == y1 are skipped. */ +void hpl_ref_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + /* Swap rows y1 and y2 */ + for (x=0; x<(unsigned int)n; x++) { + double tmp = a[INDEX_BLK(y1, x+blk_col, lda)]; + a[INDEX_BLK(y1, x+blk_col, lda)] = a[INDEX_BLK(y2, x+blk_col, lda)]; + a[INDEX_BLK(y2, x+blk_col, lda)] = tmp; + } + } + } + if (incomplete) *incomplete = 0; +} +/* Copies row y1 of a to row rows[y1] of b (both INDEX_ROW), n elements per row, for y1 = 0..m-1. */ +void hpl_ref_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = rows[y1]; /* New location for row y1 */ + /* Copy row a[y1] to b[y2] */ + for (x=0; x<(unsigned int)n; x++) { + b[INDEX_ROW(y2, x, ldb)] = a[INDEX_ROW(y1, x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} Index: accel/lib/spu/Makefile =================================================================== RCS file: accel/lib/spu/Makefile diff -N accel/lib/spu/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/Makefile 20 Aug 2008 03:57:53 -0000 1.9 @@ -0,0 +1,57 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +######################################################################## +# Target +######################################################################## + +PROGRAM_spu := hpl_accel_spu + +LIBRARY_embed64 = libhpl_accel_spu.a + +OBJS = hpl_accel_spu.o \ + accel_dgemm.o \ + accel_dgemm_panel.o \ + accel_dgemm_C.o \ + accel_dtrsm.o \ + accel_dtrsm_panel.o \ + accel_dtrsm_CL_B.o \ + accel_reform_matrix_CL_to_B.o \ + accel_reform_panel_B_to_CL.o \ +
accel_reform_panel_R_to_B.o \ + accel_reform_rows_B_to_R.o \ + accel_reform_rows_R_to_B.o \ + accel_swap_rows_B_to_B.o \ + accel_copy_rows_R_to_R.o \ + accel_buffers.o \ + accel_mm_dp_64Cx64.o \ + accel_dtrsm_dp_128Cx16.o \ + accel_mm_dp.o + + +######################################################################## +# Local Defines +######################################################################## + +# CC_OPT_LEVEL = -g + +#CPPFLAGS = -DACCEL_LITTLE_ENDIAN +CPPFLAGS += -DMATRIX_4GB_CROSSING + +# THE SPU CODE DOES NOT YET SUPPORT 4GB PANEL CROSSING +#CPPFLAGS += -DPANEL_4GB_CROSSING + +CFLAGS_gcc = -march=celledp -mtune=celledp +CFLAGS_xlc = -qarch=edp -qtune=edp + +INCLUDE = -I.. + + + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer Index: accel/lib/spu/accel_buffers.S =================================================================== RCS file: accel/lib/spu/accel_buffers.S diff -N accel/lib/spu/accel_buffers.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.S 18 May 2008 21:11:28 -0000 1.2 @@ -0,0 +1,26 @@ +/* -------------------------------------------------------------- */ +/* (C)Copyright 2007,2008 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. 
*/ +/* -------------------------------------------------------------- */ +/* bufA and bufA_128x128 label the same address, as do bufC and bufB_128x16: no space is reserved between the two labels of each pair. */ + .data + .align 7 + .global bufA +bufA: + .global bufA_128x128 +bufA_128x128: + .skip 2*64*64*8 + .global bufB +bufB: .skip 2*64*64*8 + + .global bufC +bufC: + .global bufB_128x16 +bufB_128x16: + .skip 2*128*16*8 + .global bufB_list +bufB_list: + .skip 64*64*8 + Index: accel/lib/spu/accel_buffers.h =================================================================== RCS file: accel/lib/spu/accel_buffers.h diff -N accel/lib/spu/accel_buffers.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.h 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_BUFFERS_H_ +#define _ACCEL_BUFFERS_H_ + +#include + +/* The local store buffers are carved up uniquely for each acceleration function. + */ + +/* DGEMM buffer set */ +extern vec_double2 bufA[2][64*64/2]; +extern vec_double2 bufB[2][64*64/2]; +extern vec_double2 bufC[2][64*64/2]; + +/* DTRSM buffer set */ +extern vec_double2 bufA_128x128[128*128/2]; +extern vec_double2 bufB_128x16[2][128*16/2]; +extern vec_uint4 bufB_list[8][128/2]; + +#endif /* _ACCEL_BUFFERS_H_ */ Index: accel/lib/spu/accel_copy_rows_R_to_R.c =================================================================== RCS file: accel/lib/spu/accel_copy_rows_R_to_R.c diff -N accel/lib/spu/accel_copy_rows_R_to_R.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_copy_rows_R_to_R.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,127 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include