![]() |
CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers
|
#include <mma_tensor_op_tile_iterator.h>
Classes | |
| struct | Policy |
| Internal structure of iterator - made public to enable introspection. More... | |
Public Types | |
| using | Shape = Shape_ |
| Shape of tile to load (concept: PitchLinearShape) More... | |
| using | Element = Element_ |
| Element type. More... | |
| using | Layout = cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 > |
| Layout of source tile. More... | |
| using | InstructionShape = InstructionShape_ |
| Shape of one matrix product operation (concept: GemmShape) More... | |
| using | TensorRef = TensorRef< Element, Layout > |
| TensorRef type for loading element from a tensor. More... | |
| using | Index = typename TensorRef::Index |
| Index type. More... | |
| using | LongIndex = typename TensorRef::LongIndex |
| Long Index type. More... | |
| using | TensorCoord = typename TensorRef::TensorCoord |
| Coordinate for an element in the tensor. More... | |
| using | Fragment = Array< Element, Shape::kCount/kThreads > |
| Fragment object holding a thread's part of a tile. More... | |
Public Member Functions | |
| CUTLASS_HOST_DEVICE | MmaTensorOpMultiplicandTileIterator () |
| Default ctor constructs null iterator. More... | |
| CUTLASS_DEVICE | MmaTensorOpMultiplicandTileIterator (TensorRef const &ref, int lane_id) |
| Constructor from TensorRef. More... | |
| CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & | add_pointer_offset (LongIndex offset) |
| Adds a pointer offset to internal pointer(s) to advance through memory. More... | |
| CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & | add_tile_offset (TensorCoord const &tile_offset) |
| Advances an iterator along logical dimensions of matrix in units of whole tiles. More... | |
| CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & | operator++ () |
| Advances the iterator along the advance dimension. More... | |
| CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & | operator-- () |
| Advances the iterator along the opposite of the advance dimension. More... | |
| CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & | operator+= (TensorCoord const &tile_offset) |
| advances in units of whole tiles along the logical coordinate space of the tensor More... | |
| CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & | operator-= (TensorCoord const &tile_offset) |
| CUTLASS_HOST_DEVICE void | load (Fragment &frag) const |
| Loads a fragment from memory at the location pointed to by the iterator. More... | |
| CUTLASS_DEVICE void | load_with_byte_offset (Fragment &frag, Index byte_offset) const |
| Loads a fragment from memory with additional logical offset. More... | |
| CUTLASS_DEVICE void | load_with_pointer_offset (Fragment &frag, Index pointer_offset) const |
| Loads a fragment from memory with additional logical offset. More... | |
| CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset) const |
| Loads a fragment from memory with logical offset in units of whole tiles. More... | |
| CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const |
| Loads a fragment from memory with logical offset in units of whole tiles. More... | |
| CUTLASS_DEVICE void | load_with_byte_offset (Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const |
| Loads a fragment from memory with logical offset in units of whole tiles. More... | |
| CUTLASS_DEVICE void | set_kgroup_index (int k_group) |
Static Public Attributes | |
| static Operand const | kOperand = Operand_ |
| Operand tag. More... | |
| static int const | kOpDelta = OpDelta_ |
| Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) More... | |
| static int const | kThreads = 32 |
| Number of participating threads. More... | |
| static int const | kPartitionsK = PartitionsK_ |
| Number of partitions along K dimension. More... | |
This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared memory and therefore must be initialized with a TensorRef to shared memory.
Satisfies: ReadableRandomAccessContiguousTileIteratorConcept
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Element = Element_ |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Fragment = Array<Element, Shape::kCount / kThreads> |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Index = typename TensorRef::Index |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::InstructionShape = InstructionShape_ |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Layout = cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits<Element_>::value, 64> |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::LongIndex = typename TensorRef::LongIndex |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::Shape = Shape_ |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorCoord = typename TensorRef::TensorCoord |
| using cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::TensorOpMultiplicandCongruous< sizeof_bits< Element_ >::value, 64 >, InstructionShape_, OpDelta_, 32, PartitionsK_ >::TensorRef = TensorRef<Element, Layout> |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
| frag | fragment to load from the tensor |
| tile_offset | loads a tile with a logical offset in units of whole tiles |
|
inline |
| frag | fragment to load from the tensor |
| tile_offset | loads a tile with a logical offset in units of whole tiles |
| pointer_offset | loads a tile with a logical offset AND a pointer offset |
|
inline |
| frag | fragment to load from the tensor |
| byte_offset | loads a tile with a linear offset in units of bytes |
|
inline |
| frag | fragment to load from the tensor |
| tile_offset | loads a tile with a logical offset in units of whole tiles |
| byte_offset | loads a tile with a logical offset AND a byte offset |
|
inline |
| frag | fragment to load from the tensor |
| pointer_offset | loads a tile with a linear offset |
|
inline |
|
inline |
|
inline |
advances in units of whole tiles along the logical coordinate space of the tensor
|
inline |
|
inline |
Notify the iterator which k-group it is currently pointing to.
This does not advance the iterator. Rather, it overrides its internal tracking with constant-valued k-group index to enable the compiler to fold constants and achieve more efficient code.
This is used by some nontrivial permuted layouts.
|
static |
|
static |
|
static |
|
static |
1.8.11