c

org.apache.beam.sdk.extensions.smb

SortedBucketSource

abstract class SortedBucketSource[KeyType] extends BoundedSource[KV[KeyType, CoGbkResult]]

A PTransform for co-grouping sources written using compatible SortedBucketSink transforms. It differs from org.apache.beam.sdk.transforms.join.CoGroupByKey in that no shuffle step is required, since the source files are written in pre-sorted order. Instead, the files of matching buckets are read sequentially in a merge-sort style, and the resulting value groups are output as org.apache.beam.sdk.transforms.join.CoGbkResult.

Source compatibility

Each of the BucketedInput sources must use the same key function and hashing scheme. Since SortedBucketSink writes an additional file representing BucketMetadata, SortedBucketSource begins by reading each metadata file and using BucketMetadata#isCompatibleWith(BucketMetadata) to check compatibility.

The number of buckets, N, does not have to match across sources. Since that value is required to be a power of 2, all values of N are compatible, albeit requiring a fan-out from the source with the smallest N.

Source
SortedBucketSource.java
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. SortedBucketSource
  2. BoundedSource
  3. Source
  4. HasDisplayData
  5. Serializable
  6. AnyRef
  7. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. Protected

Instance Constructors

  1. new SortedBucketSource(sources: List[BucketedInput[_ <: AnyRef]], targetParallelism: TargetParallelism, bucketOffsetId: Int, effectiveParallelism: Int, metricsKey: String, estimatedSizeBytes: Long)
    Attributes
    protected[smb]
  2. new SortedBucketSource(sources: List[BucketedInput[_ <: AnyRef]], targetParallelism: TargetParallelism, metricsKey: String)
  3. new SortedBucketSource(sources: List[BucketedInput[_ <: AnyRef]], targetParallelism: TargetParallelism)
  4. new SortedBucketSource(sources: List[BucketedInput[_ <: AnyRef]])

Abstract Value Members

  1. abstract def comparator(): Comparator[ComparableKeyBytes]
    Attributes
    protected[smb]
  2. abstract def createSplitSource(splitNum: Int, totalParallelism: Int, estSplitSize: Long): SortedBucketSource[KeyType]

    returns

    A split source of the implementing subtype

    Attributes
    protected[smb]
  3. abstract def keyTypeCoder(): Coder[KeyType]
    Attributes
    protected[smb]
  4. abstract def toKeyFn(): Function[ComparableKeyBytes, KeyType]
    Attributes
    protected[smb]

Concrete Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##: Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  5. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws(classOf[java.lang.CloneNotSupportedException]) @native()
  6. def coGbkResultSchema(): CoGbkResultSchema
    Attributes
    protected[smb]
  7. def createReader(options: PipelineOptions): BoundedReader[KV[KeyType, CoGbkResult]]
    Definition Classes
    SortedBucketSource → BoundedSource
    Annotations
    @Override()
  8. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  9. def equals(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef → Any
  10. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws(classOf[java.lang.Throwable])
  11. final def getClass(): Class[_ <: AnyRef]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  12. def getEstimatedSizeBytes(options: PipelineOptions): Long
    Definition Classes
    SortedBucketSource → BoundedSource
    Annotations
    @Override()
  13. def getOrComputeSourceSpec(): SourceSpec
    Attributes
    protected[smb]
  14. def getOutputCoder(): Coder[KV[KeyType, CoGbkResult]]
    Definition Classes
    SortedBucketSource → Source
    Annotations
    @Override()
  15. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  16. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  17. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  18. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  19. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  20. def populateDisplayData(builder: Builder): Unit
    Definition Classes
    SortedBucketSource → Source → HasDisplayData
    Annotations
    @Override()
  21. def split(desiredBundleSizeBytes: Long, options: PipelineOptions): List[_ <: BoundedSource[KV[KeyType, CoGbkResult]]]
    Definition Classes
    SortedBucketSource → BoundedSource
    Annotations
    @Override()
  22. final def synchronized[T0](arg0: => T0): T0
    Definition Classes
    AnyRef
  23. def toString(): String
    Definition Classes
    AnyRef → Any
  24. def validate(): Unit
    Definition Classes
    Source
  25. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws(classOf[java.lang.InterruptedException])
  26. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws(classOf[java.lang.InterruptedException])
  27. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws(classOf[java.lang.InterruptedException]) @native()

Deprecated Value Members

  1. def getDefaultOutputCoder(): Coder[KV[KeyType, CoGbkResult]]
    Definition Classes
    Source
    Annotations
    @Deprecated
    Deprecated

Inherited from BoundedSource[KV[KeyType, CoGbkResult]]

Inherited from Source[KV[KeyType, CoGbkResult]]

Inherited from HasDisplayData

Inherited from Serializable

Inherited from AnyRef

Inherited from Any

Ungrouped