From 683081abed847accc99a4629456a5f654d673fec Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Thu, 7 Jul 2016 14:05:11 -0500
Subject: [PATCH 1/8] Fix setindex! with SubDArray source

This method is an optimization wherein we try to chunk accesses based
upon the parent DArray's parts. The hard thing is then going backwards
and trying to figure out which parts of the assignment indices need to
be used in order to access those chunks. This is a four-stage process
that uses five different types of indices:

1. Find the indices of each portion of the DArray.
2. Find the valid subset of indices of the SubArray that index into
   that portion.
3. Find the portion of the assignment indices that needs to be used
   for that subset of indices in step 2. This is the hard part. It
   requires creating another set of indices that represents the mask
   of valid indices from step 2. With those masks in hand, it's
   possible to reindex `I` to the indices we need. The trouble is that
   `setindex!` supports singleton dimensions in the source array in
   ways that `getindex` does not, so we need to selectively drop
   singleton dimensions as we restrict the indices. A final
   complication is that the last index can be a linear index over many
   indices in either the source or destination.
4. Finally, if the entire DArray chunk isn't getting used, we need to
   shift the indices from step 2 to refer to the local part of the
   DArray.
---
 src/DistributedArrays.jl | 93 +++++++++++++++++++++++++++++++++++++---
 test/darray.jl           | 16 +++++++
 2 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index 0360bd2..e30f02a 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -635,21 +635,102 @@ function Base.setindex!(a::Array, d::DArray,
     return a
 end
 
+# Similar to Base.indexin, but just create a logical mask
+indexin_mask(a, b::Number) = a .== b
+indexin_mask(a, r::Range{Int}) = [i in r for i in a]
+indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b))
+indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
+indexin_mask(a, b) = [i in b for i in a]
+
+import Base: tail
+# Given a tuple of indices and a tuple of masks, restrict the indices to the
+# valid regions. This is, effectively, reversing Base.setindex_shape_check.
+# We can't just use indexing into MergedIndices here because getindex is much
+# pickier about singleton dimensions than setindex! is.
+restrict_indices(::Tuple{}, ::Tuple{}) = ()
+function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
+    if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
+        (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
+    elseif length(a[1]) == 1
+        (a[1], restrict_indices(tail(a), b))
+    elseif length(b[1]) == 1 && b[1][1]
+        restrict_indices(a, tail(b))
+    else
+        throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
+    end
+end
+# The final indices are funky - they're allowed to accumulate together.
+# Too many masks is an easy fix -- just use the outer product to merge them:
+function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
+    restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...))
+end
+# But too many indices is much harder; this will require merging the indices
+# in `a` before applying the final mask in `b`.
+function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
+    if length(a[1]) == 1
+        (a[1], restrict_indices(tail(a), b))
+    else
+        # When one mask spans multiple indices, we need to merge the indices
+        # together. At this point, we can just use indexing to merge them since
+        # there's no longer special handling of singleton dimensions
+        (view(MergedIndices(a, map(length, a)), b[1]),)
+    end
+end
+
+immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N}
+    indices::T
+    sz::NTuple{N,Int}
+end
+Base.size(M::MergedIndices) = M.sz
+Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I))
+# Boundschecking for using MergedIndices as an array index. This is overly
+# strict -- even for SubArrays of ReshapedIndices, we require that the entire
+# parent array's indices are valid. In this usage, it is just fine... and is a
+# huge optimization over exact bounds checking.
+typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M}
+typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M}
+typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices}
+import Base: _chkbnds
+# Ambiguity with linear indexing:
+@inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
+@inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
+# Generic bounds checking
+@inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+@inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+
+# The tricky thing here is that we want to optimize the accesses into the
+# distributed array, but in doing so, we lose track of which indices in I we
+# should be using.
+#
+# I’ve come to the conclusion that the function is utterly insane.
+# There are *6* flavors of indices with four different reference points:
+# 1. Find the indices of each portion of the DArray.
+# 2. Find the valid subset of indices for the SubArray into that portion.
+# 3. Find the portion of the `I` indices that should be used when you access the
+#    `K` indices in the subarray. This guy is nasty. It’s totally backwards
+#    from all other arrays, wherein we simply iterate over the source array’s
+#    elements. You need to *both* know which elements in `J` were skipped
+#    (`indexin_mask`) and which dimensions should match up (`restrict_indices`)
+# 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of
+#    the local portion of the source array
 function Base.setindex!(a::Array, s::SubDArray,
         I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)
+    Base.setindex_shape_check(s, Base.index_lengths(a, I...)...)
     n = length(I)
     d = s.parent
-    J = s.indexes
+    J = Base.decolon(d, s.indexes...)
     if length(J) < n
+        # TODO: this failsafe only works sometimes; the proper solution is to
+        # implement `restrict_indices` to merge the indices above.
         a[I...] = convert(Array,s)
         return a
     end
-    offs = [isa(J[i],Int) ? J[i]-1 : first(J[i])-1 for i=1:n]
     @sync for i = 1:length(d.pids)
-        K_c = Any[d.indexes[i]...]
-        K = [ intersect(J[j],K_c[j]) for j=1:n ]
+        K_c = d.indexes[i]
+        K = map(intersect, J, K_c)
         if !any(isempty, K)
-            idxs = [ I[j][K[j]-offs[j]] for j=1:n ]
+            K_mask = map(indexin_mask, J, K_c)
+            idxs = restrict_indices(Base.decolon(a, I...), K_mask)
             if isequal(K, K_c)
                 # whole chunk
                 @async a[idxs...] = chunk(d, i)
@@ -657,7 +738,7 @@ function Base.setindex!(a::Array, s::SubDArray,
                 # partial chunk
                 @async a[idxs...] =
                     remotecall_fetch(d.pids[i]) do
-                        view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:n]...)
+                        view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...)
                     end
             end
         end
diff --git a/test/darray.jl b/test/darray.jl
index 3283cf9..ea406a9 100644
--- a/test/darray.jl
+++ b/test/darray.jl
@@ -79,6 +79,22 @@ facts("test DArray / Array conversion") do
         @fact fetch(@spawnat MYID localpart(D)[1,1]) --> D[1,1]
         @fact fetch(@spawnat OTHERIDS localpart(D)[1,1]) --> D[1,101]
         close(D2)
+
+        S2 = convert(Vector{Float64}, D[4, 23:176])
+        @fact A[4, 23:176] --> S2
+
+        S3 = convert(Vector{Float64}, D[23:176, 197])
+        @fact A[23:176, 197] --> S3
+
+        S4 = zeros(4)
+        setindex!(S4, D[3:4, 99:100], :)
+        @fact S4 --> vec(D[3:4, 99:100])
+        @fact S4 --> vec(A[3:4, 99:100])
+
+        S5 = zeros(2,2)
+        setindex!(S5, D[1,1:4], :, 1:2)
+        @fact vec(S5) --> D[1, 1:4]
+        @fact vec(S5) --> A[1, 1:4]
     end
     close(D)
 end

From e382ca6599a05833cc5aff0012100d0055376c4a Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 8 Jul 2016 10:30:55 -0500
Subject: [PATCH 2/8] Remove unnecessary failsafe

This is no longer needed -- the comment is from when I only had
restrict_indices partially implemented
---
 src/DistributedArrays.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index e30f02a..fda268d 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -719,12 +719,6 @@ function Base.setindex!(a::Array, s::SubDArray,
     n = length(I)
     d = s.parent
     J = Base.decolon(d, s.indexes...)
-    if length(J) < n
-        # TODO: this failsafe only works sometimes; the proper solution is to
-        # implement `restrict_indices` to merge the indices above.
-        a[I...] = convert(Array,s)
-        return a
-    end
     @sync for i = 1:length(d.pids)
         K_c = d.indexes[i]
         K = map(intersect, J, K_c)

From 7f0b41801a2a65df51d5c0eb4b69c16d04882ec2 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Sat, 9 Jul 2016 12:32:51 -0500
Subject: [PATCH 3/8] Also implement checkbounds_indices

---
 src/DistributedArrays.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index fda268d..aa6aa09 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -697,6 +697,10 @@ import Base: _chkbnds
 # Generic bounds checking
 @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
 @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+import Base: checkbounds_indices
+@inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...))
+@inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
+@inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
 
 # The tricky thing here is that we want to optimize the accesses into the
 # distributed array, but in doing so, we lose track of which indices in I we
 # should be using.

From 36204e6cc3b130cb0a799b174cd99869b7d0537b Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Sat, 9 Jul 2016 12:33:44 -0500
Subject: [PATCH 4/8] Only enable this method on 0.5

---
 src/DistributedArrays.jl | 210 ++++++++++++++++++++-------------------
 1 file changed, 107 insertions(+), 103 deletions(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index aa6aa09..84bbf91 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -635,113 +635,117 @@ function Base.setindex!(a::Array, d::DArray,
     return a
 end
 
-# Similar to Base.indexin, but just create a logical mask
-indexin_mask(a, b::Number) = a .== b
-indexin_mask(a, r::Range{Int}) = [i in r for i in a]
-indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b))
-indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
-indexin_mask(a, b) = [i in b for i in a]
-
-import Base: tail
-# Given a tuple of indices and a tuple of masks, restrict the indices to the
-# valid regions. This is, effectively, reversing Base.setindex_shape_check.
-# We can't just use indexing into MergedIndices here because getindex is much
-# pickier about singleton dimensions than setindex! is.
-restrict_indices(::Tuple{}, ::Tuple{}) = ()
-function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
-    if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
-        (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
-    elseif length(a[1]) == 1
-        (a[1], restrict_indices(tail(a), b))
-    elseif length(b[1]) == 1 && b[1][1]
-        restrict_indices(a, tail(b))
-    else
-        throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
+# We also want to optimize setindex! with a SubDArray source, but this is hard
+# and only works on 0.5.
+if VERSION > v"0.5.0-dev+5230"
+    # Similar to Base.indexin, but just create a logical mask
+    indexin_mask(a, b::Number) = a .== b
+    indexin_mask(a, r::Range{Int}) = [i in r for i in a]
+    indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b))
+    indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b))
+    indexin_mask(a, b) = [i in b for i in a]
+
+    import Base: tail
+    # Given a tuple of indices and a tuple of masks, restrict the indices to the
+    # valid regions. This is, effectively, reversing Base.setindex_shape_check.
+    # We can't just use indexing into MergedIndices here because getindex is much
+    # pickier about singleton dimensions than setindex! is.
+    restrict_indices(::Tuple{}, ::Tuple{}) = ()
+    function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}})
+        if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1)
+            (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...)
+        elseif length(a[1]) == 1
+            (a[1], restrict_indices(tail(a), b))
+        elseif length(b[1]) == 1 && b[1][1]
+            restrict_indices(a, tail(b))
+        else
+            throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue"))
+        end
     end
-end
-# The final indices are funky - they're allowed to accumulate together.
-# Too many masks is an easy fix -- just use the outer product to merge them:
-function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
-    restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...))
-end
-# But too many indices is much harder; this will require merging the indices
-# in `a` before applying the final mask in `b`.
-function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
-    if length(a[1]) == 1
-        (a[1], restrict_indices(tail(a), b))
-    else
-        # When one mask spans multiple indices, we need to merge the indices
-        # together. At this point, we can just use indexing to merge them since
-        # there's no longer special handling of singleton dimensions
-        (view(MergedIndices(a, map(length, a)), b[1]),)
-    end
-end
-
-immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N}
-    indices::T
-    sz::NTuple{N,Int}
-end
-Base.size(M::MergedIndices) = M.sz
-Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I))
-# Boundschecking for using MergedIndices as an array index. This is overly
-# strict -- even for SubArrays of ReshapedIndices, we require that the entire
-# parent array's indices are valid. In this usage, it is just fine... and is a
-# huge optimization over exact bounds checking.
-typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M}
-typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M}
-typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices}
-import Base: _chkbnds
-# Ambiguity with linear indexing:
-@inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
-@inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
-# Generic bounds checking
-@inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
-@inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
-import Base: checkbounds_indices
-@inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...))
-@inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
-@inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
-
-# The tricky thing here is that we want to optimize the accesses into the
-# distributed array, but in doing so, we lose track of which indices in I we
-# should be using.
-#
-# I’ve come to the conclusion that the function is utterly insane.
-# There are *6* flavors of indices with four different reference points:
-# 1. Find the indices of each portion of the DArray.
-# 2. Find the valid subset of indices for the SubArray into that portion.
-# 3. Find the portion of the `I` indices that should be used when you access the
-#    `K` indices in the subarray. This guy is nasty. It’s totally backwards
-#    from all other arrays, wherein we simply iterate over the source array’s
-#    elements. You need to *both* know which elements in `J` were skipped
-#    (`indexin_mask`) and which dimensions should match up (`restrict_indices`)
-# 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of
-#    the local portion of the source array
-function Base.setindex!(a::Array, s::SubDArray,
-        I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)
-    Base.setindex_shape_check(s, Base.index_lengths(a, I...)...)
-    n = length(I)
-    d = s.parent
-    J = Base.decolon(d, s.indexes...)
-    @sync for i = 1:length(d.pids)
-        K_c = d.indexes[i]
-        K = map(intersect, J, K_c)
-        if !any(isempty, K)
-            K_mask = map(indexin_mask, J, K_c)
-            idxs = restrict_indices(Base.decolon(a, I...), K_mask)
-            if isequal(K, K_c)
-                # whole chunk
-                @async a[idxs...] = chunk(d, i)
-            else
-                # partial chunk
-                @async a[idxs...] =
-                    remotecall_fetch(d.pids[i]) do
-                        view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...)
-                    end
+    # The final indices are funky - they're allowed to accumulate together.
+    # Too many masks is an easy fix -- just use the outer product to merge them:
+    function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}})
+        restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...))
+    end
+    # But too many indices is much harder; this will require merging the indices
+    # in `a` before applying the final mask in `b`.
+    function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any})
+        if length(a[1]) == 1
+            (a[1], restrict_indices(tail(a), b))
+        else
+            # When one mask spans multiple indices, we need to merge the indices
+            # together. At this point, we can just use indexing to merge them since
+            # there's no longer special handling of singleton dimensions
+            (view(MergedIndices(a, map(length, a)), b[1]),)
+        end
+    end
+
+    immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N}
+        indices::T
+        sz::NTuple{N,Int}
+    end
+    Base.size(M::MergedIndices) = M.sz
+    Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I))
+    # Boundschecking for using MergedIndices as an array index. This is overly
+    # strict -- even for SubArrays of ReshapedIndices, we require that the entire
+    # parent array's indices are valid. In this usage, it is just fine... and is a
+    # huge optimization over exact bounds checking.
+    typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M}
+    typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M}
+    typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices}
+    import Base: _chkbnds
+    # Ambiguity with linear indexing:
+    @inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
+    @inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
+    # Generic bounds checking
+    @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+    @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+    import Base: checkbounds_indices
+    @inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...))
+    @inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
+    @inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
+
+    # The tricky thing here is that we want to optimize the accesses into the
+    # distributed array, but in doing so, we lose track of which indices in I we
+    # should be using.
+    #
+    # I’ve come to the conclusion that the function is utterly insane.
+    # There are *6* flavors of indices with four different reference points:
+    # 1. Find the indices of each portion of the DArray.
+    # 2. Find the valid subset of indices for the SubArray into that portion.
+    # 3. Find the portion of the `I` indices that should be used when you access the
+    #    `K` indices in the subarray. This guy is nasty. It’s totally backwards
+    #    from all other arrays, wherein we simply iterate over the source array’s
+    #    elements. You need to *both* know which elements in `J` were skipped
+    #    (`indexin_mask`) and which dimensions should match up (`restrict_indices`)
+    # 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of
+    #    the local portion of the source array
+    function Base.setindex!(a::Array, s::SubDArray,
+            I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...)
+        Base.setindex_shape_check(s, Base.index_lengths(a, I...)...)
+        n = length(I)
+        d = s.parent
+        J = Base.decolon(d, s.indexes...)
+        @sync for i = 1:length(d.pids)
+            K_c = d.indexes[i]
+            K = map(intersect, J, K_c)
+            if !any(isempty, K)
+                K_mask = map(indexin_mask, J, K_c)
+                idxs = restrict_indices(Base.decolon(a, I...), K_mask)
+                if isequal(K, K_c)
+                    # whole chunk
+                    @async a[idxs...] = chunk(d, i)
+                else
+                    # partial chunk
+                    @async a[idxs...] =
+                        remotecall_fetch(d.pids[i]) do
+                            view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...)
+                        end
+                end
             end
         end
+        return a
     end
-    return a
 end
 
 Base.fill!(A::DArray, x) = begin

From 1c93330b51a9b6260470a757d342c89883007c47 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Sat, 23 Jul 2016 17:04:04 -0500
Subject: [PATCH 5/8] Fixup checkbounds_indices to the new APIs

Also clarify the comment, since I was confused upon coming back to this
method a few weeks later.
---
 src/DistributedArrays.jl | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index 84bbf91..bb87f07 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -686,24 +686,24 @@ if VERSION > v"0.5.0-dev+5230"
     end
     Base.size(M::MergedIndices) = M.sz
     Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I))
-    # Boundschecking for using MergedIndices as an array index. This is overly
-    # strict -- even for SubArrays of ReshapedIndices, we require that the entire
-    # parent array's indices are valid. In this usage, it is just fine... and is a
-    # huge optimization over exact bounds checking.
+    # Additionally, we optimize bounds checking when using MergedIndices as an
+    # array index since checking, e.g., A[1:500, 1:500] is *way* faster than
+    # checking an array of 500^2 elements of CartesianIndex{2}. This optimization
+    # also applies to reshapes of MergedIndices since the outer shape of the
+    # container doesn't affect the index elements themselves. We can go even
+    # farther and say that even restricted views of MergedIndices must be valid
+    # over the entire array. This is overly strict in general, but in this
+    # use-case all the merged indices must be valid at some point, so it's ok.
     typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M}
     typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M}
-    typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices}
-    import Base: _chkbnds
-    # Ambiguity with linear indexing:
-    @inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
-    @inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...)
-    # Generic bounds checking
-    @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
-    @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...)
+    typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices}
     import Base: checkbounds_indices
-    @inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...))
-    @inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
-    @inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...))
+    @inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
+        checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
+    @inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
+        checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
+    @inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) =
+        checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...))
 
     # The tricky thing here is that we want to optimize the accesses into the
     # distributed array, but in doing so, we lose track of which indices in I we

From 6d25dfe162f99d3fa061b02250539dfae9a7673c Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Sat, 23 Jul 2016 18:29:21 -0500
Subject: [PATCH 6/8] Add a lazy ProductIndices type instead of creating the
 outer product

Both of these lazy arrays are effectively generalizations of Tim's
MappedArrays.jl package. Doing this generally adds a bit more
difficulty in terms of element types, but that is true of the
MappedArray type, too. It might be worth breaking this out into a
package at some point.
---
 src/DistributedArrays.jl | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index bb87f07..1abb4cf 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -638,7 +638,11 @@ end
 
 # We also want to optimize setindex! with a SubDArray source, but this is hard
 # and only works on 0.5.
if VERSION > v"0.5.0-dev+5230" - # Similar to Base.indexin, but just create a logical mask + # Similar to Base.indexin, but just create a logical mask. Note that this + # must return a logical mask in order to support merging multiple masks + # together into one linear index since we need to know how many elements to + # skip at the end. In many cases range intersection would be much faster + # than generating a logical mask, but that loses the endpoint information. indexin_mask(a, b::Number) = a .== b indexin_mask(a, r::Range{Int}) = [i in r for i in a] indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) @@ -663,11 +667,12 @@ if VERSION > v"0.5.0-dev+5230" end end # The final indices are funky - they're allowed to accumulate together. - # Too many masks is an easy fix -- just use the outer product to merge them: + # An easy (albeit very inefficient) fix for too many masks is to use the + # outer product to merge them. But we can do that lazily with a custom type: function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) - restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...)) + (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) end - # But too many indices is much harder; this will require merging the indices + # But too many indices is much harder; this requires merging the indices # in `a` before applying the final mask in `b`. function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) if length(a[1]) == 1 @@ -680,8 +685,15 @@ if VERSION > v"0.5.0-dev+5230" end end - immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N} - indices::T + immutable ProductIndices{I,N} <: AbstractArray{Bool, N} + indices::I + sz::NTuple{N,Int} + end + Base.size(P::ProductIndices) = P.sz + Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = Bool((&)(map(getindex, P.indices, I)...)) + + immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} + indices::I sz::NTuple{N,Int} end Base.size(M::MergedIndices) = M.sz From 474b93ca1c9d98ed288fbfcb05e65bd17f0d2835 Mon Sep 17 00:00:00 2001 From: Matt Bauman <mbauman@gmail.com> Date: Sat, 23 Jul 2016 18:44:14 -0500 Subject: [PATCH 7/8] Propagate inbounds for the lazy array types As a further optimization, (at)inbounds could be added throughout the algorithm once it has received more widespread testing. --- src/DistributedArrays.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 1abb4cf..fcb34eb 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -690,14 +690,18 @@ if VERSION > v"0.5.0-dev+5230" sz::NTuple{N,Int} end Base.size(P::ProductIndices) = P.sz - Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = Bool((&)(map(getindex, P.indices, I)...)) + # This gets passed to map to avoid breaking propagation of inbounds + Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] 
+    Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) =
+        Bool((&)(map(propagate_getindex, P.indices, I)...))
 
     immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N}
         indices::I
         sz::NTuple{N,Int}
     end
     Base.size(M::MergedIndices) = M.sz
-    Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I))
+    Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) =
+        CartesianIndex(map(propagate_getindex, M.indices, I))
     # Additionally, we optimize bounds checking when using MergedIndices as an
     # array index since checking, e.g., A[1:500, 1:500] is *way* faster than
     # checking an array of 500^2 elements of CartesianIndex{2}. This optimization

From 4fb54047e7823d56357248573f64725a6a77a47f Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Sat, 23 Jul 2016 18:44:54 -0500
Subject: [PATCH 8/8] Update for SubArray change

cf. https://github.com/JuliaLang/julia/pull/17228#issuecomment-233033247
---
 src/DistributedArrays.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl
index fcb34eb..c409f37 100644
--- a/src/DistributedArrays.jl
+++ b/src/DistributedArrays.jl
@@ -550,7 +550,7 @@ end
 
 function Base.convert{T,N}(::Type{DArray}, SD::SubArray{T,N})
     D = SD.parent
-    DArray(SD.dims, procs(D)) do I
+    DArray(size(SD), procs(D)) do I
         TR = typeof(SD.indexes[1])
         lindices = Array(TR, 0)
         for (i,r) in zip(I, SD.indexes)
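
The chunk-masking scheme these patches build up can be exercised without a
cluster. Below is a minimal sketch in plain 0.5-era Julia (matching the syntax
of the series), with ordinary Arrays standing in for the DArray, its chunk, and
the destination; `indexin_mask_demo` and all concrete index values here are
illustrative stand-ins rather than code from the patches:

    # One destination dimension: restrict the assignment indices I to the
    # elements of J (the SubArray's indices) that one chunk (K_c) covers.
    indexin_mask_demo(a, b) = [i in b for i in a]  # logical mask, as in indexin_mask

    J      = (2:9,)                          # SubArray's indices into its parent
    K_c    = (4:11,)                         # indices owned by one chunk
    K      = map(intersect, J, K_c)          # (4:9,) -- the overlap
    K_mask = map(indexin_mask_demo, J, K_c)  # which elements of J the chunk covers

    I    = (11:18,)                          # assignment indices into the destination
    idxs = (I[1][K_mask[1]],)                # restricted destination indices: 13..18

    a   = zeros(20)
    src = collect(1.0:8.0)                   # stands in for the SubDArray's values
    a[idxs...] = src[K_mask[1]]              # copy only this chunk's elements

The logical mask is what keeps the destination indices and the source elements
aligned even when a chunk covers only part of a dimension.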
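
The lazy outer product that PATCH 6 introduces as ProductIndices can likewise
be sketched in isolation. `LazyProduct` below is a hypothetical two-dimensional
reduction of the same idea, again in 0.5-era syntax; it is not the type from
the patch:

    immutable LazyProduct{A,B} <: AbstractArray{Bool,2}
        a::A
        b::B
    end
    Base.size(P::LazyProduct) = (length(P.a), length(P.b))
    # Each element is computed on demand; nothing like m1 * m2' is allocated.
    Base.getindex(P::LazyProduct, i::Int, j::Int) = P.a[i] & P.b[j]

    m1 = [true, false, true]
    m2 = [false, true]
    P  = LazyProduct(m1, m2)
    dense = Bool[m1[i] & m2[j] for i = 1:3, j = 1:2]
    @assert P == dense

Because `vec` and indexing only ever ask for individual elements, multiple
masks can be merged into one linear index without materializing the product.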
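
Finally, the role MergedIndices plays in `restrict_indices` -- letting a single
trailing mask select across several merged dimensions at once -- can be seen
with a hypothetical two-index version of the type:

    immutable Merged2{A,B} <: AbstractArray{CartesianIndex{2},2}
        a::A
        b::B
    end
    Base.size(M::Merged2) = (length(M.a), length(M.b))
    Base.getindex(M::Merged2, i::Int, j::Int) = CartesianIndex(M.a[i], M.b[j])

    A = reshape(collect(1:16), 4, 4)
    M = Merged2(2:3, 1:3)                          # indices into dims 1 and 2 of A
    mask = [true, false, true, true, false, true]  # one linear mask over the 2x3 grid
    picked = A[M[find(mask)]]                      # masked selection across both dims

Each element of `Merged2` is a full CartesianIndex, so linear indexing over it
(here via `find`) collapses the two index dimensions exactly the way the final
linear index in `setindex!` needs.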