Details

Type: Improvement

Status: Open

Priority: Major

Resolution: Unresolved

Affects Version/s: None

Fix Version/s: Backlog

Component/s: Compiler (Misc)

Labels:None

Environment:
compiler, optimize, for loop, while
Description
I would like to suggest the compiler optimize the common case of for loops, that is,
for (var < Range [by step])

for (var < int to int [by step])

for (var < int until int [by step])

to use while loops under the covers instead of. Currently, nested for loops using ranges/iterators are sometimes an order of magnitude slower than while loops. However, while loop constructs for iterating over arrays are very cumbersome, and the functional style (foreach) is also cumbersome and introduces a lot of function call overhead as well.
The following two matrix multipication implementations
def matMulUsingIterators (

a : Array[Array[Double]],

b : Array[Array[Double]],

c : Array[Array[Double]]) : Unit = {


val b_j = new Array[Double](b.length)


for (j < 0 until b(0).length) {

for (k < 0 until b.length) {

b_j(k) = b(k)(j)

}

for (i < 0 until a.length) {

val c_i = c(i)

val a_i = a(i)

var s = 0.0d;

for (k < 0 until b.length) {

s += a_i(k) * b_j(k)

}

c_i(j) = s

}

}

}


def matMulUsingRanges (

a : Array[Array[Double]],

b : Array[Array[Double]],

c : Array[Array[Double]]) : Unit = {


val jRange = 0 until b(0).length;

val kRange = 0 until b.length;

val iRange = 0 until a.length;


val b_j = new Array[Double](b.length)


for (j < jRange) {

for (k < kRange) {

b_j(k) = b(k)(j)

}

for (i < iRange) {

val c_i = c(i);

val a_i = a(i);

var s = 0.0d;

for (k < kRange) {

s += a_i(k) * b_j(k)

}

c_i(j) = s

}

}

}

are much slower than the same algorithm coded with while loops:
def matMulUsingWhileLoop (

a : Array[Array[Double]],

b : Array[Array[Double]],

c : Array[Array[Double]]) : Unit = {


val m = a.length;

val p = b(0).length;

val n = b.length;


val b_j = new Array[Double](b.length);


var i = 0; var j = 0; var k = 0;

while (j < p) {

k = 0

while (k < n) {

b_j(k) = b(k)(j);

k += 1

}

i = 0

while (i < m) {

val c_i = c(i);

val a_i = a(i);

var s = 0.0d;

k = 0;

while (k < n) {

s += a_i(k) * b_j(k);

k += 1

}

c_i(j) = s;

i += 1

}

j += 1;

}

}

but the while loop code is more complex and error prone.
(Sorry, Trac appears to remove some line breaks; I
added some explicit semis but might have missed some;
I'll try attaching actual working source code)
Running this while measuring time in nanoseconds:
Iterators 2,807,815,301ns

Ranges 2,789,958,191ns

While Loop 190,778,574ns

MatMul by Iterators is 14 times as slow as with while loops.
It does not appear that the Hotspot runtime profiling and optimization dramatically helps this performance problem
This performance problem can hurt adoption of Scala for many types of uses/applications.
Attachments
Activity
Field  Original Value  New Value 

Workflow  jira [ 20547 ]  scala [ 24423 ] 
Workflow  scala [ 24423 ]  OpenClosed Workflow v3 [ 29883 ] 
Assignee  Iulian Dragos [ dragos ] 
Description 
I would like to suggest the compiler optimize the common case of for loops, that is, for (var < Range [by step]) for (var < int to int [by step]) for (var < int until int [by step]) to use while loops under the covers instead of. Currently, nested for loops using ranges/iterators are sometimes an order of magnitude slower than while loops. However, while loop constructs for iterating over arrays are very cumbersome, and the functional style (foreach) is also cumbersome and introduces a lot of function call overhead as well. The following two matrix multipication implementations def matMulUsingIterators ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val b_j = new Array[Double](b.length) for (j < 0 until b(0).length) { for (k < 0 until b.length) { b_j(k) = b(k)(j) } for (i < 0 until a.length) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k < 0 until b.length) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingRanges ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val jRange = 0 until b(0).length; val kRange = 0 until b.length; val iRange = 0 until a.length; val b_j = new Array[Double](b.length) for (j < jRange) { for (k < kRange) { b_j(k) = b(k)(j) } for (i < iRange) { val c_i = c(i); val a_i = a(i); var s = 0.0d; for (k < kRange) { s += a_i(k) * b_j(k) } c_i(j) = s } } } are much slower than the same algorithm coded with while loops: def matMulUsingWhileLoop ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val m = a.length; val p = b(0).length; val n = b.length; val b_j = new Array[Double](b.length); var i = 0; var j = 0; var k = 0; while (j < p) { k = 0 while (k < n) { b_j(k) = b(k)(j); k += 1 } i = 0 while (i < m) { val c_i = c(i); val a_i = a(i); var s = 0.0d; k = 0; while (k < n) { s += a_i(k) * b_j(k); k += 1 } c_i(j) = s; i += 1 } j += 1; } } but the while loop code is more complex and error prone. (Sorry, Trac appears to remove some line breaks; I added some explicit semis but might have missed some; I'll try attaching actual working source code) Running this while measuring time in nanoseconds: Iterators 2,807,815,301ns Ranges 2,789,958,191ns While Loop 190,778,574ns MatMul by Iterators is 14 times as slow as with while loops. It does not appear that the Hotspot runtime profiling and optimization dramatically helps this performance problem This performance problem can hurt adoption of Scala for many types of uses/applications. 
I would like to suggest the compiler optimize the common case of for loops, that is,
{code} for (var < Range [by step]) for (var < int to int [by step]) for (var < int until int [by step]) {code} to use while loops under the covers instead of. Currently, nested for loops using ranges/iterators are sometimes an order of magnitude slower than while loops. However, while loop constructs for iterating over arrays are very cumbersome, and the functional style (foreach) is also cumbersome and introduces a lot of function call overhead as well. The following two matrix multipication implementations {code} def matMulUsingIterators ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val b_j = new Array[Double](b.length) for (j < 0 until b(0).length) { for (k < 0 until b.length) { b_j(k) = b(k)(j) } for (i < 0 until a.length) { val c_i = c(i) val a_i = a(i) var s = 0.0d; for (k < 0 until b.length) { s += a_i(k) * b_j(k) } c_i(j) = s } } } def matMulUsingRanges ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val jRange = 0 until b(0).length; val kRange = 0 until b.length; val iRange = 0 until a.length; val b_j = new Array[Double](b.length) for (j < jRange) { for (k < kRange) { b_j(k) = b(k)(j) } for (i < iRange) { val c_i = c(i); val a_i = a(i); var s = 0.0d; for (k < kRange) { s += a_i(k) * b_j(k) } c_i(j) = s } } } {code} are much slower than the same algorithm coded with while loops: {code} def matMulUsingWhileLoop ( a : Array[Array[Double]], b : Array[Array[Double]], c : Array[Array[Double]]) : Unit = { val m = a.length; val p = b(0).length; val n = b.length; val b_j = new Array[Double](b.length); var i = 0; var j = 0; var k = 0; while (j < p) { k = 0 while (k < n) { b_j(k) = b(k)(j); k += 1 } i = 0 while (i < m) { val c_i = c(i); val a_i = a(i); var s = 0.0d; k = 0; while (k < n) { s += a_i(k) * b_j(k); k += 1 } c_i(j) = s; i += 1 } j += 1; } } {code} but the while loop code is more complex and error prone. (Sorry, Trac appears to remove some line breaks; I added some explicit semis but might have missed some; I'll try attaching actual working source code) Running this while measuring time in nanoseconds: {code} Iterators 2,807,815,301ns Ranges 2,789,958,191ns While Loop 190,778,574ns {code} MatMul by Iterators is 14 times as slow as with while loops. It does not appear that the Hotspot runtime profiling and optimization dramatically helps this performance problem This performance problem can hurt adoption of Scala for many types of uses/applications. 
Workflow  OpenClosed Workflow v3 [ 29883 ]  Copy of OpenClosed Workflow v3 [ 40216 ] 
Fix Version/s  Backlog [ 11701 ] 
// Scala code to compare performance of nested int loops
object MatMul {
def matMulUsingIterators (
a : Array[Array[Double]],
b : Array[Array[Double]],
c : Array[Array[Double]]) : Unit = {
val b_j = new Array[Double](b.length)
for (j < 0 until b(0).length) {
for (k < 0 until b.length) {
b_j(k) = b(k)(j)
}
for (i < 0 until a.length) {
val c_i = c(i)
val a_i = a(i)
var s = 0.0d;
for (k < 0 until b.length) {
s += a_i(k) * b_j(k)
}
c_i(j) = s
}
}
}
def matMulUsingRanges (
a : Array[Array[Double]],
b : Array[Array[Double]],
c : Array[Array[Double]]) : Unit = {
val jRange = 0 until b(0).length // p
val kRange = 0 until b.length // n
val iRange = 0 until a.length // m
val b_j = new Array[Double](b.length)
for (j < jRange) {
for (k < kRange) {
b_j(k) = b(k)(j)
}
for (i < iRange) {
val c_i = c(i)
val a_i = a(i)
var s = 0.0d;
for (k < kRange) {
s += a_i(k) * b_j(k)
}
c_i(j) = s
}
}
}
def matMulUsingLimits (
a : Array[Array[Double]],
b : Array[Array[Double]],
c : Array[Array[Double]]) : Unit = {
val b_j = new Array[Double](b.length)
val m = a.length
val p = b(0).length
val n = b.length
for (j < 0 until p) {
for (k < 0 until n) {
b_j(k) = b(k)(j)
}
for (i < 0 until m) {
val c_i = c(i)
val a_i = a(i)
var s = 0.0d;
for (k < 0 until n) {
s += a_i(k) * b_j(k)
}
c_i(j) = s
}
}
}
def matMulUsingWhileLoop (
a : Array[Array[Double]],
b : Array[Array[Double]],
c : Array[Array[Double]]) : Unit = {
val m = a.length
val p = b(0).length
val n = b.length
val b_j = new Array[Double](b.length)
var i = 0; var j = 0; var k = 0
while (j < p) {
k = 0
while (k < n) {
b_j(k) = b(k)(j)
k += 1
}
i = 0
while (i < m) {
val c_i = c(i)
val a_i = a(i)
var s = 0.0d;
k = 0
while (k < n) {
s += a_i(k) * b_j(k)
k += 1
}
c_i(j) = s
i += 1
}
j += 1
}
}
def time[R](block: => R) : (Long, R) = {
val start = System.nanoTime()
val result : R = block
val time = System.nanoTime()  start
(time, result)
}
val format = new java.text.DecimalFormat("0,000'ns'");
def report[R](label: String, result: (Long, R)) = {
println(label + " " + format.format(result._1))
}
private val FACTOR = 5
private val M = 80
private val N = 70
private val P = 60
def main(args : Array[String]) = {
for (trial < 3 until 0 by 1) {
val factor = (if (System.getProperty("factor") != null)
Integer.parseInt(System.getProperty("factor"))
else
FACTOR)
val multiplier = if (trial == 1) factor else 1;
val m = M * multiplier
val n = N * multiplier
val p = P * multiplier
val a = new Array[Array[Double]](m,n)
val b = new Array[Array[Double]](n,p)
val c = new Array[Array[Double]](m,p)
println("\nMultiply c[" + m + "," + p + "] = a[" + m + "," + n + "] times b[" + n + "," + p + "]\n");
val whileTime = time(matMulUsingWhileLoop(a,b,c))
val iterTime = time(matMulUsingIterators(a,b,c))
report("Iterators ", iterTime)
report("Limits ", time(matMulUsingLimits(a,b,c)))
report("Ranges ", time(matMulUsingRanges(a,b,c)))
report("While Loop ", whileTime)
println("MatMul by Iterators is " + iterTime._1 / whileTime._1 + " times as slow as with while loops.")
}
}
}