Post by Roger Cabo on Jan 12, 2024 15:37:06 GMT 1
I've done a short test comparing the built-in Mat Mul against the hand-coded I_MatrixMultiply(), with 1 million calls each:
by calculation : 0.0532157999995633
by Mat : 0.102792799999555
Not sure what happens on Intel.
Further it's possible to speed up the I_MatrixMultiply() by about 4 times with simple threading.
Calculate each of the 4 blocks in a different thread at once!
$Library "gfawinx"
$Library "UpdateRT"
UpdateRuntime ' Patches GfaWin23.Ocx
// 4x4 matrix of Doubles in row-major order:
//   m0..m3   = row 0
//   m4..m7   = row 1
//   m8..m11  = row 2
//   m12..m15 = row 3
// (layout matches the flat indexing used by I_MatrixMultiply)
Type m_Matrix
m0 As Double
m1 As Double
m2 As Double
m3 As Double
m4 As Double
m5 As Double
m6 As Double
m7 As Double
m8 As Double
m9 As Double
m10 As Double
m11 As Double
m12 As Double
m13 As Double
m14 As Double
m15 As Double
EndType
$StepOff // let this be real! :)
Dim m1 As m_Matrix
Dim m2 As m_Matrix
Dim m3 As m_Matrix
Dim i%
OpenW 1
FontSize = 14
// Benchmark 1: hand-unrolled struct multiply, exactly 1,000,000 calls.
// (The original loop "0 To 1000000" ran 1,000,001 iterations — off by one
// versus the stated "1mio calls".)
// NOTE(review): m1/m2 are all-zero here while the Mat test uses 2s;
// multiply timing is normally data-independent, but worth confirming.
Dim t# = Timer
For i% = 1 To 1000000
I_MatrixMultiply(m1, m2, m3)
Next i%
Debug "by calculation : " & Timer - t#
// -----------------
// Benchmark 2: built-in Mat Mul on equivalent 4x4 Double arrays.
Global Double a(0 .. 3, 0 .. 3)
Global Double b(0 .. 3, 0 .. 3)
Global Double c(0 .. 3, 0 .. 3)
Mat Set a() = 2
Mat Set b() = 2
t# = Timer // i% is already declared above; the duplicate "Dim i%" was removed
For i% = 1 To 1000000
Mat Mul c() = a()*b()
Next
Debug "by Mat : " & Timer - t#
Erase a(), b(), c()
Stop
Proc I_MatrixMultiply(ByRef m1 As m_Matrix, ByRef m2 As m_Matrix, ByRef result As m_Matrix) Naked
// 4x4 matrix product: result = m1 * m2, fully unrolled (row-major layout,
// see Type m_Matrix). result must not alias m1 or m2 — each result field
// reads from both inputs after earlier result fields were already written.
// Looks like a lot of code but is very fast, because the addresses of the
// three ByRef parameters stay in address registers across all 16 statements.
// NOTE(review): "Naked" presumably suppresses the procedure prologue/epilogue
// for speed — confirm against the GFA-BASIC 32 docs.
// Multiply each row of m1 with each column of m2:
result.m0 = m1.m0 * m2.m0 + m1.m1 * m2.m4 + m1.m2 * m2.m8 + m1.m3 * m2.m12
result.m1 = m1.m0 * m2.m1 + m1.m1 * m2.m5 + m1.m2 * m2.m9 + m1.m3 * m2.m13
result.m2 = m1.m0 * m2.m2 + m1.m1 * m2.m6 + m1.m2 * m2.m10 + m1.m3 * m2.m14
result.m3 = m1.m0 * m2.m3 + m1.m1 * m2.m7 + m1.m2 * m2.m11 + m1.m3 * m2.m15
result.m4 = m1.m4 * m2.m0 + m1.m5 * m2.m4 + m1.m6 * m2.m8 + m1.m7 * m2.m12
result.m5 = m1.m4 * m2.m1 + m1.m5 * m2.m5 + m1.m6 * m2.m9 + m1.m7 * m2.m13
result.m6 = m1.m4 * m2.m2 + m1.m5 * m2.m6 + m1.m6 * m2.m10 + m1.m7 * m2.m14
result.m7 = m1.m4 * m2.m3 + m1.m5 * m2.m7 + m1.m6 * m2.m11 + m1.m7 * m2.m15
result.m8 = m1.m8 * m2.m0 + m1.m9 * m2.m4 + m1.m10 * m2.m8 + m1.m11 * m2.m12
result.m9 = m1.m8 * m2.m1 + m1.m9 * m2.m5 + m1.m10 * m2.m9 + m1.m11 * m2.m13
result.m10 = m1.m8 * m2.m2 + m1.m9 * m2.m6 + m1.m10 * m2.m10 + m1.m11 * m2.m14
result.m11 = m1.m8 * m2.m3 + m1.m9 * m2.m7 + m1.m10 * m2.m11 + m1.m11 * m2.m15
result.m12 = m1.m12 * m2.m0 + m1.m13 * m2.m4 + m1.m14 * m2.m8 + m1.m15 * m2.m12
result.m13 = m1.m12 * m2.m1 + m1.m13 * m2.m5 + m1.m14 * m2.m9 + m1.m15 * m2.m13
result.m14 = m1.m12 * m2.m2 + m1.m13 * m2.m6 + m1.m14 * m2.m10 + m1.m15 * m2.m14
result.m15 = m1.m12 * m2.m3 + m1.m13 * m2.m7 + m1.m14 * m2.m11 + m1.m15 * m2.m15
// The °-prefixed lines below are disabled (commented-out) code: the
// loop-based equivalent of the unrolled statements above, kept for reference.
°// ... Durchführen der Matrixmultiplikation
°For i% = 0 To 3
°For j% = 0 To 3
°result(i * 4 + j) = m_rot(i * 4) * m_trans(j) + m_rot(i * 4 + 1) * m_trans(j + 4) + m_rot(i * 4 + 2) * m_trans(j + 8) + m_rot(i * 4 + 3) * m_trans(j + 12)
°Next
°Next
EndProc
by calculation : 0.0532157999995633
by Mat : 0.102792799999555
Not sure what happens on Intel.
Further it's possible to speed up the I_MatrixMultiply() by about 4 times with simple threading.
Calculate each of the 4 blocks in a different thread at once!