summaryrefslogtreecommitdiff
path: root/src/libffmpeg/libavcodec/armv4l
diff options
context:
space:
mode:
Diffstat (limited to 'src/libffmpeg/libavcodec/armv4l')
-rw-r--r--src/libffmpeg/libavcodec/armv4l/dsputil_arm.c12
-rw-r--r--src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S650
-rw-r--r--src/libffmpeg/libavcodec/armv4l/mpegvideo_arm.c2
-rw-r--r--src/libffmpeg/libavcodec/armv4l/simple_idct_arm.S332
4 files changed, 498 insertions, 498 deletions
diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
index 0195c3ca6..cebd176b3 100644
--- a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
+++ b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
@@ -14,7 +14,7 @@
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "../dsputil.h"
@@ -205,13 +205,13 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
#endif
c->idct_put= j_rev_dct_ARM_put;
c->idct_add= j_rev_dct_ARM_add;
- c->idct = j_rev_dct_ARM;
+ c->idct = j_rev_dct_ARM;
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
} else if (idct_algo==FF_IDCT_SIMPLEARM){
- c->idct_put= simple_idct_ARM_put;
- c->idct_add= simple_idct_ARM_add;
- c->idct = simple_idct_ARM;
- c->idct_permutation_type= FF_NO_IDCT_PERM;
+ c->idct_put= simple_idct_ARM_put;
+ c->idct_add= simple_idct_ARM_add;
+ c->idct = simple_idct_ARM;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
#ifdef HAVE_IPP
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
#else
diff --git a/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S b/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S
index 76eda57ea..294ea4750 100644
--- a/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S
+++ b/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S
@@ -1,6 +1,6 @@
-/*
+/*
C-like prototype :
- void j_rev_dct_ARM(DCTBLOCK data)
+ void j_rev_dct_ARM(DCTBLOCK data)
With DCTBLOCK being a pointer to an array of 64 'signed shorts'
@@ -22,7 +22,7 @@
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
+
*/
#define FIX_0_298631336 2446
#define FIX_0_541196100 4433
@@ -36,8 +36,8 @@
#define FIX_M_1_847759065 -15137
#define FIX_M_1_961570560 -16069
#define FIX_M_2_562915447 -20995
-#define FIX_0xFFFF 0xFFFF
-
+#define FIX_0xFFFF 0xFFFF
+
#define FIX_0_298631336_ID 0
#define FIX_0_541196100_ID 4
#define FIX_0_765366865_ID 8
@@ -51,336 +51,336 @@
#define FIX_M_1_961570560_ID 40
#define FIX_M_2_562915447_ID 44
#define FIX_0xFFFF_ID 48
- .text
- .align
-
- .global j_rev_dct_ARM
+ .text
+ .align
+
+ .global j_rev_dct_ARM
j_rev_dct_ARM:
- stmdb sp!, { r4 - r12, lr } @ all callee saved regs
+ stmdb sp!, { r4 - r12, lr } @ all callee saved regs
- sub sp, sp, #4 @ reserve some space on the stack
- str r0, [ sp ] @ save the DCT pointer to the stack
+ sub sp, sp, #4 @ reserve some space on the stack
+ str r0, [ sp ] @ save the DCT pointer to the stack
- mov lr, r0 @ lr = pointer to the current row
- mov r12, #8 @ r12 = row-counter
- add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
+ mov lr, r0 @ lr = pointer to the current row
+ mov r12, #8 @ r12 = row-counter
+ add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
row_loop:
- ldrsh r0, [lr, # 0] @ r0 = 'd0'
- ldrsh r1, [lr, # 8] @ r1 = 'd1'
-
- @ Optimization for row that have all items except the first set to 0
- @ (this works as the DCTELEMS are always 4-byte aligned)
- ldr r5, [lr, # 0]
- ldr r2, [lr, # 4]
- ldr r3, [lr, # 8]
- ldr r4, [lr, #12]
- orr r3, r3, r4
- orr r3, r3, r2
- orrs r5, r3, r5
- beq end_of_row_loop @ nothing to be done as ALL of them are '0'
- orrs r2, r3, r1
- beq empty_row
-
- ldrsh r2, [lr, # 2] @ r2 = 'd2'
- ldrsh r4, [lr, # 4] @ r4 = 'd4'
- ldrsh r6, [lr, # 6] @ r6 = 'd6'
-
- ldr r3, [r11, #FIX_0_541196100_ID]
- add r7, r2, r6
- ldr r5, [r11, #FIX_M_1_847759065_ID]
- mul r7, r3, r7 @ r7 = z1
- ldr r3, [r11, #FIX_0_765366865_ID]
- mla r6, r5, r6, r7 @ r6 = tmp2
- add r5, r0, r4 @ r5 = tmp0
- mla r2, r3, r2, r7 @ r2 = tmp3
- sub r3, r0, r4 @ r3 = tmp1
-
- add r0, r2, r5, lsl #13 @ r0 = tmp10
- rsb r2, r2, r5, lsl #13 @ r2 = tmp13
- add r4, r6, r3, lsl #13 @ r4 = tmp11
- rsb r3, r6, r3, lsl #13 @ r3 = tmp12
-
- stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
-
- ldrsh r3, [lr, #10] @ r3 = 'd3'
- ldrsh r5, [lr, #12] @ r5 = 'd5'
- ldrsh r7, [lr, #14] @ r7 = 'd7'
-
- add r0, r3, r5 @ r0 = 'z2'
- add r2, r1, r7 @ r2 = 'z1'
- add r4, r3, r7 @ r4 = 'z3'
- add r6, r1, r5 @ r6 = 'z4'
- ldr r9, [r11, #FIX_1_175875602_ID]
- add r8, r4, r6 @ r8 = z3 + z4
- ldr r10, [r11, #FIX_M_0_899976223_ID]
- mul r8, r9, r8 @ r8 = 'z5'
- ldr r9, [r11, #FIX_M_2_562915447_ID]
- mul r2, r10, r2 @ r2 = 'z1'
- ldr r10, [r11, #FIX_M_1_961570560_ID]
- mul r0, r9, r0 @ r0 = 'z2'
- ldr r9, [r11, #FIX_M_0_390180644_ID]
- mla r4, r10, r4, r8 @ r4 = 'z3'
- ldr r10, [r11, #FIX_0_298631336_ID]
- mla r6, r9, r6, r8 @ r6 = 'z4'
- ldr r9, [r11, #FIX_2_053119869_ID]
- mla r7, r10, r7, r2 @ r7 = tmp0 + z1
- ldr r10, [r11, #FIX_3_072711026_ID]
- mla r5, r9, r5, r0 @ r5 = tmp1 + z2
- ldr r9, [r11, #FIX_1_501321110_ID]
- mla r3, r10, r3, r0 @ r3 = tmp2 + z2
- add r7, r7, r4 @ r7 = tmp0
- mla r1, r9, r1, r2 @ r1 = tmp3 + z1
- add r5, r5, r6 @ r5 = tmp1
- add r3, r3, r4 @ r3 = tmp2
- add r1, r1, r6 @ r1 = tmp3
-
- ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
- @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
-
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
- add r8, r0, r1
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 0]
-
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
- sub r8, r0, r1
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #14]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
- add r8, r6, r3
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 2]
-
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
- sub r8, r6, r3
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #12]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
- add r8, r4, r5
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 4]
-
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
- sub r8, r4, r5
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #10]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
- add r8, r2, r7
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 6]
-
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
- sub r8, r2, r7
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 8]
-
- @ End of row loop
- add lr, lr, #16
- subs r12, r12, #1
- bne row_loop
- beq start_column_loop
-
+ ldrsh r0, [lr, # 0] @ r0 = 'd0'
+ ldrsh r1, [lr, # 8] @ r1 = 'd1'
+
+ @ Optimization for row that have all items except the first set to 0
+ @ (this works as the DCTELEMS are always 4-byte aligned)
+ ldr r5, [lr, # 0]
+ ldr r2, [lr, # 4]
+ ldr r3, [lr, # 8]
+ ldr r4, [lr, #12]
+ orr r3, r3, r4
+ orr r3, r3, r2
+ orrs r5, r3, r5
+ beq end_of_row_loop @ nothing to be done as ALL of them are '0'
+ orrs r2, r3, r1
+ beq empty_row
+
+ ldrsh r2, [lr, # 2] @ r2 = 'd2'
+ ldrsh r4, [lr, # 4] @ r4 = 'd4'
+ ldrsh r6, [lr, # 6] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r7, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r7, r3, r7 @ r7 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r7 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r7 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r3, r6, r3, lsl #13 @ r3 = tmp12
+
+ stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ ldrsh r3, [lr, #10] @ r3 = 'd3'
+ ldrsh r5, [lr, #12] @ r5 = 'd5'
+ ldrsh r7, [lr, #14] @ r7 = 'd7'
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6 @ r8 = z3 + z4
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
+ add r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 0]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
+ sub r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #14]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
+ add r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 2]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
+ sub r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #12]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
+ add r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 4]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
+ sub r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #10]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
+ add r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 6]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
+ sub r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 8]
+
+ @ End of row loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
+ beq start_column_loop
+
empty_row:
- ldr r1, [r11, #FIX_0xFFFF_ID]
- mov r0, r0, lsl #2
- and r0, r0, r1
- add r0, r0, r0, lsl #16
- str r0, [lr, # 0]
- str r0, [lr, # 4]
- str r0, [lr, # 8]
- str r0, [lr, #12]
+ ldr r1, [r11, #FIX_0xFFFF_ID]
+ mov r0, r0, lsl #2
+ and r0, r0, r1
+ add r0, r0, r0, lsl #16
+ str r0, [lr, # 0]
+ str r0, [lr, # 4]
+ str r0, [lr, # 8]
+ str r0, [lr, #12]
end_of_row_loop:
- @ End of loop
- add lr, lr, #16
- subs r12, r12, #1
- bne row_loop
+ @ End of loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
start_column_loop:
- @ Start of column loop
- ldr lr, [ sp ]
- mov r12, #8
+ @ Start of column loop
+ ldr lr, [ sp ]
+ mov r12, #8
column_loop:
- ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
- ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
- ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
- ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
-
- ldr r3, [r11, #FIX_0_541196100_ID]
- add r1, r2, r6
- ldr r5, [r11, #FIX_M_1_847759065_ID]
- mul r1, r3, r1 @ r1 = z1
- ldr r3, [r11, #FIX_0_765366865_ID]
- mla r6, r5, r6, r1 @ r6 = tmp2
- add r5, r0, r4 @ r5 = tmp0
- mla r2, r3, r2, r1 @ r2 = tmp3
- sub r3, r0, r4 @ r3 = tmp1
-
- add r0, r2, r5, lsl #13 @ r0 = tmp10
- rsb r2, r2, r5, lsl #13 @ r2 = tmp13
- add r4, r6, r3, lsl #13 @ r4 = tmp11
- rsb r6, r6, r3, lsl #13 @ r6 = tmp12
-
- ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
- ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
- ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
- ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
-
- @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
- orr r9, r1, r3
- orr r10, r5, r7
- orrs r10, r9, r10
- beq empty_odd_column
-
- stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
-
- add r0, r3, r5 @ r0 = 'z2'
- add r2, r1, r7 @ r2 = 'z1'
- add r4, r3, r7 @ r4 = 'z3'
- add r6, r1, r5 @ r6 = 'z4'
- ldr r9, [r11, #FIX_1_175875602_ID]
- add r8, r4, r6
- ldr r10, [r11, #FIX_M_0_899976223_ID]
- mul r8, r9, r8 @ r8 = 'z5'
- ldr r9, [r11, #FIX_M_2_562915447_ID]
- mul r2, r10, r2 @ r2 = 'z1'
- ldr r10, [r11, #FIX_M_1_961570560_ID]
- mul r0, r9, r0 @ r0 = 'z2'
- ldr r9, [r11, #FIX_M_0_390180644_ID]
- mla r4, r10, r4, r8 @ r4 = 'z3'
- ldr r10, [r11, #FIX_0_298631336_ID]
- mla r6, r9, r6, r8 @ r6 = 'z4'
- ldr r9, [r11, #FIX_2_053119869_ID]
- mla r7, r10, r7, r2 @ r7 = tmp0 + z1
- ldr r10, [r11, #FIX_3_072711026_ID]
- mla r5, r9, r5, r0 @ r5 = tmp1 + z2
- ldr r9, [r11, #FIX_1_501321110_ID]
- mla r3, r10, r3, r0 @ r3 = tmp2 + z2
- add r7, r7, r4 @ r7 = tmp0
- mla r1, r9, r1, r2 @ r1 = tmp3 + z1
- add r5, r5, r6 @ r5 = tmp1
- add r3, r3, r4 @ r3 = tmp2
- add r1, r1, r6 @ r1 = tmp3
-
- ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
- @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
-
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
- add r8, r0, r1
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 0*8)]
-
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
- sub r8, r0, r1
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(14*8)]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
- add r8, r4, r3
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 2*8)]
-
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
- sub r8, r4, r3
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(12*8)]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
- add r8, r6, r5
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 4*8)]
-
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
- sub r8, r6, r5
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(10*8)]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
- add r8, r2, r7
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 6*8)]
-
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
- sub r8, r2, r7
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 8*8)]
-
- @ End of row loop
- add lr, lr, #2
- subs r12, r12, #1
- bne column_loop
- beq the_end
-
+ ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
+ ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
+ ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
+ ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r1, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r1, r3, r1 @ r1 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r1 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r1 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r6, r6, r3, lsl #13 @ r6 = tmp12
+
+ ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
+ ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
+ ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
+ ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
+
+ @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
+ orr r9, r1, r3
+ orr r10, r5, r7
+ orrs r10, r9, r10
+ beq empty_odd_column
+
+ stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ add r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 0*8)]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ sub r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ add r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 2*8)]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ sub r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ add r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 4*8)]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ sub r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ add r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 6*8)]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ sub r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 8*8)]
+
+ @ End of row loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+ beq the_end
+
empty_odd_column:
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
- add r0, r0, #(1<<17)
- mov r0, r0, asr #18
- strh r0, [lr, #( 0*8)]
- strh r0, [lr, #(14*8)]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
- add r4, r4, #(1<<17)
- mov r4, r4, asr #18
- strh r4, [lr, #( 2*8)]
- strh r4, [lr, #(12*8)]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
- add r6, r6, #(1<<17)
- mov r6, r6, asr #18
- strh r6, [lr, #( 4*8)]
- strh r6, [lr, #(10*8)]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
- add r2, r2, #(1<<17)
- mov r2, r2, asr #18
- strh r2, [lr, #( 6*8)]
- strh r2, [lr, #( 8*8)]
-
- @ End of row loop
- add lr, lr, #2
- subs r12, r12, #1
- bne column_loop
-
-the_end:
- @ The end....
- add sp, sp, #4
- ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ add r0, r0, #(1<<17)
+ mov r0, r0, asr #18
+ strh r0, [lr, #( 0*8)]
+ strh r0, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ add r4, r4, #(1<<17)
+ mov r4, r4, asr #18
+ strh r4, [lr, #( 2*8)]
+ strh r4, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ add r6, r6, #(1<<17)
+ mov r6, r6, asr #18
+ strh r6, [lr, #( 4*8)]
+ strh r6, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ add r2, r2, #(1<<17)
+ mov r2, r2, asr #18
+ strh r2, [lr, #( 6*8)]
+ strh r2, [lr, #( 8*8)]
+
+ @ End of row loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+
+the_end:
+ @ The end....
+ add sp, sp, #4
+ ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
const_array:
- .align
- .word FIX_0_298631336
- .word FIX_0_541196100
- .word FIX_0_765366865
- .word FIX_1_175875602
- .word FIX_1_501321110
- .word FIX_2_053119869
- .word FIX_3_072711026
- .word FIX_M_0_390180644
- .word FIX_M_0_899976223
- .word FIX_M_1_847759065
- .word FIX_M_1_961570560
- .word FIX_M_2_562915447
- .word FIX_0xFFFF
+ .align
+ .word FIX_0_298631336
+ .word FIX_0_541196100
+ .word FIX_0_765366865
+ .word FIX_1_175875602
+ .word FIX_1_501321110
+ .word FIX_2_053119869
+ .word FIX_3_072711026
+ .word FIX_M_0_390180644
+ .word FIX_M_0_899976223
+ .word FIX_M_1_847759065
+ .word FIX_M_1_961570560
+ .word FIX_M_2_562915447
+ .word FIX_0xFFFF
diff --git a/src/libffmpeg/libavcodec/armv4l/mpegvideo_arm.c b/src/libffmpeg/libavcodec/armv4l/mpegvideo_arm.c
index 6e4c9fb3c..263e3c5bc 100644
--- a/src/libffmpeg/libavcodec/armv4l/mpegvideo_arm.c
+++ b/src/libffmpeg/libavcodec/armv4l/mpegvideo_arm.c
@@ -13,7 +13,7 @@
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
diff --git a/src/libffmpeg/libavcodec/armv4l/simple_idct_arm.S b/src/libffmpeg/libavcodec/armv4l/simple_idct_arm.S
index 95ac0dee4..43751896d 100644
--- a/src/libffmpeg/libavcodec/armv4l/simple_idct_arm.S
+++ b/src/libffmpeg/libavcodec/armv4l/simple_idct_arm.S
@@ -1,4 +1,4 @@
-/*
+/*
* simple_idct_arm.S
* Copyright (C) 2002 Frederic 'dilb' Boulay.
* All Rights Reserved.
@@ -16,11 +16,11 @@
*
* You should have received a copy of the GNU General Public License
* along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*
* The function defined in this file, is derived from the simple_idct function
- * from the libavcodec library part of the ffmpeg project.
+ * from the libavcodec library part of the ffmpeg project.
*/
/* useful constants for the algorithm, they are save in __constant_ptr__ at */
@@ -51,9 +51,9 @@
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
- .text
- .align
- .global simple_idct_ARM
+ .text
+ .align
+ .global simple_idct_ARM
simple_idct_ARM:
@@ void simple_idct_ARM(int16_t *block)
@@ -120,8 +120,8 @@ __b_evaluation:
ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if null avoid muls
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if null avoid muls
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ -147,7 +147,7 @@ __b_evaluation:
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
- teq r3, #0 @ if null avoid muls
+ teq r3, #0 @ if null avoid muls
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
@@ -155,7 +155,7 @@ __b_evaluation:
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1
@@ R3 is free now
- teq r4, #0 @ if null avoid muls
+ teq r4, #0 @ if null avoid muls
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
@@ -187,7 +187,7 @@ __a_evaluation:
teq r2, #0
beq __end_bef_a_evaluation
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
@@ -203,7 +203,7 @@ __a_evaluation:
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
+ teq r11, #0 @ if null avoid muls
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
@@ -212,7 +212,7 @@ __a_evaluation:
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
+ teq r9, #0 @ if null avoid muls
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ -294,165 +294,165 @@ __end_row_loop:
- @@ at this point, R0=block, R1-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
+ @@ at this point, R0=block, R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
__col_loop:
__b_evaluation2:
- @@ at this point, R0=block (temp), R1-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- @@ proceed with b0-b3 first, followed by a0-a3
- @@ MUL16(b0, W1, col[8x1]);
- @@ MUL16(b1, W3, col[8x1]);
- @@ MUL16(b2, W5, col[8x1]);
- @@ MUL16(b3, W7, col[8x1]);
- @@ MAC16(b0, W3, col[8x3]);
- @@ MAC16(b1, -W7, col[8x3]);
- @@ MAC16(b2, -W1, col[8x3]);
- @@ MAC16(b3, -W5, col[8x3]);
- ldr r8, [r12, #offW1] @ R8=W1
- ldrsh r7, [r14, #16]
- mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r9, [r12, #offW3] @ R9=W3
- ldr r10, [r12, #offW5] @ R10=W5
- mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r11, [r12, #offW7] @ R11=W7
- mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldrsh r2, [r14, #48]
- mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if 0, then avoid muls
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- rsbne r2, r2, #0 @ R2=-ROWr16[3]
- mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-
- @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
- @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
- @@ R12=__const_ptr_, R14=&block[n]
- @@ MAC16(b0, W5, col[5x8]);
- @@ MAC16(b2, W7, col[5x8]);
- @@ MAC16(b3, W3, col[5x8]);
- @@ MAC16(b1, -W1, col[5x8]);
- @@ MAC16(b0, W7, col[7x8]);
- @@ MAC16(b2, W3, col[7x8]);
- @@ MAC16(b3, -W1, col[7x8]);
- @@ MAC16(b1, -W5, col[7x8]);
- ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
- teq r3, #0 @ if 0 then avoid muls
- mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
- mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
- mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
- rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
- ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
- mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1
- @@ R3 is free now
- teq r4, #0 @ if 0 then avoid muls
- mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
- mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
- rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
- mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
- mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
- @@ R4 is free now
+ @@ at this point, R0=block (temp), R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ proceed with b0-b3 first, followed by a0-a3
+ @@ MUL16(b0, W1, col[8x1]);
+ @@ MUL16(b1, W3, col[8x1]);
+ @@ MUL16(b2, W5, col[8x1]);
+ @@ MUL16(b3, W7, col[8x1]);
+ @@ MAC16(b0, W3, col[8x3]);
+ @@ MAC16(b1, -W7, col[8x3]);
+ @@ MAC16(b2, -W1, col[8x3]);
+ @@ MAC16(b3, -W5, col[8x3]);
+ ldr r8, [r12, #offW1] @ R8=W1
+ ldrsh r7, [r14, #16]
+ mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r9, [r12, #offW3] @ R9=W3
+ ldr r10, [r12, #offW5] @ R10=W5
+ mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r11, [r12, #offW7] @ R11=W7
+ mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldrsh r2, [r14, #48]
+ mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if 0, then avoid muls
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ rsbne r2, r2, #0 @ R2=-ROWr16[3]
+ mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ MAC16(b0, W5, col[5x8]);
+ @@ MAC16(b2, W7, col[5x8]);
+ @@ MAC16(b3, W3, col[5x8]);
+ @@ MAC16(b1, -W1, col[5x8]);
+ @@ MAC16(b0, W7, col[7x8]);
+ @@ MAC16(b2, W3, col[7x8]);
+ @@ MAC16(b3, -W1, col[7x8]);
+ @@ MAC16(b1, -W5, col[7x8]);
+ ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
+ teq r3, #0 @ if 0 then avoid muls
+ mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
+ mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
+ mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
+ rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
+ ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+ mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1
+ @@ R3 is free now
+ teq r4, #0 @ if 0 then avoid muls
+ mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
+ mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
+ rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
+ mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+ mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
+ @@ R4 is free now
__end_b_evaluation2:
- @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
- @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
__a_evaluation2:
- @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
- @@ a1 = a0 + W6 * row[2];
- @@ a2 = a0 - W6 * row[2];
- @@ a3 = a0 - W2 * row[2];
- @@ a0 = a0 + W2 * row[2];
- ldrsh r6, [r14, #0]
- ldr r9, [r12, #offW4] @ R9=W4
- mul r6, r9, r6 @ R6=W4*ROWr16[0]
- ldr r10, [r12, #offW6] @ R10=W6
- ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
- add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
- mul r11, r10, r4 @ R11=W6*ROWr16[2]
- ldr r8, [r12, #offW2] @ R8=W2
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
- sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
- mul r11, r8, r4 @ R11=W2*ROWr16[2]
- sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
- add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
-
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
- @@ a0 += W4*row[4]
- @@ a1 -= W4*row[4]
- @@ a2 -= W4*row[4]
- @@ a3 += W4*row[4]
- ldrsh r11, [r14, #64] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
- mulne r11, r9, r11 @ R11=W4*ROWr16[4]
- @@ R9 is free now
- addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
- subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
- subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
- ldrsh r9, [r14, #96] @ R9=ROWr16[6]
- addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
- @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
- mulne r11, r10, r9 @ R11=W6*ROWr16[6]
- addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
- mulne r10, r8, r9 @ R10=W2*ROWr16[6]
- @@ a0 += W6*row[6];
- @@ a3 -= W6*row[6];
- @@ a1 -= W2*row[6];
- @@ a2 += W2*row[6];
- subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
- subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
- addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+ @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+ @@ a1 = a0 + W6 * row[2];
+ @@ a2 = a0 - W6 * row[2];
+ @@ a3 = a0 - W2 * row[2];
+ @@ a0 = a0 + W2 * row[2];
+ ldrsh r6, [r14, #0]
+ ldr r9, [r12, #offW4] @ R9=W4
+ mul r6, r9, r6 @ R6=W4*ROWr16[0]
+ ldr r10, [r12, #offW6] @ R10=W6
+ ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
+ add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
+ mul r11, r10, r4 @ R11=W6*ROWr16[2]
+ ldr r8, [r12, #offW2] @ R8=W2
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ a0 += W4*row[4]
+ @@ a1 -= W4*row[4]
+ @@ a2 -= W4*row[4]
+ @@ a3 += W4*row[4]
+ ldrsh r11, [r14, #64] @ R11=ROWr16[4]
+ teq r11, #0 @ if null avoid muls
+ mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+ @@ R9 is free now
+ addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+ subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+ subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+ ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+ addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+ @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
+ teq r9, #0 @ if null avoid muls
+ mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+ addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+ mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+ @@ a0 += W6*row[6];
+ @@ a3 -= W6*row[6];
+ @@ a1 -= W2*row[6];
+ @@ a2 += W2*row[6];
+ subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+ addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2:
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
- @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
- @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
- @@ col[16] = ((a2 + b2) >> COL_SHIFT);
- @@ col[24] = ((a3 + b3) >> COL_SHIFT);
- @@ col[32] = ((a3 - b3) >> COL_SHIFT);
- @@ col[40] = ((a2 - b2) >> COL_SHIFT);
- @@ col[48] = ((a1 - b1) >> COL_SHIFT);
- @@ col[56] = ((a0 - b0) >> COL_SHIFT);
- @@@@@ no optimisation here @@@@@
- add r8, r6, r0 @ R8=a0+b0
- add r9, r2, r1 @ R9=a1+b1
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #0]
- strh r9, [r14, #16]
- add r8, r3, r5 @ R8=a2+b2
- add r9, r4, r7 @ R9=a3+b3
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #32]
- strh r9, [r14, #48]
- sub r8, r4, r7 @ R8=a3-b3
- sub r9, r3, r5 @ R9=a2-b2
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #64]
- strh r9, [r14, #80]
- sub r8, r2, r1 @ R8=a1-b1
- sub r9, r6, r0 @ R9=a0-b0
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #96]
- strh r9, [r14, #112]
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+ @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+ @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+ @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+ @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+ @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+ @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+ @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+ @@@@@ no optimisation here @@@@@
+ add r8, r6, r0 @ R8=a0+b0
+ add r9, r2, r1 @ R9=a1+b1
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #0]
+ strh r9, [r14, #16]
+ add r8, r3, r5 @ R8=a2+b2
+ add r9, r4, r7 @ R9=a3+b3
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #32]
+ strh r9, [r14, #48]
+ sub r8, r4, r7 @ R8=a3-b3
+ sub r9, r3, r5 @ R9=a2-b2
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #64]
+ strh r9, [r14, #80]
+ sub r8, r2, r1 @ R8=a1-b1
+ sub r9, r6, r0 @ R9=a0-b0
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #96]
+ strh r9, [r14, #112]
__end_col_loop:
- @@ at this point, R0-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- ldr r0, [sp, #0] @ R0=block
- teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
- sub r14, r14, #2
- bne __col_loop
+ @@ at this point, R0-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ ldr r0, [sp, #0] @ R0=block
+ teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
+ sub r14, r14, #2
+ bne __col_loop
@@ -466,15 +466,15 @@ __end_simple_idct_ARM:
@@ kind of sub-function, here not to overload the common case.
__end_bef_a_evaluation:
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
- bal __end_a_evaluation
+ bal __end_a_evaluation
__constant_ptr__: @@ see #defines at the beginning of the source code for values.
- .align
+ .align
.word W1
.word W2
.word W3