arch/sh64/lib/page_copy.S

   1 /* Written by Richard P. Curnow, SuperH (UK) Ltd.
   2
   3    Tight version of memcpy for the case of just copying a page.
   4    Prefetch strategy empirically optimised against RTL simulations
   5    of SH5-101 cut2 eval chip with Cayman board DDR memory.
   6
   7    Parameters:
   8    r2 : source effective address (start of page)
   9    r3 : destination effective address (start of page)
  10
  11    Always copies 4096 bytes.
  12
  13    Points to review.
  14    * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
  15      It seems like the prefetch needs to be at least 4 lines ahead to get
  16      the data into the cache in time, and the allocos contend with outstanding
  17      prefetches for the same cache set, so it's better to have the numbers
  18      different.
  19    */
  20
  21         .section .text..SHmedia32,"ax"
  22         .little
  23
  24         .balign 8
  25         .global sh64_page_copy
     /* sh64_page_copy: copy one 4096-byte page, 32 bytes per loop pass.

        In:     r2  = source address
                r3  = destination address
                r18 = return address (moved into tr0, used by the final blink)
        On exit r3 has been advanced to destination + 4096; r2 is never written.
        Writes: r3, r6-r8, r22, r23, r36-r39, r60-r62, tr0-tr3.

        Only r3 is stepped inside the loop.  The source is always addressed
        as r3 + (source - destination), using offsets precomputed in
        r60, r61, r62, r23 (data) and r22 (prefetch).

        The distances used below imply a 32-byte line: "4 lines ahead" is
        0x80 bytes and "2 lines ahead" is 0x40 bytes.  */
  26 sh64_page_copy:
  27
  28         /* Copy 4096 bytes worth of data from r2 to r3.
  29            Do prefetches 4 lines ahead.
  30            Do alloco 2 lines ahead */
  31
  32         pta 1f, tr1             /* tr1 = label 1: top of the copy loop */
  33         pta 2f, tr2             /* tr2 = label 2: entry that skips the prefetch */
  34         pta 3f, tr3             /* tr3 = label 3: entry that skips the alloco too */
  35         ptabs r18, tr0          /* tr0 = return target taken from r18 */
  36
     /* Prime the loop: touch source lines 0-3 and allocate destination
        lines 0-1.  The loop itself only ever prefetches 0x80 ahead and
        allocates 0x40 ahead of r3, so these first lines would otherwise be
        missed.  Loads whose destination is r63 are used as prefetches here,
        exactly as in the loop body.  */
  37         ld.q r2, 0x00, r63
  38         ld.q r2, 0x20, r63
  39         ld.q r2, 0x40, r63
  40         ld.q r2, 0x60, r63
  41         alloco r3, 0x00
  42         alloco r3, 0x20
  43
     /* Loop bounds, all expressed as destination addresses.  */
  44         movi 3968, r6           /* 3968 = 4096 - 4*32 */
  45         add  r3, r6, r6         /* r6 = dst + 3968: from here on, no more prefetches */
  46         addi r6, 64, r7         /* r7 = dst + 4032: from here on, no more allocos */
  47         addi r7, 64, r8         /* r8 = dst + 4096: end of page, loop limit */
     /* Index registers: r3 + rN yields the source address for each access.  */
  48         sub r2, r3, r60         /* r60 = src - dst        (quadword 0 of the line) */
  49         addi r60, 8, r61        /* r61 = src - dst + 8    (quadword 1) */
  50         addi r61, 8, r62        /* r62 = src - dst + 16   (quadword 2) */
  51         addi r62, 8, r23        /* r23 = src - dst + 24   (quadword 3) */
  52         addi r60, 0x80, r22     /* r22 = src - dst + 0x80 (prefetch, 4 lines ahead) */
  53
  54 /* Minimal code size.  The extra branches inside the loop don't cost much
  55    because they overlap with the time spent waiting for prefetches to
  56    complete. */
  57 1:                              /* loop top; r3 = current destination line */
  58         bge/u r3, r6, tr2  ! skip prefetch for last 4 lines
  59         ldx.q r3, r22, r63 ! prefetch 4 lines hence
  60 2:                              /* reached directly once r3 >= r6 */
  61         bge/u r3, r7, tr3  ! skip alloco for last 2 lines
  62         alloco r3, 0x40    ! alloc destination line 2 lines ahead
  63 3:                              /* reached directly once r3 >= r7 */
     /* Move one 32-byte line: all four quadword loads are issued before
        the four stores.  */
  64         ldx.q r3, r60, r36
  65         ldx.q r3, r61, r37
  66         ldx.q r3, r62, r38
  67         ldx.q r3, r23, r39
  68         st.q  r3,   0, r36
  69         st.q  r3,   8, r37
  70         st.q  r3,  16, r38
  71         st.q  r3,  24, r39
  72         addi r3, 32, r3         /* next destination line; source follows via r60.. */
  73         bgt/l r8, r3, tr1       /* repeat while r3 has not reached r8 (dst + 4096) */
  74
  75         blink tr0, r63     ! return
  76
  77