mac_ppc.h

/*
    Copyright 2005-2013 Intel Corporation. All Rights Reserved.

    This file is part of Threading Building Blocks.

    Threading Building Blocks is free software; you can redistribute it
    and/or modify it under the terms of the GNU General Public License
    version 2 as published by the Free Software Foundation.

    Threading Building Blocks is distributed in the hope that it will be
    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Threading Building Blocks; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    As a special exception, you may use this file as part of a free software
    library without restriction. Specifically, if other files instantiate
    templates or use macros or inline functions from this file, or you compile
    this file and link it with other files to produce an executable, this
    file does not by itself cause the resulting executable to be covered by
    the GNU General Public License. This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/
#if !defined(__TBB_machine_H) || defined(__TBB_machine_gcc_power_H)
#error Do not #include this internal file directly; use public TBB headers instead.
#endif

#define __TBB_machine_gcc_power_H

#include <stdint.h>
#include <unistd.h>

// TODO: rename to gcc_power.h?
// This file is for Power Architecture with compilers supporting GNU inline-assembler syntax (currently GNU g++ and IBM XL).
// Note that XL V9.0 (sometimes?) has trouble dealing with empty input and/or clobber lists, so they should be avoided.

#if __powerpc64__ || __ppc64__
    // IBM XL documents __powerpc64__ (and __PPC64__).
    // Apple documents __ppc64__ (with __ppc__ only on 32-bit).
    #define __TBB_WORDSIZE 8
#else
    #define __TBB_WORDSIZE 4
#endif
#ifndef __BYTE_ORDER__
    // Hopefully endianness can be validly determined at runtime.
    // This may silently fail in some embedded systems with page-specific endianness.
#elif __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
    #define __TBB_BIG_ENDIAN 1
#elif __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
    #define __TBB_BIG_ENDIAN 0
#else
    #define __TBB_BIG_ENDIAN -1 // not currently supported
#endif
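
// Illustrative sketch (editorial, not part of TBB): when __BYTE_ORDER__ is not
// predefined, endianness can be probed at runtime roughly as follows; the
// function name is hypothetical.
inline bool __TBB_example_is_big_endian() {
    const uint32_t probe = 1;
    // On a big-endian machine the most significant byte comes first,
    // so the first byte of a 32-bit value 1 is 0.
    return *reinterpret_cast<const unsigned char*>(&probe) == 0;
}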
// On Power Architecture, (lock-free) 64-bit atomics require 64-bit hardware:
#if __TBB_WORDSIZE==8
    // Do not change the following definition, because TBB itself will use 64-bit atomics in 64-bit builds.
    #define __TBB_64BIT_ATOMICS 1
#elif __bgp__
    // Do not change the following definition, because this is known 32-bit hardware.
    #define __TBB_64BIT_ATOMICS 0
#else
    // To enable 64-bit atomics in 32-bit builds, set the value below to 1 instead of 0.
    // You must make certain that the program will only use them on actual 64-bit hardware
    // (which typically means that the entire program is only executed on such hardware),
    // because their implementation involves machine instructions that are illegal elsewhere.
    // The setting can be chosen independently per compilation unit,
    // which also means that TBB itself does not need to be rebuilt.
    // Alternatively (but only for the current architecture and TBB version),
    // override the default as a predefined macro when invoking the compiler.
    #ifndef __TBB_64BIT_ATOMICS
        #define __TBB_64BIT_ATOMICS 0
    #endif
#endif
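
// For example (editorial note), the override mentioned above can be supplied
// on the compiler command line for a 32-bit build that is known to run only
// on 64-bit hardware:
//     g++ -m32 -D__TBB_64BIT_ATOMICS=1 ...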
inline int32_t __TBB_machine_cmpswp4 (volatile void *ptr, int32_t value, int32_t comparand )
{
    int32_t result;

    __asm__ __volatile__("sync\n"
                         "0:\n\t"
                         "lwarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpw %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stwcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "isync"
                         : [res]"=&r"(result)
                         , "+m"(* (int32_t*) ptr)        /* redundant with "memory" */
                         : [ptr]"r"(ptr)
                         , [val]"r"(value)
                         , [cmp]"r"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmp and/or stwcx. */
                         );
    return result;
}
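
// Illustrative sketch (editorial, not part of TBB): the typical CAS-loop use of
// the primitive above. __TBB_machine_cmpswp4 returns the value observed at
// *ptr; the swap succeeded iff that value equals the comparand. The function
// name is hypothetical.
inline int32_t __TBB_example_fetch_add4( volatile void* ptr, int32_t addend ) {
    int32_t snapshot;
    do {
        snapshot = *reinterpret_cast<volatile int32_t*>(ptr); // read current value
        // retry if another thread changed *ptr between the read and the CAS
    } while ( __TBB_machine_cmpswp4(ptr, snapshot+addend, snapshot) != snapshot );
    return snapshot; // value before the addition, as fetch-and-add returns
}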
#if __TBB_WORDSIZE==8

inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
{
    int64_t result;
    __asm__ __volatile__("sync\n"
                         "0:\n\t"
                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "isync"
                         : [res]"=&r"(result)
                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
                         : [ptr]"r"(ptr)
                         , [val]"r"(value)
                         , [cmp]"r"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmp and/or stdcx. */
                         );
    return result;
}
#elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */
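
// Rationale (editorial note): in a 32-bit build the compiler would place an
// int64_t "r" operand in a pair of 32-bit registers, which ldarx/stdcx. cannot
// use. The 64-bit operands are therefore passed through memory ("m" operands)
// and the asm itself moves them into full 64-bit registers with ld/std; the
// dummy variables below merely reserve register names for that purpose.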
inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
{
    int64_t result;
    int64_t value_register, comparand_register, result_register; // dummy variables to allocate registers
    __asm__ __volatile__("sync\n\t"
                         "ld %[val],%[valm]\n\t"
                         "ld %[cmp],%[cmpm]\n"
                         "0:\n\t"
                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "std %[res],%[resm]\n\t"
                         "isync"
                         : [resm]"=m"(result)
                         , [res] "=&r"( result_register)
                         , [val] "=&r"( value_register)
                         , [cmp] "=&r"(comparand_register)
                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
                         : [ptr] "r"(ptr)
                         , [valm]"m"(value)
                         , [cmpm]"m"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmpd and/or stdcx. */
                         );
    return result;
}

#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
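
// Note (editorial): in load_with_acquire below, the artificial compare of the
// loaded value against itself and the never-taken branch, followed by isync,
// form the standard Power load-acquire idiom: the branch makes subsequent
// instructions control-dependent on the load, and isync keeps them from
// executing before the load completes.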
#define __TBB_MACHINE_DEFINE_LOAD_STORE(S,ldx,stx,cmpx) \
    template <typename T> \
    struct machine_load_store<T,S> { \
        static inline T load_with_acquire(const volatile T& location) { \
            T result; \
            __asm__ __volatile__(ldx " %[res],0(%[ptr])\n" \
                                 "0:\n\t" \
                                 cmpx " %[res],%[res]\n\t" \
                                 "bne- 0b\n\t" \
                                 "isync" \
                                 : [res]"=r"(result) \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */ \
                                 , "m"(location)       /* redundant with "memory" */ \
                                 : "memory"            /* compiler acquire fence */ \
                                 , "cr0"               /* clobbered by cmpw/cmpd */); \
            return result; \
        } \
        static inline void store_with_release(volatile T &location, T value) { \
            __asm__ __volatile__("lwsync\n\t" \
                                 stx " %[val],0(%[ptr])" \
                                 : "=m"(location)      /* redundant with "memory" */ \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */ \
                                 , [val]"r"(value) \
                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/); \
        } \
    }; \
    \
    template <typename T> \
    struct machine_load_store_relaxed<T,S> { \
        static inline T load (const __TBB_atomic T& location) { \
            T result; \
            __asm__ __volatile__(ldx " %[res],0(%[ptr])" \
                                 : [res]"=r"(result) \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */ \
                                 , "m"(location) \
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/ \
            return result; \
        } \
        static inline void store (__TBB_atomic T &location, T value) { \
            __asm__ __volatile__(stx " %[val],0(%[ptr])" \
                                 : "=m"(location) \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */ \
                                 , [val]"r"(value) \
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/ \
        } \
    };
namespace tbb {
namespace internal {
    __TBB_MACHINE_DEFINE_LOAD_STORE(1,"lbz","stb","cmpw")
    __TBB_MACHINE_DEFINE_LOAD_STORE(2,"lhz","sth","cmpw")
    __TBB_MACHINE_DEFINE_LOAD_STORE(4,"lwz","stw","cmpw")

#if __TBB_WORDSIZE==8

    __TBB_MACHINE_DEFINE_LOAD_STORE(8,"ld" ,"std","cmpd")

#elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */

    template <typename T>
    struct machine_load_store<T,8> {
        static inline T load_with_acquire(const volatile T& location) {
            T result;
            T result_register; // dummy variable to allocate a register
            __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
                                 "std %[res],%[resm]\n"
                                 "0:\n\t"
                                 "cmpd %[res],%[res]\n\t"
                                 "bne- 0b\n\t"
                                 "isync"
                                 : [resm]"=m"(result)
                                 , [res]"=&r"(result_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , "m"(location)       /* redundant with "memory" */
                                 : "memory"            /* compiler acquire fence */
                                 , "cr0"               /* clobbered by cmpd */);
            return result;
        }
        static inline void store_with_release(volatile T &location, T value) {
            T value_register; // dummy variable to allocate a register
            __asm__ __volatile__("lwsync\n\t"
                                 "ld %[val],%[valm]\n\t"
                                 "std %[val],0(%[ptr])"
                                 : "=m"(location)      /* redundant with "memory" */
                                 , [val]"=&r"(value_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , [valm]"m"(value)
                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/);
        }
    };
    template <typename T>
    struct machine_load_store_relaxed<T,8> {
        static inline T load (const volatile T& location) {
            T result;
            T result_register; // dummy variable to allocate a register
            __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
                                 "std %[res],%[resm]"
                                 : [resm]"=m"(result)
                                 , [res]"=&r"(result_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , "m"(location)
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/
            return result;
        }
        static inline void store (volatile T &location, T value) {
            T value_register; // dummy variable to allocate a register
            __asm__ __volatile__("ld %[val],%[valm]\n\t"
                                 "std %[val],0(%[ptr])"
                                 : "=m"(location)
                                 , [val]"=&r"(value_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , [valm]"m"(value)
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/
        }
    };

    #define __TBB_machine_load_store_relaxed_8

#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */

}} // namespaces internal, tbb
#undef __TBB_MACHINE_DEFINE_LOAD_STORE

#define __TBB_USE_GENERIC_PART_WORD_CAS                     1
#define __TBB_USE_GENERIC_FETCH_ADD                         1
#define __TBB_USE_GENERIC_FETCH_STORE                       1
#define __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE 1

#define __TBB_control_consistency_helper() __asm__ __volatile__("isync": : :"memory")
#define __TBB_full_memory_fence()          __asm__ __volatile__( "sync": : :"memory")
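
// Note (editorial): on Power, "sync" is a full barrier ordering all earlier
// memory accesses against all later ones, while "isync" by itself only
// discards prefetched instructions; paired with a preceding dependent branch
// (as in __TBB_control_consistency_helper) it provides acquire semantics.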
static inline intptr_t __TBB_machine_lg( uintptr_t x ) {
    __TBB_ASSERT(x, "__TBB_Log2(0) undefined");
    // cntlzd/cntlzw starts counting at 2^63/2^31 (ignoring any higher-order bits), and does not affect cr0
#if __TBB_WORDSIZE==8
    __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
    return 63-static_cast<intptr_t>(x);
#else
    __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
    return 31-static_cast<intptr_t>(x);
#endif
}
#define __TBB_Log2(V) __TBB_machine_lg(V)
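
// Worked examples (editorial): __TBB_Log2 returns the index of the highest set
// bit, e.g. __TBB_Log2(1)==0, __TBB_Log2(32)==5, and __TBB_Log2(33)==5,
// since cntlzw(33)==26 and 31-26==5.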
// Assumes implicit alignment for any 32-bit value
typedef uint32_t __TBB_Flag;
#define __TBB_Flag __TBB_Flag

inline bool __TBB_machine_trylockbyte( __TBB_atomic __TBB_Flag &flag ) {
    return __TBB_machine_cmpswp4(&flag,1,0)==0;
}
#define __TBB_TryLockByte(P) __TBB_machine_trylockbyte(P)
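
// Illustrative sketch (editorial, not part of TBB): a minimal test-and-set
// spin lock built on __TBB_TryLockByte; the function names are hypothetical,
// and a real lock would back off or yield while spinning.
inline void __TBB_example_spin_acquire( __TBB_atomic __TBB_Flag &flag ) {
    while( !__TBB_TryLockByte(flag) ) {} // retry until the CAS from 0 to 1 succeeds
}
inline void __TBB_example_spin_release( __TBB_atomic __TBB_Flag &flag ) {
    // store 0 with release semantics so critical-section writes become visible first
    tbb::internal::machine_load_store<__TBB_Flag,4>::store_with_release(flag, 0);
}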