Commit 207c4b8

Merge branch 'oneapi-src:master' into master
2 parents 2c30364 + 78a5d9d commit 207c4b8

File tree: 631 files changed (+19087 / −2626 lines)


AI-and-Analytics/Features-and-Functionality/IntelPyTorch_Extensions_AutoMixedPrecision/README.md (+2 −2)

@@ -34,8 +34,8 @@ Third party program Licenses can be found here: [third-party-programs.txt](https
 ### On a Linux\* System

-Please follow instructions [here](https://intel.github.io/intel-extension-for-pytorch/1.11.200/tutorials/installation.html).
+Please follow instructions [here](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html).

 ## Running the Sample

-Please follow instructions [here](https://intel.github.io/intel-extension-for-pytorch/1.11.200/tutorials/examples.html#complete-bfloat16).
+Please follow instructions [here](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/examples.html#complete-bfloat16).
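Both links above now point at the "complete-bfloat16" walkthrough for Intel® Extension for PyTorch*. As a quick orientation, here is a minimal, hedged sketch of the BF16 auto mixed precision pattern that walkthrough covers (illustrative only; the toy model and shapes are assumptions, not the tutorial's code):

```python
# Illustrative BF16 auto-mixed-precision sketch, not the tutorial's exact code.
# Assumes torch and intel_extension_for_pytorch are installed on a CPU system.
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(
    torch.nn.Linear(64, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10),
).eval()

# Let IPEX prepare the model for BF16 execution.
model = ipex.optimize(model, dtype=torch.bfloat16)

x = torch.rand(8, 64)
with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
    y = model(x)
print(y.dtype, y.shape)
```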

AI-and-Analytics/Getting-Started-Samples/README.md (+4 −1)

@@ -19,9 +19,12 @@ Third party program Licenses can be found here: [third-party-programs.txt](https
 | daal4py | [IntelPython_daal4py_GettingStarted](IntelPython_daal4py_GettingStarted) | Batch linear regression using the Python API package daal4py from oneAPI Data Analytics Library (oneDAL).
 | Intel® Neural Compressor | [INC-Sample-for-Tensorflow](INC-Sample-for-Tensorflow) | Quantize an FP32 model into INT8 with Intel® Neural Compressor, and compare FP32 and INT8 performance.
 | Modin | [IntelModin_GettingStarted](IntelModin_GettingStarted) | Run Modin-accelerated Pandas functions and note the performance gain.
-| PyTorch | [IntelPyTorch_GettingStarted](IntelPyTorch_GettingStarted) | A simple training example for PyTorch.
+| PyTorch | [IntelPyTorch_GettingStarted](Intel_Extension_For_PyTorch_GettingStarted) | A simple training example for PyTorch.
 | TensorFlow | [IntelTensorFlow_GettingStarted](IntelTensorFlow_GettingStarted) | A simple training example for TensorFlow.
 | XGBoost | [IntelPython_XGBoost_GettingStarted](IntelPython_XGBoost_GettingStarted) | Set up and train an XGBoost* model on datasets for prediction.
+| Modin | [IntelModin_Vs_Pandas](IntelModin_Vs_Pandas) | Compare the performance of Intel® Distribution of Modin* with that of stock Pandas.
+| Scikit-learn (oneDAL) | [Intel_Extension_For_SKLearn_GettingStarted](Intel_Extension_For_SKLearn_GettingStarted) | Speed up a Scikit-learn application using oneDAL.
+| oneAPI docker image | [IntelAIKitContainer_GettingStarted](IntelAIKitContainer_GettingStarted) | Configuration script to automatically configure the environment. |

 # Using Samples in Intel® DevCloud for oneAPI

@@ -0,0 +1,26 @@ (new file)
+## Title
+Reductions using numba-dpex: This is part 8 of the AI Numba-dpex essentials training series.
+
+## Requirements
+| Optimized for | Description
+|:--- |:---
+| OS | Linux* Ubuntu 18.04, 20.04; Windows* 10
+| Hardware | Skylake with GEN9 or newer
+| Software | Intel® oneAPI DPC++ Compiler, Jupyter Notebooks, Intel DevCloud
+
+## Purpose
+These hands-on exercises show how to perform reductions using numba-dpex.
+
+## License
+Code samples are licensed under the MIT license. See [License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+Third party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt)
+
+## Install Directions
+
+The Jupyter notebooks are tested on and can be run in Intel DevCloud.
+Below are the steps to access these Jupyter notebooks on Intel DevCloud:
+1. Register on [Intel DevCloud](https://intelsoftwaresites.secure.force.com/Devcloud/oneapi).
+2. Go to the "Terminal" in Intel DevCloud.
+3. Navigate to the "oneAPI-samples/AI-and-Analytics/Jupyter/Numba_dpex_Essentials_training" folder, open Welcome.ipynb, click on the "Module 8 - dpex_reductions" notebook, and follow the instructions.

AI-and-Analytics/Jupyter/Numba_dpex_Essentials_training/08_dpex_reductions/dpex_reductions.ipynb (+874)

Large diffs are not rendered by default.
@@ -0,0 +1,33 @@ (new file)
+##==============================================================
+## Copyright © Intel Corporation
+##
+## SPDX-License-Identifier: Apache-2.0
+## =============================================================
+
+import dpnp as np
+import numba_dpex as ndpex
+import timeit
+
+
+@ndpex.kernel
+def atomic_reduction(a):
+    idx = ndpex.get_global_id(0)
+    ndpex.atomic.add(a, 0, a[idx])
+
+
+def main():
+    N = 1024
+    a = np.arange(N)
+
+    # print("Using device ...")
+    # print(a.device)
+
+    atomic_reduction[N, ndpex.DEFAULT_LOCAL_SIZE](a)
+    # print("Reduction sum =", a[0])
+
+    # print("Done...")
+
+
+if __name__ == "__main__":
+    t = timeit.Timer(lambda: main())
+    print("Time to calculate reduction using atomics", t.timeit(500), "seconds")
@@ -0,0 +1,91 @@ (new file)
+##==============================================================
+## Copyright © Intel Corporation
+##
+## SPDX-License-Identifier: Apache-2.0
+## =============================================================
+
+import dpctl
+import numpy as np
+from numba import float32
+
+import numba_dpex as dpex
+
+
+def no_arg_barrier_support():
+    """
+    This example demonstrates the usage of numba_dpex's ``barrier``
+    intrinsic function. The ``barrier`` function is usable only inside
+    a ``kernel`` and is equivalent to OpenCL's ``barrier`` function.
+    """
+
+    @dpex.kernel
+    def twice(A):
+        i = dpex.get_global_id(0)
+        d = A[i]
+        # no argument defaults to a global memory fence
+        dpex.barrier()
+        A[i] = d * 2
+
+    N = 10
+    arr = np.arange(N).astype(np.float32)
+    print(arr)
+
+    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
+    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
+    device = dpctl.select_default_device()
+    print("Using device ...")
+    device.print_device_info()
+
+    with dpctl.device_context(device):
+        twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
+
+    # the output should be `arr * 2`, i.e. [0, 2, 4, 6, ...]
+    print(arr)
+
+
+def local_memory():
+    """
+    This example demonstrates the usage of numba-dpex's `local.array`
+    intrinsic function. The function is used to create a static array
+    allocated on the device's local address space.
+    """
+    blocksize = 10
+
+    @dpex.kernel
+    def reverse_array(A):
+        lm = dpex.local.array(shape=10, dtype=float32)
+        i = dpex.get_global_id(0)
+
+        # preload
+        lm[i] = A[i]
+        # a local or global barrier both work here, since there is only one work group
+        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)  # local memory fence
+        # write
+        A[i] += lm[blocksize - 1 - i]
+
+    arr = np.arange(blocksize).astype(np.float32)
+    print(arr)
+
+    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
+    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
+    device = dpctl.select_default_device()
+    print("Using device ...")
+    device.print_device_info()
+
+    with dpctl.device_context(device):
+        reverse_array[blocksize, dpex.DEFAULT_LOCAL_SIZE](arr)
+
+    # the output should be `orig[::-1] + orig`, i.e. [9, 9, 9, ...]
+    print(arr)
+
+
+def main():
+    no_arg_barrier_support()
+    local_memory()
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
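The comments in the example above point to SYCL_DEVICE_FILTER for changing the default device. Here is a brief, hedged sketch of selecting a device explicitly with dpctl instead of relying on the default selector (illustrative only; the filter strings are assumptions and only resolve if the corresponding backend/device is actually present):

```python
# Illustrative device selection, not part of the commit. Assumes dpctl is
# installed; the filter strings depend on the devices available on the system.
import os
import dpctl

# Option 1: steer the default selector via the environment variable; this
# generally has to be set before the SYCL runtime is first used.
os.environ.setdefault("SYCL_DEVICE_FILTER", "level_zero:gpu")

# Option 2: construct a device from a filter selector string directly and
# fall back to a CPU device if no GPU can be created.
try:
    device = dpctl.SyclDevice("gpu")
except dpctl.SyclDeviceCreationError:
    device = dpctl.SyclDevice("cpu")

device.print_device_info()
```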
@@ -0,0 +1,99 @@ (new file)
+# Copyright 2020, 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dpctl
+import numpy as np
+from numba import float32
+
+import numba_dpex as dpex
+
+
+def no_arg_barrier_support():
+    """
+    This example demonstrates the usage of numba_dpex's ``barrier``
+    intrinsic function. The ``barrier`` function is usable only inside
+    a ``kernel`` and is equivalent to OpenCL's ``barrier`` function.
+    """
+
+    @dpex.kernel
+    def twice(A):
+        i = dpex.get_global_id(0)
+        d = A[i]
+        # no argument defaults to a global memory fence
+        dpex.barrier()
+        A[i] = d * 2
+
+    N = 10
+    arr = np.arange(N).astype(np.float32)
+    print(arr)
+
+    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
+    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
+    device = dpctl.select_default_device()
+    print("Using device ...")
+    device.print_device_info()
+
+    with dpctl.device_context(device):
+        twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
+
+    # the output should be `arr * 2`, i.e. [0, 2, 4, 6, ...]
+    print(arr)
+
+
+def local_memory():
+    """
+    This example demonstrates the usage of numba-dpex's `local.array`
+    intrinsic function. The function is used to create a static array
+    allocated on the device's local address space.
+    """
+    blocksize = 10
+
+    @dpex.kernel
+    def reverse_array(A):
+        lm = dpex.local.array(shape=10, dtype=float32)
+        i = dpex.get_global_id(0)
+
+        # preload
+        lm[i] = A[i]
+        # a local or global barrier both work here, since there is only one work group
+        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)  # local memory fence
+        # write
+        A[i] += lm[blocksize - 1 - i]
+
+    arr = np.arange(blocksize).astype(np.float32)
+    print(arr)
+
+    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
+    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
+    device = dpctl.select_default_device()
+    print("Using device ...")
+    device.print_device_info()
+
+    with dpctl.device_context(device):
+        reverse_array[blocksize, dpex.DEFAULT_LOCAL_SIZE](arr)
+
+    # the output should be `orig[::-1] + orig`, i.e. [9, 9, 9, ...]
+    print(arr)
+
+
+def main():
+    no_arg_barrier_support()
+    local_memory()
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,57 @@ (new file)
+##==============================================================
+## Copyright © Intel Corporation
+##
+## SPDX-License-Identifier: Apache-2.0
+## =============================================================
+import dpctl
+import numpy as np
+from numba import float32
+
+import numba_dpex as dpex
+
+
+def private_memory():
+    """
+    This example demonstrates the usage of numba_dpex's `private.array`
+    intrinsic function. The function is used to create a static array
+    allocated on the device's private address space.
+    """
+
+    @dpex.kernel
+    def private_memory_kernel(A):
+        memory = dpex.private.array(shape=1, dtype=np.float32)
+        i = dpex.get_global_id(0)
+
+        # preload
+        memory[0] = i
+        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)  # local memory fence
+
+        # memory will not hold the correct deterministic result if it is not
+        # private to each thread.
+        A[i] = memory[0] * 2
+
+    N = 4
+    arr = np.zeros(N).astype(np.float32)
+    orig = np.arange(N).astype(np.float32)
+
+    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
+    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
+    device = dpctl.select_default_device()
+    print("Using device ...")
+    device.print_device_info()
+
+    with dpex.offload_to_sycl_device(device):
+        private_memory_kernel[N, N](arr)
+
+    # np.testing.assert_allclose(orig * 2, arr)
+    # the output should be `orig[i] * 2`, i.e. [0, 2, 4, ...]
+    print(arr)
+
+
+def main():
+    private_memory()
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
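The sample keeps its correctness check commented out. Here is a short, hedged sketch (illustrative only; `double_id` is a name introduced here, and the same numba_dpex/dpctl APIs as the file above are assumed) that runs the same private-memory pattern and re-enables the assertion:

```python
# Illustrative check, not part of the commit: each work item writes twice its
# own global id through a private scratch array, then the result is verified.
import dpctl
import numpy as np
import numba_dpex as dpex


@dpex.kernel
def double_id(A):
    tmp = dpex.private.array(shape=1, dtype=np.float32)  # per-work-item scratch
    i = dpex.get_global_id(0)
    tmp[0] = i
    A[i] = tmp[0] * 2


N = 4
arr = np.zeros(N, dtype=np.float32)

device = dpctl.select_default_device()
with dpex.offload_to_sycl_device(device):
    double_id[N, N](arr)

np.testing.assert_allclose(np.arange(N, dtype=np.float32) * 2, arr)
print("private-memory check passed:", arr)
```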
