system76-coreboot/util/crossgcc/patches/gcc-6.3.0_nds32_ite.patch

diff --git a/gcc/common.opt b/gcc/common.opt
index 67048db..e6f8fd3 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1281,7 +1281,7 @@ ffast-math
 Common

 ffat-lto-objects
-Common Var(flag_fat_lto_objects)
+Common Var(flag_fat_lto_objects) Init(1)
 Output lto objects containing both the intermediate language and binary output.

 ffinite-math-only
diff --git a/gcc/common/config/nds32/nds32-common.c b/gcc/common/config/nds32/nds32-common.c
index fb75956..66ea95c 100644
--- a/gcc/common/config/nds32/nds32-common.c
+++ b/gcc/common/config/nds32/nds32-common.c
@@ -53,6 +53,16 @@ nds32_handle_option (struct gcc_options *opts ATTRIBUTE_UNUSED,

       return true;

+    case OPT_misr_secure_:
+      /* Check the valid security level: 0 1 2 3.  */
+      if (value < 0 || value > 3)
+	{
+	  error_at (loc, "for the option -misr-secure=X, the valid X "
+			 "must be: 0, 1, 2, or 3");
+	  return false;
+	}
+      return true;
+
     case OPT_mcache_block_size_:
       /* Check valid value: 4 8 16 32 64 128 256 512.  */
       if (exact_log2 (value) < 2 || exact_log2 (value) > 9)
@@ -74,15 +84,69 @@ nds32_handle_option (struct gcc_options *opts ATTRIBUTE_UNUSED,
 /* Implement TARGET_OPTION_OPTIMIZATION_TABLE.  */
 static const struct default_options nds32_option_optimization_table[] =
 {
-  /* Enable -fomit-frame-pointer by default at -O1 or higher.  */
-  { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
+#ifdef TARGET_DEFAULT_NO_MATH_ERRNO
+  /* Under some configuration, we would like to use -fno-math-errno by default
+     at all optimization levels for performance and code size consideration.
+     Please check gcc/config.gcc for more implementation details.  */
+  { OPT_LEVELS_ALL,               OPT_fmath_errno,         NULL, 0 },
+#endif
+#if TARGET_LINUX_ABI == 0
+  /* Disable -fdelete-null-pointer-checks by default in ELF toolchain.  */
+  { OPT_LEVELS_ALL,               OPT_fdelete_null_pointer_checks,
+							   NULL, 0 },
+#endif
+  /* Enable -fsched-pressure by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_fsched_pressure,     NULL, 1 },
+  /* Enable -fomit-frame-pointer by default at all optimization levels.  */
+  { OPT_LEVELS_ALL,               OPT_fomit_frame_pointer, NULL, 1 },
+  /* Enable -mrelax-hint by default at all optimization levels.  */
+  { OPT_LEVELS_ALL,               OPT_mrelax_hint,         NULL, 1 },
+  /* Enable -mabi-compatible by default at all optimization levels.  */
+  { OPT_LEVELS_ALL,               OPT_mabi_compatible,     NULL, 1 },
+  /* Enable -malways-align by default at -O1 and above, but not -Os or -Og.  */
+  { OPT_LEVELS_1_PLUS_SPEED_ONLY, OPT_malways_align,       NULL, 1 },
   /* Enable -mv3push by default at -Os, but it is useless under V2 ISA.  */
-  { OPT_LEVELS_SIZE,   OPT_mv3push,             NULL, 1 },
-
-  { OPT_LEVELS_NONE,   0,                       NULL, 0 }
+  { OPT_LEVELS_SIZE,              OPT_mv3push,             NULL, 1 },
+  /* Enable -mload-store-opt by default at -Os.  */
+  { OPT_LEVELS_SIZE,              OPT_mload_store_opt,     NULL, 1 },
+  /* Enable -mregrename by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_mregrename,          NULL, 1 },
+  /* Enable -mgcse by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_mgcse,               NULL, 1 },
+  /* Enable -msign-conversion by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_msign_conversion,    NULL, 1 },
+  /* Enable -mscalbn-transform by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_mscalbn_transform,   NULL, 1 },
+  /* Enable -mconst-remater by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_mconst_remater, NULL, 1 },
+  /* Enable -mcprop-acc by default at -O1 and above.  */
+  { OPT_LEVELS_1_PLUS,            OPT_mcprop_acc,   NULL, 1 },
+#ifdef TARGET_OS_DEFAULT_IFC
+  /* Enable -mifc by default at -Os, but it is useless under V2/V3M ISA.  */
+  { OPT_LEVELS_SIZE,              OPT_mifc,                NULL, 1 },
+#endif
+#ifdef TARGET_OS_DEFAULT_EX9
+  /* Enable -mex9 by default at -Os, but it is useless under V2/V3M ISA.  */
+  { OPT_LEVELS_SIZE,              OPT_mex9,                NULL, 1 },
+#endif
+
+  { OPT_LEVELS_NONE,              0,                       NULL, 0 }
 };

 /* ------------------------------------------------------------------------ */
+
+/* Implement TARGET_EXCEPT_UNWIND_INFO: DWARF2 for the Linux ABI, else SJLJ.  */
+static enum unwind_info_type
+nds32_except_unwind_info (struct gcc_options *opts ATTRIBUTE_UNUSED)
+{
+  /* Toolchains built without the Linux ABI keep the SJLJ unwinder.  */
+  if (!TARGET_LINUX_ABI)
+    return UI_SJLJ;
+  return UI_DWARF2;
+}
+
+/* ------------------------------------------------------------------------ */
+

 /* Run-time Target Specification.  */

@@ -95,14 +159,22 @@ static const struct default_options nds32_option_optimization_table[] =

    Other MASK_XXX flags are set individually.
    By default we enable
-     TARGET_16_BIT   : Generate 16/32 bit mixed length instruction.
-     TARGET_PERF_EXT : Generate performance extention instrcution.
-     TARGET_CMOV     : Generate conditional move instruction.  */
+     TARGET_16_BIT     : Generate 16/32 bit mixed length instruction.
+     TARGET_EXT_PERF   : Generate performance extension instruction.
+     TARGET_EXT_PERF2  : Generate performance extension version 2 instruction.
+     TARGET_EXT_STRING : Generate string extension instruction.
+     TARGET_HW_ABS     : Generate hardware abs instruction.
+     TARGET_CMOV       : Generate conditional move instruction.  */
 #undef TARGET_DEFAULT_TARGET_FLAGS
 #define TARGET_DEFAULT_TARGET_FLAGS		\
   (TARGET_CPU_DEFAULT				\
+   | TARGET_DEFAULT_FPU_ISA			\
+   | TARGET_DEFAULT_FPU_FMA			\
    | MASK_16_BIT				\
-   | MASK_PERF_EXT				\
+   | MASK_EXT_PERF				\
+   | MASK_EXT_PERF2				\
+   | MASK_EXT_STRING				\
+   | MASK_HW_ABS				\
    | MASK_CMOV)

 #undef TARGET_HANDLE_OPTION
@@ -115,7 +187,7 @@ static const struct default_options nds32_option_optimization_table[] =
 /* Defining the Output Assembler Language.  */

 #undef TARGET_EXCEPT_UNWIND_INFO
-#define TARGET_EXCEPT_UNWIND_INFO sjlj_except_unwind_info
+#define TARGET_EXCEPT_UNWIND_INFO nds32_except_unwind_info

 /* ------------------------------------------------------------------------ */

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 1d5b23f..367a821 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -433,8 +433,28 @@ mips*-*-*)
 	;;
 nds32*)
 	cpu_type=nds32
-	extra_headers="nds32_intrinsic.h"
-	extra_objs="nds32-cost.o nds32-intrinsic.o nds32-isr.o nds32-md-auxiliary.o nds32-pipelines-auxiliary.o nds32-predicates.o nds32-memory-manipulation.o nds32-fp-as-gp.o"
+	extra_headers="nds32_intrinsic.h nds32_isr.h nds32_init.inc"
+	case ${target} in
+	  nds32*-*-linux*)
+	    extra_options="${extra_options} nds32/nds32-linux.opt"
+	    ;;
+	  nds32*-*-elf*)
+	    extra_options="${extra_options} nds32/nds32-elf.opt"
+	    ;;
+	  *)
+	    ;;
+	esac
+	extra_options="${extra_options} g.opt"
+	extra_objs="nds32-cost.o nds32-intrinsic.o nds32-md-auxiliary.o \
+		    nds32-pipelines-auxiliary.o nds32-predicates.o \
+		    nds32-memory-manipulation.o nds32-fp-as-gp.o \
+		    nds32-load-store-opt.o nds32-soft-fp-comm.o nds32-isr.o \
+		    nds32-regrename.o nds32-gcse.o nds32-relax-opt.o \
+		    nds32-sign-conversion.o \
+		    nds32-scalbn-transform.o nds32-lmwsmw.o \
+		    nds32-reg-utils.o nds32-const-remater.o \
+		    nds32-utils.o nds32-abi-compatible.o \
+		    nds32-cprop-acc.o"
 	;;
 nios2-*-*)
 	cpu_type=nios2
@@ -2265,17 +2285,67 @@ msp430*-*-*)
 	tmake_file="${tmake_file} msp430/t-msp430"
 	extra_gcc_objs="driver-msp430.o"
 	;;
-nds32le-*-*)
+nds32*-*-*)
 	target_cpu_default="0"
 	tm_defines="${tm_defines}"
-	tm_file="dbxelf.h elfos.h newlib-stdint.h ${tm_file}"
-	tmake_file="nds32/t-nds32 nds32/t-mlibs"
-	;;
-nds32be-*-*)
-	target_cpu_default="0|MASK_BIG_ENDIAN"
-	tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
-	tm_file="dbxelf.h elfos.h newlib-stdint.h ${tm_file}"
-	tmake_file="nds32/t-nds32 nds32/t-mlibs"
+	case ${target} in
+	  nds32le*-*-*)
+	    ;;
+	  nds32be-*-*)
+	    target_cpu_default="${target_cpu_default}|MASK_BIG_ENDIAN"
+	    tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
+	    ;;
+	esac
+	case ${target} in
+	  nds32*-*-elf*)
+	    tm_file="dbxelf.h elfos.h newlib-stdint.h ${tm_file} nds32/elf.h nds32/nds32_intrinsic.h"
+	    tmake_file="nds32/t-nds32 nds32/t-elf"
+	    ;;
+	  nds32*-*-linux*)
+	    tm_file="dbxelf.h elfos.h ${tm_file} gnu-user.h linux.h glibc-stdint.h nds32/linux.h nds32/nds32_intrinsic.h"
+	    tmake_file="${tmake_file} nds32/t-nds32 nds32/t-linux"
+	    ;;
+	esac
+	nds32_multilibs="${with_multilib_list}"
+	if test "$nds32_multilibs" = "default"; then
+	  nds32_multilibs=""
+	fi
+	nds32_multilibs=`echo $nds32_multilibs | sed -e 's/,/ /g'`
+	for nds32_multilib in ${nds32_multilibs}; do
+		case ${nds32_multilib} in
+		dsp | zol | v3m+ | graywolf )
+			TM_MULTILIB_CONFIG="${TM_MULTILIB_CONFIG} ${nds32_multilib}"
+			;;
+		*)
+			echo "--with-multilib-list=${nds32_multilib} not supported."
+			exit 1
+		esac
+	done
+
+	# Handle --enable-default-relax setting.
+	if test x${enable_default_relax} = xyes; then
+		tm_defines="${tm_defines} TARGET_DEFAULT_RELAX=1"
+	fi
+	# Handle --enable-Os-default-ifc setting.
+	if test x${enable_Os_default_ifc} = xyes; then
+		tm_defines="${tm_defines} TARGET_OS_DEFAULT_IFC=1"
+	fi
+	# Handle --enable-Os-default-ex9 setting.
+	if test x${enable_Os_default_ex9} = xyes; then
+		tm_defines="${tm_defines} TARGET_OS_DEFAULT_EX9=1"
+	fi
+	# Handle --with-ext-dsp
+	if test x${with_ext_dsp} = xyes; then
+		tm_defines="${tm_defines} TARGET_DEFAULT_EXT_DSP=1"
+	fi
+	if test x${with_ext_zol} = xyes; then
+		tm_defines="${tm_defines} TARGET_DEFAULT_HWLOOP=1"
+	fi
+	# Handle --with-ext-16bit; enabled unless explicitly set to "no".
+	if test x${with_ext_16bit} != xno; then
+		tm_defines="${tm_defines} TARGET_DEFAULT_16BIT=1"
+	fi
+
 	;;
 nios2-*-*)
 	tm_file="elfos.h ${tm_file}"
@@ -4097,15 +4167,51 @@ case "${target}" in
 		;;

 	nds32*-*-*)
-		supported_defaults="arch nds32_lib"
+		supported_defaults="arch cpu nds32_lib float fpu_config memory_model"

 		# process --with-arch
 		case "${with_arch}" in
-		"" | v2 | v3 | v3m)
+		"" | v3 | v3j)
+			# OK
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=0"
+			tm_defines="${tm_defines} TARGET_DEFAULT_ISR_VECTOR_SIZE=4"
+			;;
+		v2 | v2j | v3m)
+			# OK
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=0"
+			tm_defines="${tm_defines} TARGET_DEFAULT_ISR_VECTOR_SIZE=16"
+			;;
+		v3f)
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=1"
+			tm_defines="${tm_defines} TARGET_DEFAULT_ISR_VECTOR_SIZE=4"
+			;;
+		v3s)
+			tm_defines="${tm_defines} TARGET_ARCH_DEFAULT=2"
+			tm_defines="${tm_defines} TARGET_DEFAULT_ISR_VECTOR_SIZE=4"
+			;;
+		*)
+			echo "Cannot accept --with-arch=$with_arch, available values are: v2 v2j v3 v3j v3m v3f v3s" 1>&2
+			exit 1
+			;;
+		esac
+
+		# process --with-memory-model
+		case "${with_memory_model}" in
+		"" | fast | slow)
+			;;
+		*)
+			echo "Cannot accept --with-memory-model=$with_memory_model, available values are: fast slow" 1>&2
+			exit 1
+			;;
+		esac
+
+		# process --with-cpu
+		case "${with_cpu}" in
+		"" | n7 | n8 | e8 | s8 | n9 | n10 | d10 | graywolf | n12 | n13 | panther)
 			# OK
 			;;
 		*)
-			echo "Cannot accept --with-arch=$with_arch, available values are: v2 v3 v3m" 1>&2
+			echo "Cannot accept --with-cpu=$with_cpu, available values are: n7 n8 e8 s8 n9 n10 d10 graywolf n12 n13 panther" 1>&2
 			exit 1
 			;;
 		esac
@@ -4115,31 +4221,56 @@ case "${target}" in
 		"")
 			# the default library is newlib
 			with_nds32_lib=newlib
+			tm_defines="${tm_defines} TARGET_DEFAULT_CTOR_DTOR=1"
 			;;
 		newlib)
 			# OK
+			tm_defines="${tm_defines} TARGET_DEFAULT_CTOR_DTOR=1"
 			;;
 		mculib)
 			# OK
+			# for the arch=v3f or arch=v3s under mculib toolchain,
+			# we would like to set -fno-math-errno as default
+			case "${with_arch}" in
+			v3f | v3s)
+				tm_defines="${tm_defines} TARGET_DEFAULT_NO_MATH_ERRNO=1"
+				;;
+			esac
+			;;
+		glibc)
+			# OK
+			tm_defines="${tm_defines} TARGET_DEFAULT_TLSDESC_TRAMPOLINE=1"
+			;;
+		uclibc)
 			;;
 		*)
-			echo "Cannot accept --with-nds32-lib=$with_nds32_lib, available values are: newlib mculib" 1>&2
+			echo "Cannot accept --with-nds32-lib=$with_nds32_lib, available values are: newlib mculib glibc uclibc" 1>&2
+			exit 1
+			;;
+		esac
+
+		# process --with-float
+		case "${with_float}" in
+		"" | soft | hard)
+			# OK
+			;;
+		*)
+			echo "Cannot accept --with-float=$with_float, available values are: soft hard" 1>&2
+			exit 1
+			;;
+		esac
+
+		# process --with-config-fpu (only 0, 1, 2 and 3 are accepted)
+		case "${with_config_fpu}" in
+		"" | 0 | 1 | 2 | 3)
+			# OK
+			;;
+		*)
+			echo "Cannot accept --with-config-fpu=$with_config_fpu, available values are: 0 1 2 3" 1>&2
 			exit 1
 			;;
 		esac
-		;;

-	nios2*-*-*)
-		supported_defaults="arch"
-			case "$with_arch" in
-			"" | r1 | r2)
-				# OK
-				;;
-			*)
-				echo "Unknown arch used in --with-arch=$with_arch" 1>&2
-				exit 1
-				;;
-			esac
 		;;

 	powerpc*-*-* | rs6000-*-*)
@@ -4527,7 +4658,7 @@ case ${target} in
 esac

 t=
-all_defaults="abi cpu cpu_32 cpu_64 arch arch_32 arch_64 tune tune_32 tune_64 schedule float mode fpu nan fp_32 odd_spreg_32 divide llsc mips-plt synci tls"
+all_defaults="abi cpu cpu_32 cpu_64 arch arch_32 arch_64 tune tune_32 tune_64 schedule float mode fpu nan fp_32 odd_spreg_32 divide llsc mips-plt synci tls memory_model"
 for option in $all_defaults
 do
 	eval "val=\$with_"`echo $option | sed s/-/_/g`
diff --git a/gcc/config/nds32/constants.md b/gcc/config/nds32/constants.md
index bea42ee..6c92412 100644
--- a/gcc/config/nds32/constants.md
+++ b/gcc/config/nds32/constants.md
@@ -23,25 +23,176 @@
 (define_constants
   [(R8_REGNUM  8)
    (TA_REGNUM 15)
+   (TP_REGNUM 25)
    (FP_REGNUM 28)
    (GP_REGNUM 29)
    (LP_REGNUM 30)
    (SP_REGNUM 31)
+   (LB_REGNUM 98)
+   (LE_REGNUM 99)
+   (LC_REGNUM 100)
   ])


+;; The unspec operation index.
+(define_c_enum "unspec_element" [
+  UNSPEC_COPYSIGN
+  UNSPEC_FCPYNSD
+  UNSPEC_FCPYNSS
+  UNSPEC_FCPYSD
+  UNSPEC_FCPYSS
+  UNSPEC_CLIP
+  UNSPEC_CLIPS
+  UNSPEC_CLO
+  UNSPEC_PBSAD
+  UNSPEC_PBSADA
+  UNSPEC_BSE
+  UNSPEC_BSE_2
+  UNSPEC_BSP
+  UNSPEC_BSP_2
+  UNSPEC_FFB
+  UNSPEC_FFMISM
+  UNSPEC_FLMISM
+  UNSPEC_KDMBB
+  UNSPEC_KDMBT
+  UNSPEC_KDMTB
+  UNSPEC_KDMTT
+  UNSPEC_KHMBB
+  UNSPEC_KHMBT
+  UNSPEC_KHMTB
+  UNSPEC_KHMTT
+  UNSPEC_KSLRAW
+  UNSPEC_KSLRAWU
+  UNSPEC_SVA
+  UNSPEC_SVS
+  UNSPEC_WSBH
+  UNSPEC_LWUP
+  UNSPEC_LBUP
+  UNSPEC_SWUP
+  UNSPEC_SBUP
+  UNSPEC_LMWZB
+  UNSPEC_SMWZB
+  UNSPEC_UALOAD_HW
+  UNSPEC_UALOAD_W
+  UNSPEC_UALOAD_DW
+  UNSPEC_UASTORE_HW
+  UNSPEC_UASTORE_W
+  UNSPEC_UASTORE_DW
+  UNSPEC_GOTINIT
+  UNSPEC_GOT
+  UNSPEC_GOTOFF
+  UNSPEC_PLT
+  UNSPEC_TLSGD
+  UNSPEC_TLSLD
+  UNSPEC_TLSIE
+  UNSPEC_TLSLE
+  UNSPEC_ROUND
+  UNSPEC_VEC_COMPARE
+  UNSPEC_KHM
+  UNSPEC_KHMX
+  UNSPEC_CLIP_OV
+  UNSPEC_CLIPS_OV
+  UNSPEC_BITREV
+  UNSPEC_KABS
+  UNSPEC_LOOP_END
+  UNSPEC_TLS_DESC
+  UNSPEC_TLS_IE
+  UNSPEC_ADD32
+  UNSPEC_ICT
+])
+
+
 ;; The unspec_volatile operation index.
 (define_c_enum "unspec_volatile_element" [
-  UNSPEC_VOLATILE_FUNC_RETURN
+  UNSPEC_VOLATILE_EH_RETURN
   UNSPEC_VOLATILE_ISYNC
   UNSPEC_VOLATILE_ISB
+  UNSPEC_VOLATILE_DSB
+  UNSPEC_VOLATILE_MSYNC
+  UNSPEC_VOLATILE_MSYNC_ALL
+  UNSPEC_VOLATILE_MSYNC_STORE
   UNSPEC_VOLATILE_MFSR
   UNSPEC_VOLATILE_MFUSR
   UNSPEC_VOLATILE_MTSR
   UNSPEC_VOLATILE_MTUSR
   UNSPEC_VOLATILE_SETGIE_EN
   UNSPEC_VOLATILE_SETGIE_DIS
+  UNSPEC_VOLATILE_FMFCSR
+  UNSPEC_VOLATILE_FMTCSR
+  UNSPEC_VOLATILE_FMFCFG
+  UNSPEC_VOLATILE_JR_ITOFF
+  UNSPEC_VOLATILE_JR_TOFF
+  UNSPEC_VOLATILE_JRAL_ITON
+  UNSPEC_VOLATILE_JRAL_TON
+  UNSPEC_VOLATILE_RET_ITOFF
+  UNSPEC_VOLATILE_RET_TOFF
+  UNSPEC_VOLATILE_STANDBY_NO_WAKE_GRANT
+  UNSPEC_VOLATILE_STANDBY_WAKE_GRANT
+  UNSPEC_VOLATILE_STANDBY_WAKE_DONE
+  UNSPEC_VOLATILE_TEQZ
+  UNSPEC_VOLATILE_TNEZ
+  UNSPEC_VOLATILE_TRAP
+  UNSPEC_VOLATILE_SETEND_BIG
+  UNSPEC_VOLATILE_SETEND_LITTLE
+  UNSPEC_VOLATILE_BREAK
+  UNSPEC_VOLATILE_SYSCALL
+  UNSPEC_VOLATILE_NOP
+  UNSPEC_VOLATILE_RES_DEP
+  UNSPEC_VOLATILE_DATA_DEP
+  UNSPEC_VOLATILE_LLW
+  UNSPEC_VOLATILE_SCW
+  UNSPEC_VOLATILE_CCTL_L1D_INVALALL
+  UNSPEC_VOLATILE_CCTL_L1D_WBALL_ALVL
+  UNSPEC_VOLATILE_CCTL_L1D_WBALL_ONE_LVL
+  UNSPEC_VOLATILE_CCTL_IDX_WRITE
+  UNSPEC_VOLATILE_CCTL_IDX_READ
+  UNSPEC_VOLATILE_CCTL_VA_WBINVAL_L1
+  UNSPEC_VOLATILE_CCTL_VA_WBINVAL_LA
+  UNSPEC_VOLATILE_CCTL_IDX_WBINVAL
+  UNSPEC_VOLATILE_CCTL_VA_LCK
+  UNSPEC_VOLATILE_DPREF_QW
+  UNSPEC_VOLATILE_DPREF_HW
+  UNSPEC_VOLATILE_DPREF_W
+  UNSPEC_VOLATILE_DPREF_DW
+  UNSPEC_VOLATILE_TLBOP_TRD
+  UNSPEC_VOLATILE_TLBOP_TWR
+  UNSPEC_VOLATILE_TLBOP_RWR
+  UNSPEC_VOLATILE_TLBOP_RWLK
+  UNSPEC_VOLATILE_TLBOP_UNLK
+  UNSPEC_VOLATILE_TLBOP_PB
+  UNSPEC_VOLATILE_TLBOP_INV
+  UNSPEC_VOLATILE_TLBOP_FLUA
+  UNSPEC_VOLATILE_ENABLE_INT
+  UNSPEC_VOLATILE_DISABLE_INT
+  UNSPEC_VOLATILE_SET_PENDING_SWINT
+  UNSPEC_VOLATILE_CLR_PENDING_SWINT
+  UNSPEC_VOLATILE_CLR_PENDING_HWINT
+  UNSPEC_VOLATILE_GET_ALL_PENDING_INT
+  UNSPEC_VOLATILE_GET_PENDING_INT
+  UNSPEC_VOLATILE_SET_INT_PRIORITY
+  UNSPEC_VOLATILE_GET_INT_PRIORITY
+  UNSPEC_VOLATILE_SET_TRIG_LEVEL
+  UNSPEC_VOLATILE_SET_TRIG_EDGE
+  UNSPEC_VOLATILE_GET_TRIG_TYPE
+  UNSPEC_VOLATILE_RELAX_GROUP
+  UNSPEC_VOLATILE_INNERMOST_LOOP_BEGIN
+  UNSPEC_VOLATILE_INNERMOST_LOOP_END
+  UNSPEC_VOLATILE_OMIT_FP_BEGIN
+  UNSPEC_VOLATILE_OMIT_FP_END
   UNSPEC_VOLATILE_POP25_RETURN
+  UNSPEC_VOLATILE_SIGNATURE_BEGIN
+  UNSPEC_VOLATILE_SIGNATURE_END
+  UNSPEC_VOLATILE_NO_HWLOOP
+  UNSPEC_VOLATILE_NO_IFC_BEGIN
+  UNSPEC_VOLATILE_NO_IFC_END
+  UNSPEC_VOLATILE_NO_EX9_BEGIN
+  UNSPEC_VOLATILE_NO_EX9_END
+  UNSPEC_VOLATILE_UNALIGNED_FEATURE
+  UNSPEC_VOLATILE_ENABLE_UNALIGNED
+  UNSPEC_VOLATILE_DISABLE_UNALIGNED
+  UNSPEC_VOLATILE_RDOV
+  UNSPEC_VOLATILE_CLROV
+  UNSPEC_VOLATILE_HWLOOP_LAST_INSN
 ])

 ;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/constraints.md b/gcc/config/nds32/constraints.md
index 1f44a1a..8163f46 100644
--- a/gcc/config/nds32/constraints.md
+++ b/gcc/config/nds32/constraints.md
@@ -25,9 +25,6 @@
 ;; Machine-dependent floating: G H


-(define_register_constraint "w" "(TARGET_ISA_V3 || TARGET_ISA_V3M) ? LOW_REGS : NO_REGS"
-  "LOW register class $r0 ~ $r7 constraint for V3/V3M ISA")
-
 (define_register_constraint "l" "LOW_REGS"
   "LOW register class $r0 ~ $r7")

@@ -41,9 +38,59 @@
 (define_register_constraint "t" "R15_TA_REG"
   "Temporary Assist register $ta (i.e. $r15)")

+(define_register_constraint "e" "R8_REG"
+  "Function Entry register $r8")
+
 (define_register_constraint "k" "STACK_REG"
   "Stack register $sp")

+(define_register_constraint "v" "R5_REG"
+  "Register $r5")
+
+(define_register_constraint "x" "FRAME_POINTER_REG"
+  "Frame pointer register $fp")
+
+(define_register_constraint "f"
+  "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE) ? FP_REGS : NO_REGS"
+ "The Floating point registers $fs0 ~ $fs31")
+
+(define_register_constraint "A" "LOOP_REGS"
+  "Loop register class")
+
+(define_constraint "Iv00"
+  "Constant value 0"
+  (and (match_code "const_int")
+       (match_test "ival == 0")))
+
+(define_constraint "Iv01"
+  "Constant value 1"
+  (and (match_code "const_int")
+       (match_test "ival == 1")))
+
+(define_constraint "Iv02"
+  "Constant value 2"
+  (and (match_code "const_int")
+       (match_test "ival == 2")))
+
+(define_constraint "Iv04"
+  "Constant value 4"
+  (and (match_code "const_int")
+       (match_test "ival == 4")))
+
+(define_constraint "Iv08"
+  "Constant value 8"
+  (and (match_code "const_int")
+       (match_test "ival == 8")))
+
+(define_constraint "Iu01"
+  "Unsigned immediate 1-bit value"
+  (and (match_code "const_int")
+       (match_test "ival == 1 || ival == 0")))
+
+(define_constraint "Iu02"
+  "Unsigned immediate 2-bit value"
+  (and (match_code "const_int")
+       (match_test "ival < (1 << 2) && ival >= 0")))

 (define_constraint "Iu03"
   "Unsigned immediate 3-bit value"
@@ -65,6 +112,11 @@
   (and (match_code "const_int")
        (match_test "ival < (1 << 4) && ival >= -(1 << 4)")))

+(define_constraint "Cs05"
+  "Signed immediate 5-bit value"
+  (and (match_code "const_double")
+       (match_test "nds32_const_double_range_ok_p (op, SFmode, -(1 << 4), (1 << 4))")))
+
 (define_constraint "Iu05"
   "Unsigned immediate 5-bit value"
   (and (match_code "const_int")
@@ -75,6 +127,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (ival, -31, 0)")))

+(define_constraint "Iu06"
+  "Unsigned immediate 6-bit value"
+  (and (match_code "const_int")
+       (match_test "ival < (1 << 6) && ival >= 0")))
+
 ;; Ip05 is special and dedicated for v3 movpi45 instruction.
 ;; movpi45 has imm5u field but the range is 16 ~ 47.
 (define_constraint "Ip05"
@@ -84,10 +141,10 @@
 		    && ival >= (0 + 16)
 		    && (TARGET_ISA_V3 || TARGET_ISA_V3M)")))

-(define_constraint "Iu06"
+(define_constraint "IU06"
   "Unsigned immediate 6-bit value constraint for addri36.sp instruction"
   (and (match_code "const_int")
-       (match_test "ival < (1 << 6)
+       (match_test "ival < (1 << 8)
 		    && ival >= 0
 		    && (ival % 4 == 0)
 		    && (TARGET_ISA_V3 || TARGET_ISA_V3M)")))
@@ -103,6 +160,11 @@
        (match_test "ival < (1 << 9) && ival >= 0")))


+(define_constraint "Is08"
+  "Signed immediate 8-bit value"
+  (and (match_code "const_int")
+       (match_test "ival < (1 << 7) && ival >= -(1 << 7)")))
+
 (define_constraint "Is10"
   "Signed immediate 10-bit value"
   (and (match_code "const_int")
@@ -113,6 +175,10 @@
   (and (match_code "const_int")
        (match_test "ival < (1 << 10) && ival >= -(1 << 10)")))

+(define_constraint "Is14"
+  "Signed immediate 14-bit value"
+  (and (match_code "const_int")
+       (match_test "ival < (1 << 13) && ival >= -(1 << 13)")))

 (define_constraint "Is15"
   "Signed immediate 15-bit value"
@@ -194,12 +260,21 @@
   (and (match_code "const_int")
        (match_test "ival < (1 << 19) && ival >= -(1 << 19)")))

+(define_constraint "Cs20"
+  "Signed immediate 20-bit value"
+  (and (match_code "const_double")
+       (match_test "nds32_const_double_range_ok_p (op, SFmode, -(1 << 19), (1 << 19))")))

 (define_constraint "Ihig"
   "The immediate value that can be simply set high 20-bit"
   (and (match_code "const_int")
        (match_test "(ival != 0) && ((ival & 0xfff) == 0)")))

+(define_constraint "Chig"
+  "The immediate value that can be simply set high 20-bit"
+  (and (match_code "high")
+       (match_test "GET_CODE (XEXP (op, 0)) == CONST_DOUBLE")))
+
 (define_constraint "Izeb"
   "The immediate value 0xff"
   (and (match_code "const_int")
@@ -213,12 +288,12 @@
 (define_constraint "Ixls"
   "The immediate value 0x01"
   (and (match_code "const_int")
-       (match_test "TARGET_PERF_EXT && (ival == 0x1)")))
+       (match_test "TARGET_EXT_PERF && (ival == 0x1)")))

 (define_constraint "Ix11"
   "The immediate value 0x7ff"
   (and (match_code "const_int")
-       (match_test "TARGET_PERF_EXT && (ival == 0x7ff)")))
+       (match_test "TARGET_EXT_PERF && (ival == 0x7ff)")))

 (define_constraint "Ibms"
   "The immediate value with power of 2"
@@ -232,23 +307,70 @@
        (match_test "(TARGET_ISA_V3 || TARGET_ISA_V3M)
 		    && (IN_RANGE (exact_log2 (ival + 1), 1, 8))")))

+(define_constraint "CVp5"
+  "Unsigned immediate 5-bit value for movpi45 instruction with range 16-47"
+  (and (match_code "const_vector")
+       (match_test "nds32_valid_CVp5_p (op)")))
+
+(define_constraint "CVs5"
+  "Signed immediate 5-bit value"
+  (and (match_code "const_vector")
+       (match_test "nds32_valid_CVs5_p (op)")))
+
+(define_constraint "CVs2"
+  "Signed immediate 20-bit value"
+  (and (match_code "const_vector")
+       (match_test "nds32_valid_CVs2_p (op)")))
+
+(define_constraint "CVhi"
+  "The immediate value that can be simply set high 20-bit"
+  (and (match_code "const_vector")
+       (match_test "nds32_valid_CVhi_p (op)")))

 (define_memory_constraint "U33"
   "Memory constraint for 333 format"
   (and (match_code "mem")
-       (match_test "nds32_mem_format (op) == ADDRESS_LO_REG_IMM3U")))
+       (match_test "nds32_mem_format (op) == ADDRESS_POST_INC_LO_REG_IMM3U
+		    || nds32_mem_format (op) == ADDRESS_POST_MODIFY_LO_REG_IMM3U
+		    || nds32_mem_format (op) == ADDRESS_LO_REG_IMM3U")))

 (define_memory_constraint "U45"
   "Memory constraint for 45 format"
   (and (match_code "mem")
        (match_test "(nds32_mem_format (op) == ADDRESS_REG)
-		    && (GET_MODE (op) == SImode)")))
+		    && ((GET_MODE (op) == SImode)
+		       || (GET_MODE (op) == SFmode))")))
+
+(define_memory_constraint "Ufe"
+  "Memory constraint for fe format"
+  (and (match_code "mem")
+       (match_test "nds32_mem_format (op) == ADDRESS_R8_IMM7U
+		    && (GET_MODE (op) == SImode
+			|| GET_MODE (op) == SFmode)")))

 (define_memory_constraint "U37"
   "Memory constraint for 37 format"
   (and (match_code "mem")
        (match_test "(nds32_mem_format (op) == ADDRESS_SP_IMM7U
 		    || nds32_mem_format (op) == ADDRESS_FP_IMM7U)
-		    && (GET_MODE (op) == SImode)")))
+		    && (GET_MODE (op) == SImode
+			|| GET_MODE (op) == SFmode)")))
+
+(define_memory_constraint "Umw"
+  "Memory constraint for lwm/smw"
+  (and (match_code "mem")
+       (match_test "nds32_valid_smw_lwm_base_p (op)")))
+
+(define_memory_constraint "Da"
+  "Memory constraint for non-offset loads/stores"
+  (and (match_code "mem")
+       (match_test "REG_P (XEXP (op, 0))
+		    || (GET_CODE (XEXP (op, 0)) == POST_INC)")))
+
+(define_memory_constraint "Q"
+  "Memory constraint for no symbol_ref and const"
+  (and (match_code "mem")
+       (match_test "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+		     && nds32_float_mem_operand_p (op)")))

 ;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/elf.h b/gcc/config/nds32/elf.h
new file mode 100644
index 0000000..315dcd8
--- /dev/null
+++ b/gcc/config/nds32/elf.h
@@ -0,0 +1,83 @@
+/* Definitions of target machine of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* ------------------------------------------------------------------------ */
+
+#define TARGET_LINUX_ABI 0
+
+/* In the configure stage we may use options --enable-default-relax,
+   --enable-Os-default-ifc and --enable-Os-default-ex9.  They affect
+   the default spec of passing --relax, --mifc, and --mex9 to linker.
+   We use NDS32_RELAX_SPEC, NDS32_IFC_SPEC, and NDS32_EX9_SPEC
+   so that we can customize them conveniently.  */
+#define LINK_SPEC \
+  " %{G*}" \
+  " %{mbig-endian:-EB} %{mlittle-endian:-EL}" \
+  " %{shared:-shared}" \
+  NDS32_RELAX_SPEC \
+  NDS32_IFC_SPEC \
+  NDS32_EX9_SPEC
+
+#define LIB_SPEC \
+  " -lc -lgloss"
+
+#define LIBGCC_SPEC \
+  " -lgcc"
+
+/* The option -mno-ctor-dtor can disable constructor/destructor feature
+   by applying different crt stuff.  In the convention, crt0.o is the
+   startup file without constructor/destructor;
+   crt1.o, crti.o, crtbegin.o, crtend.o, and crtn.o are the
+   startup files with constructor/destructor.
+   Note that crt0.o, crt1.o, crti.o, and crtn.o are provided
+   by newlib/mculib/glibc/uclibc, while crtbegin.o and crtend.o are
+   currently provided by GCC for nds32 target.
+
+   For nds32 target so far:
+   If -mno-ctor-dtor, we are going to link
+   "crt0.o [user objects]".
+   If -mctor-dtor, we are going to link
+   "crt1.o crtbegin1.o [user objects] crtend1.o".
+
+   Note that the TARGET_DEFAULT_CTOR_DTOR would affect the
+   default behavior.  Check gcc/config.gcc for more information.  */
+#ifdef TARGET_DEFAULT_CTOR_DTOR
+  #define STARTFILE_SPEC \
+    " %{!mno-ctor-dtor:crt1.o%s;:crt0.o%s}" \
+    " %{!mno-ctor-dtor:crtbegin1.o%s}" \
+    " %{mcrt-arg:crtarg.o%s}"
+  #define ENDFILE_SPEC \
+    " %{!mno-ctor-dtor:crtend1.o%s}"
+#else
+  #define STARTFILE_SPEC \
+    " %{mctor-dtor|coverage:crt1.o%s;:crt0.o%s}" \
+    " %{mctor-dtor|coverage:crtbegin1.o%s}" \
+    " %{mcrt-arg:crtarg.o%s}"
+  #define ENDFILE_SPEC \
+    " %{mctor-dtor|coverage:crtend1.o%s}"
+#endif
+
+#define STARTFILE_CXX_SPEC \
+  " %{!mno-ctor-dtor:crt1.o%s;:crt0.o%s}" \
+  " %{!mno-ctor-dtor:crtbegin1.o%s}" \
+  " %{mcrt-arg:crtarg.o%s}"
+#define ENDFILE_CXX_SPEC \
+  " %{!mno-ctor-dtor:crtend1.o%s}"
diff --git a/gcc/config/nds32/iterators.md b/gcc/config/nds32/iterators.md
index ab0f103..6023b9c 100644
--- a/gcc/config/nds32/iterators.md
+++ b/gcc/config/nds32/iterators.md
@@ -26,30 +26,99 @@
 ;; A list of integer modes that are up to one word long.
 (define_mode_iterator QIHISI [QI HI SI])

+;; A list of integer modes for one word and double word.
+(define_mode_iterator SIDI [SI DI])
+
 ;; A list of integer modes that are up to one half-word long.
 (define_mode_iterator QIHI [QI HI])

 ;; A list of the modes that are up to double-word long.
 (define_mode_iterator DIDF [DI DF])

+;; A list of the modes that are up to one word long vector.
+(define_mode_iterator VQIHI [V4QI V2HI])
+
+;; A list of the modes that are up to one word long vector and scalar.
+(define_mode_iterator VSQIHI [V4QI V2HI QI HI])
+
+(define_mode_iterator VSQIHIDI [V4QI V2HI QI HI DI])
+
+(define_mode_iterator VQIHIDI [V4QI V2HI DI])
+
+;; A list of the modes that are up to one word long vector
+;; and scalar for HImode.
+(define_mode_iterator VSHI [V2HI HI])
+
+;; A list of the floating-point modes, each gated on its FPU option.
+(define_mode_iterator ANYF [(SF "TARGET_FPU_SINGLE")
+			    (DF "TARGET_FPU_DOUBLE")])

 ;;----------------------------------------------------------------------------
 ;; Mode attributes.
 ;;----------------------------------------------------------------------------

-(define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr size [(QI "b") (HI "h") (SI "w") (SF "s") (DF "d")])

-(define_mode_attr byte [(QI "1") (HI "2") (SI "4")])
+(define_mode_attr byte [(QI "1") (HI "2") (SI "4") (V4QI "4") (V2HI "4")])

+(define_mode_attr bits [(V4QI "8") (QI "8") (V2HI "16") (HI "16") (DI "64")])
+
+(define_mode_attr VELT [(V4QI "QI") (V2HI "HI")])

 ;;----------------------------------------------------------------------------
 ;; Code iterators.
 ;;----------------------------------------------------------------------------

+;; shifts
+(define_code_iterator shift_rotate [ashift ashiftrt lshiftrt rotatert])
+
+(define_code_iterator shifts [ashift ashiftrt lshiftrt])
+
+(define_code_iterator shiftrt [ashiftrt lshiftrt])
+
+(define_code_iterator sat_plus [ss_plus us_plus])
+
+(define_code_iterator all_plus [plus ss_plus us_plus])
+
+(define_code_iterator sat_minus [ss_minus us_minus])
+
+(define_code_iterator all_minus [minus ss_minus us_minus])
+
+(define_code_iterator plus_minus [plus minus])
+
+(define_code_iterator extend [sign_extend zero_extend])
+
+(define_code_iterator sumax [smax umax])
+
+(define_code_iterator sumin [smin umin])
+
+(define_code_iterator sumin_max [smax umax smin umin])

 ;;----------------------------------------------------------------------------
 ;; Code attributes.
 ;;----------------------------------------------------------------------------

+;; shifts
+(define_code_attr shift
+  [(ashift "ashl") (ashiftrt "ashr") (lshiftrt "lshr") (rotatert "rotr")])
+
+(define_code_attr su
+  [(ashiftrt "") (lshiftrt "u") (sign_extend "s") (zero_extend "u")])
+
+(define_code_attr zs
+  [(sign_extend "s") (zero_extend "z")])
+
+(define_code_attr uk
+  [(plus "") (ss_plus "k") (us_plus "uk")
+   (minus "") (ss_minus "k") (us_minus "uk")])
+
+(define_code_attr opcode
+  [(plus "add") (minus "sub") (smax "smax") (umax "umax") (smin "smin") (umin "umin")])
+
+(define_code_attr add_rsub
+  [(plus "a") (minus "rs")])
+
+(define_code_attr add_sub
+  [(plus "a") (minus "s")])

 ;;----------------------------------------------------------------------------
diff --git a/gcc/config/nds32/linux.h b/gcc/config/nds32/linux.h
new file mode 100644
index 0000000..36ddf2f
--- /dev/null
+++ b/gcc/config/nds32/linux.h
@@ -0,0 +1,78 @@
+/* Definitions of target machine of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* ------------------------------------------------------------------------ */
+
+#define TARGET_LINUX_ABI 1
+
+#undef  SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef  PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+#ifdef TARGET_DEFAULT_TLSDESC_TRAMPOLINE
+  #define NDS32_TLSDESC_TRAMPOLINE_SPEC \
+    " %{!mno-tlsdesc-trampoline:--mtlsdesc-trampoline}"
+#else /* !TARGET_DEFAULT_TLSDESC_TRAMPOLINE */
+  #define NDS32_TLSDESC_TRAMPOLINE_SPEC ""
+#endif /* TARGET_DEFAULT_TLSDESC_TRAMPOLINE */
+
+#define TARGET_OS_CPP_BUILTINS()                \
+  do                                            \
+    {                                           \
+      GNU_USER_TARGET_OS_CPP_BUILTINS();           \
+    }                                           \
+  while (0)
+
+#define GLIBC_DYNAMIC_LINKER "/lib/ld.so.1"
+
+/* In the configure stage we may use options --enable-default-relax,
+   --enable-Os-default-ifc and --enable-Os-default-ex9.  They affect
+   the default spec of passing --relax, --mifc, and --mex9 to linker.
+   We use NDS32_RELAX_SPEC, NDS32_IFC_SPEC, and NDS32_EX9_SPEC
+   so that we can customize them conveniently.  */
+#define LINK_SPEC \
+ " %{G*}" \
+ " %{mbig-endian:-EB} %{mlittle-endian:-EL}" \
+ " %{shared:-shared} \
+  %{!shared: \
+    %{!static: \
+      %{rdynamic:-export-dynamic} \
+      -dynamic-linker " GNU_USER_DYNAMIC_LINKER "} \
+    %{static:-static}}" \
+  NDS32_RELAX_SPEC \
+  NDS32_IFC_SPEC \
+  NDS32_EX9_SPEC \
+  NDS32_TLSDESC_TRAMPOLINE_SPEC
+
+#define LINK_PIE_SPEC "%{pie:%{!fno-pie:%{!fno-PIE:%{!static:-pie}}}} "
+
+
+/* The SYNC operations are implemented as library functions, not
+   INSN patterns.  As a result, the HAVE defines for the patterns are
+   not defined.  We need to define them to generate the corresponding
+   __GCC_HAVE_SYNC_COMPARE_AND_SWAP_* and __GCC_ATOMIC_*_LOCK_FREE
+   defines.
+   Ref: https://sourceware.org/ml/libc-alpha/2014-09/msg00322.html  */
+#define HAVE_sync_compare_and_swapqi 1
+#define HAVE_sync_compare_and_swaphi 1
+#define HAVE_sync_compare_and_swapsi 1
diff --git a/gcc/config/nds32/nds32-abi-compatible.c b/gcc/config/nds32/nds32-abi-compatible.c
new file mode 100644
index 0000000..f2ed006
--- /dev/null
+++ b/gcc/config/nds32/nds32-abi-compatible.c
@@ -0,0 +1,315 @@
+/* A Gimple-level pass of Andes NDS32 cpu for GNU compiler.
+   This pass collects the usage of floating point.
+
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"   /* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"     /* For DFA state_t.  */
+#include "insn-codes.h"    /* For CODE_FOR_xxx.  */
+#include "reload.h"     /* For push_reload ().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"     /* For add_builtin_function ().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "tree-ssa-alias.h"
+#include "fold-const.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-pass.h"
+#include "gimple-pretty-print.h"
+#include "gimple-walk.h"
+
+/* Indicate whether the translation unit includes floating-point
+   arithmetic.  */
+bool nds32_include_fp_arith = false;
+
+/* Return true if FN_DECL neither returns nor takes a floating-point or
+   record/union value and has no variable argument list.  Otherwise set
+   the global nds32_include_fp_arith to true and return false.  */
+
+static bool
+nds32_acd_func_rtn_args_check (tree fn_decl)
+{
+  tree fn_type = TREE_TYPE (fn_decl);
+  function_args_iterator iter;
+  tree arg_type = NULL_TREE;
+  tree rtn_type = NULL_TREE;
+  unsigned argno = 1;
+
+  gcc_assert (fn_type);
+
+  rtn_type = TREE_TYPE (fn_type);
+  if (dump_file)
+    {
+      fprintf (dump_file,
+	       " Check the return & arguments for function %s\n"
+	       "  Prototype:",
+	       fndecl_name (fn_decl));
+      print_generic_decl (dump_file, fn_decl, 0);
+      fprintf (dump_file, "\n");
+    }
+
+  /* Check the return type.  */
+  if (FLOAT_TYPE_P (rtn_type)
+      || RECORD_OR_UNION_TYPE_P (rtn_type))
+    {
+      if (dump_file)
+	fprintf (dump_file, "  ! Return type is FP or record/union type\n");
+      nds32_include_fp_arith = true;
+
+      return false;
+    }
+
+  /* Check if the function has a variable argument list.  */
+  if (stdarg_p (fn_type))
+    {
+      if (dump_file)
+	fprintf (dump_file, "  ! Has variable argument list (i.e. ,...)\n");
+      nds32_include_fp_arith = true;
+
+      return false;
+    }
+
+  /* Check each argument type, stopping at a void_type_node entry.  */
+  FOREACH_FUNCTION_ARGS (fn_type, arg_type, iter)
+    {
+      if (arg_type == void_type_node)
+	break;
+
+      if (FLOAT_TYPE_P (arg_type)
+	  || RECORD_OR_UNION_TYPE_P (arg_type))
+	{
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "  ! No.%d argument is FP or record/union type\n",
+		     argno);
+	  nds32_include_fp_arith = true;
+
+	  return false;
+	}
+      argno++;
+    }
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "  >> Pass the inspection of return & arguments type\n");
+
+  return true;
+}
+
+/* Operand callback for the walk in nds32_abi_compatible.  Yield *TP, without
+   descending further, when it is typed and FP-related; else NULL_TREE.  */
+
+static tree
+nds32_acd_walk_op_fn (tree *tp, int *walk_subtrees, void *data ATTRIBUTE_UNUSED)
+{
+  tree op = *tp;
+
+  if (!op || !TREE_TYPE (op))
+    return NULL_TREE;
+
+  if (!FLOAT_TYPE_P (TREE_TYPE (op))
+      && TREE_CODE (op) != REAL_CST
+      && TREE_CODE (op) != COMPLEX_CST
+      && TREE_CODE (op) != FLOAT_EXPR
+      && TREE_CODE (op) != REALPART_EXPR)
+    return NULL_TREE;
+
+  *walk_subtrees = 0;
+  return op;
+}
+
+/* Statement callback for the walk in nds32_abi_compatible.  Set
+   *HANDLED_OPS_P and return a non-NULL tree for an ASM stmt, or for a
+   call whose known callee fails nds32_acd_func_rtn_args_check.  */
+
+static tree
+nds32_acd_walk_stmt_fn (gimple_stmt_iterator *gsi_p, bool *handled_ops_p,
+			struct walk_stmt_info *wi ATTRIBUTE_UNUSED)
+{
+  gimple *stmt = gsi_stmt (*gsi_p);
+  enum gimple_code code = gimple_code (stmt);
+
+  if (code == GIMPLE_DEBUG)
+    {
+      *handled_ops_p = true;
+      return NULL_TREE;
+    }
+
+  if (code == GIMPLE_ASM)
+    {
+      /* Any ASM stmt is reported, using a dummy non-NULL value.  */
+      *handled_ops_p = true;
+      return (tree) -1;
+    }
+
+  if (code == GIMPLE_CALL)
+    {
+      tree callee = gimple_call_fndecl (stmt);
+      /* CALLEE may be NULL_TREE; that case returns NULL_TREE with
+	 *HANDLED_OPS_P set.  */
+      if (!callee || !nds32_acd_func_rtn_args_check (callee))
+	{
+	  *handled_ops_p = true;
+	  return callee;
+	}
+    }
+
+  return NULL_TREE;
+}
+
+/* Entry point of the ABI-compatibility detection pass.  Raise
+   nds32_include_fp_arith when the prototype check fails or when the
+   statement walk stops on some statement.  Always return 0.  */
+
+static int
+nds32_abi_compatible (void)
+{
+  basic_block bb;
+  struct walk_stmt_info wi;
+
+  /* On failure the prototype check has already raised the flag.  */
+  if (!nds32_acd_func_rtn_args_check (current_function_decl))
+    return 0;
+
+  memset (&wi, 0, sizeof (wi));
+
+  if (dump_file)
+    fprintf (dump_file, "Check function body %s\n",
+	     function_name (cfun));
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      gimple *hit = walk_gimple_seq (bb_seq (bb), nds32_acd_walk_stmt_fn,
+				     nds32_acd_walk_op_fn, &wi);
+
+      if (hit == NULL)
+	continue;
+
+      /* A non-NULL HIT is the statement walk_gimple_seq handed back.  */
+      if (dump_file)
+	{
+	  fprintf (dump_file, " ! NO PASS: ");
+	  print_gimple_stmt (dump_file, hit, 0, TDF_SLIM|TDF_RAW);
+	}
+      nds32_include_fp_arith = true;
+      break;
+    }
+
+  /* Report success only while the flag is still clear.  */
+  if (dump_file && !nds32_include_fp_arith)
+    fprintf (dump_file,
+	     " >> Pass the inspection of FP operand for function body\n");
+
+  return 0;
+}
+
+static bool
+gate_nds32_abi_compatible (void)
+{
+  /* Run only while the option is on and the FP-usage flag is still clear.  */
+  return flag_nds32_abi_compatible ? !nds32_include_fp_arith : false;
+}
+
+const pass_data pass_data_nds32_abi_compatible =
+{
+  GIMPLE_PASS,				/* type */
+  "abi_compatible",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  ( PROP_cfg | PROP_ssa ),		/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  0,					/* todo_flags_finish */
+};
+
+class pass_nds32_abi_compatible : public gimple_opt_pass
+{
+public:
+  pass_nds32_abi_compatible (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_nds32_abi_compatible, ctxt)
+  {}
+
+  /* opt_pass methods; both ignore their function argument.  */
+  bool gate (function *) { return gate_nds32_abi_compatible (); }
+  unsigned int execute (function *) { return nds32_abi_compatible (); }
+};
+
+gimple_opt_pass *
+make_pass_nds32_abi_compatible (gcc::context *ctxt)
+{
+  return new pass_nds32_abi_compatible (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-const-remater.c b/gcc/config/nds32/nds32-const-remater.c
new file mode 100644
index 0000000..760e567
--- /dev/null
+++ b/gcc/config/nds32/nds32-const-remater.c
@@ -0,0 +1,461 @@
+/* Constant rematerialization pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ------------------------------------------------------------------------ */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "dbgcnt.h"
+#include "df.h"
+#include "tm-constrs.h"
+
+/* ------------------------------------------------------------------------ */
+
+typedef struct reg_avail_info
+{
+  rtx insn;		/* Insn recorded as setting the constant.  */
+  unsigned int uint;	/* The constant value, as an unsigned int.  */
+  unsigned int regno;	/* Number of the register the insn sets.  */
+} reg_avail_info_t;
+
+
+static void find_common_const (void);
+static bool try_rematerialize (rtx_insn *, unsigned int,
+			       auto_vec<reg_avail_info_t, 32> *);
+static void clean_reg_avail_info (rtx ,const_rtx, void *);
+static rtx get_const (rtx);
+static bool addsi3_format_p (rtx);
+
+/* Try to build the constant UINT_R that INSN sets from a constant already
+   recorded in REG_AVAIL_INFOS, using a single shift, bit set/clear or add.
+   On success, emit the replacement before INSN with a REG_EQUAL note and
+   return TRUE (the caller then deletes INSN); otherwise return FALSE.  */
+static bool
+try_rematerialize (rtx_insn *insn, unsigned int uint_r,
+		   auto_vec<reg_avail_info_t, 32> *reg_avail_infos)
+{
+  unsigned int i, uint_i, cl_i, cl_r, ct_i, ct_r;
+  rtx pat, src, dest, new_insn;
+  bool done = FALSE;
+  df_ref df_rec;
+  df_link *link;
+
+  /* __builtin_clz and __builtin_ctz are undefined for 0, so substitute the
+     bit width; the shift tests below additionally reject a zero operand.  */
+  cl_r = uint_r ? __builtin_clz (uint_r) : 32;
+  ct_r = uint_r ? __builtin_ctz (uint_r) : 32;
+  for (i = 0; i < reg_avail_infos->length (); ++i)
+    {
+      if ((*reg_avail_infos)[i].uint != uint_r)
+	{
+	  uint_i = (*reg_avail_infos)[i].uint;
+	  if (dump_file)
+	    fprintf (dump_file, "Try rematerialize %08x with const %08x\n",
+		     uint_r, uint_i);
+	  cl_i = uint_i ? __builtin_clz (uint_i) : 32;
+	  ct_i = uint_i ? __builtin_ctz (uint_i) : 32;
+	  src = SET_DEST (PATTERN ((*reg_avail_infos)[i].insn));
+	  dest = SET_DEST (PATTERN (insn));
+
+	  if (uint_r != 0 && cl_r > cl_i
+	      && (uint_i >> (cl_r - cl_i)) == uint_r)
+	    {
+	      /* Right shift logical.  */
+	      pat = gen_rtx_LSHIFTRT (SImode, src, GEN_INT (cl_r - cl_i));
+	      done = TRUE;
+	      if (dump_file)
+		fprintf (dump_file,
+			 "Rematerialize %08x with const %08x by l>> %d\n",
+			 uint_r, uint_i, (cl_r - cl_i));
+	    }
+	  else if (uint_i != 0 && ct_i >= ct_r
+		   && ((int) uint_i >> (ct_i - ct_r)) == (int) uint_r)
+	    {
+	      /* Right shift arithmetic.  */
+	      pat = gen_rtx_ASHIFTRT (SImode, src, GEN_INT (ct_i - ct_r));
+	      done = TRUE;
+	      if (dump_file)
+		fprintf (dump_file,
+			 "Rematerialize %08x with const %08x by a>> %d\n",
+			 uint_r, uint_i, (ct_i - ct_r));
+	    }
+	  else if (uint_r != 0 && ct_r > ct_i
+		   && (uint_i << (ct_r - ct_i)) == uint_r)
+	    {
+	      /* Left shift.  */
+	      pat = gen_rtx_ASHIFT (SImode, src, GEN_INT (ct_r - ct_i));
+	      done = TRUE;
+	      if (dump_file)
+		fprintf (dump_file,
+			 "Rematerialize %08x with const %08x by << %d\n",
+			 uint_r, uint_i, (ct_r - ct_i));
+	    }
+	  else if (TARGET_EXT_PERF && __builtin_popcount (uint_r ^ uint_i) == 1)
+	    {
+	      unsigned int val = uint_r ^ uint_i;
+	      /* Exactly one bit differs and VAL is that bit: VAL up to
+		 (1 << 5) becomes an add/subtract, larger VAL a bit op.  */
+	      if ((uint_r & (uint_r ^ uint_i)) != 0)
+		{
+		  if (val > (1 << 5))
+		    {
+		      /* Bit set.  */
+		      pat = gen_rtx_IOR (SImode, src, GEN_INT (val));
+		      done = TRUE;
+		      if (dump_file)
+			fprintf (dump_file,
+				 "Rematerialize %08x with const %08x by | %08x\n",
+				 uint_r, uint_i, uint_r ^ uint_i);
+		    }
+		  else
+		    {
+		      /* Transform to plus if immediate can fit addi45.  */
+		      pat = gen_rtx_PLUS (SImode, src, GEN_INT (val));
+		      done = TRUE;
+		      if (dump_file)
+			fprintf (dump_file,
+				 "Rematerialize %08x with const %08x by + %08x\n",
+				 uint_r, uint_i, uint_r ^ uint_i);
+		    }
+		}
+	      else
+		{
+		  if (val > (1 << 5))
+		    {
+		      /* Bit clear.  */
+		      pat = gen_rtx_AND (SImode, src, GEN_INT (~(uint_r ^ uint_i)));
+		      done = TRUE;
+		      if (dump_file)
+			fprintf (dump_file,
+				 "Rematerialize %08x with const %08x by & %08x\n",
+				 uint_r, uint_i, ~(uint_r ^ uint_i));
+		    }
+		  else
+		    {
+		      /* Transform to plus if immediate can fit subi45.  */
+		      pat = gen_rtx_PLUS (SImode, src, GEN_INT ((int) -val));
+		      done = TRUE;
+		      if (dump_file)
+			fprintf (dump_file,
+				 "Rematerialize %08x with const %08x by - %08x\n",
+				 uint_r, uint_i, uint_r ^ uint_i);
+		    }
+		}
+	    }
+	  else if  ((uint_r > uint_i ? uint_r - uint_i
+		     : uint_i - uint_r) < 0x4000)
+	    {
+	      /* Check insn_info existence because the instruction
+		 may have been deleted.  */
+	      if (DF_INSN_INFO_GET ((*reg_avail_infos)[i].insn))
+		{
+		  df_rec = DF_INSN_DEFS ((*reg_avail_infos)[i].insn);
+		  link = DF_REF_CHAIN (df_rec);
+
+		  /* Do not use the dead instruction. */
+		  /* Do not use the original matched sethi.  */
+		  if (!link)
+		    continue;
+		  for (link = DF_REF_CHAIN (df_rec); link; link = link->next)
+		    {
+		      if (DF_REF_REGNO (link->ref) == 0
+			  || !DF_REF_INSN_INFO (link->ref)
+			  || DF_REF_INSN (link->ref) == insn)
+			break;
+		    }
+		  if (link)
+		    continue;
+		}
+
+	      /* Add the signed difference (negative when UINT_R is smaller).  */
+	      pat = gen_rtx_PLUS (SImode, src,
+				  GEN_INT ((HOST_WIDE_INT) uint_r
+					   - (HOST_WIDE_INT) uint_i));
+	      done = TRUE;
+	    }
+
+	  if (done)
+	    {
+	      /* Emit the new instruction.  */
+	      new_insn = gen_move_insn (dest, pat);
+	      emit_insn_before (new_insn, insn);
+	      set_dst_reg_note (new_insn, REG_EQUAL, GEN_INT (uint_r), dest);
+	      return TRUE;
+	    }
+	}
+    }
+  return FALSE;
+}
+
+/* note_stores callback: forget the records in DATA that DEST overwrites.  */
+static void
+clean_reg_avail_info (rtx dest, const_rtx setter ATTRIBUTE_UNUSED, void *data)
+{
+  auto_vec<reg_avail_info_t, 32> *records
+    = (auto_vec<reg_avail_info_t, 32> *) data;
+  rtx reg = GET_CODE (dest) == SUBREG ? SUBREG_REG (dest) : dest;
+
+  if (!REG_P (reg))
+    return;
+  /* A removal refills slot IDX, so step only past entries that are kept.  */
+  for (unsigned int idx = 0; idx < records->length ();)
+    if ((*records)[idx].regno != REGNO (reg)
+	&& (GET_MODE_SIZE (GET_MODE (reg)) != 8
+	    || (*records)[idx].regno != REGNO (reg) + 1))
+      ++idx;
+    else
+      records->unordered_remove (idx);
+}
+
+/* If INSN is a SET of an SImode register whose value is a CONST_INT,
+   either directly or per a REG_EQUAL/REG_EQUIV note, return that
+   CONST_INT; otherwise return NULL_RTX.  */
+static rtx
+get_const (rtx insn)
+{
+  rtx pat = PATTERN (insn);
+  rtx note;
+
+  if (GET_CODE (pat) != SET)
+    return NULL_RTX;
+  if (!REG_P (SET_DEST (pat)) || GET_MODE (SET_DEST (pat)) != SImode)
+    return NULL_RTX;
+
+  /* A plain constant move carries the value as its source.  */
+  if (CONST_INT_P (XEXP (pat, 1)))
+    return XEXP (pat, 1);
+
+  note = find_reg_note (insn, REG_EQUAL, NULL_RTX);
+  if (note == NULL_RTX)
+    note = find_reg_note (insn, REG_EQUIV, NULL_RTX);
+
+  return (note && CONST_INT_P (XEXP (note, 0))) ? XEXP (note, 0) : NULL_RTX;
+}
+
+/* Return true if INSN's source is a PLUS with a CONST_INT second operand.  */
+static bool
+addsi3_format_p (rtx insn)
+{
+  rtx src = XEXP (PATTERN (insn), 1);
+
+  if (GET_CODE (src) != PLUS)
+    return FALSE;
+  return GET_CODE (XEXP (src, 1)) == CONST_INT;
+}
+
+/* Return true if INSN is a SET whose source is a CONST_INT accepted by
+   the Ihig constraint (the sethi shape).  */
+static bool
+sethi_format_p (rtx insn)
+{
+  rtx pat = PATTERN (insn);
+
+  return (GET_CODE (pat) == SET && GET_CODE (XEXP (pat, 1)) == CONST_INT
+	  && satisfies_constraint_Ihig (XEXP (pat, 1)));
+}
+
+/* Return true if the definition linked first from INSN's first use is a
+   sethi-format insn, and the chain of that insn's first definition has
+   exactly one entry.  Links whose ref has register number 0 or no insn
+   info are rejected.  */
+static bool
+use_only_p (rtx insn)
+{
+  rtx def_insn;
+  df_link *use_chain, *def_chain;
+
+  /* NOTE(review): assumes INSN has at least one use ref -- confirm.  */
+  use_chain = DF_REF_CHAIN (DF_INSN_USES (insn));
+  if (!use_chain
+      || DF_REF_REGNO (use_chain->ref) == 0
+      || !DF_REF_INSN_INFO (use_chain->ref))
+    return FALSE;
+
+  def_insn = DF_REF_INSN (use_chain->ref);
+  if (!sethi_format_p (def_insn))
+    return FALSE;
+
+  /* The sethi result must not be linked to anything beyond one entry.  */
+  def_chain = DF_REF_CHAIN (DF_INSN_DEFS (def_insn));
+  if (!def_chain || def_chain->next)
+    return FALSE;
+  if (DF_REF_REGNO (def_chain->ref) == 0
+      || !DF_REF_INSN_INFO (def_chain->ref))
+    return FALSE;
+
+  return TRUE;
+}
+
+/* In each basic block, track which register holds which SImode constant
+   and rematerialize sethi + add-immediate loads from those constants.  */
+static void
+find_common_const (void)
+{
+  basic_block bb;
+  unsigned int i, value;
+
+  /* Save register constant value.  */
+  auto_vec<reg_avail_info_t, 32> reg_avail_infos;
+  reg_avail_info_t reg_avail_info;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+      rtx dest, cst;
+
+      /* Records do not carry over from one basic block to the next.  */
+      reg_avail_infos.truncate (0);
+
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+
+	  if (CALL_P (insn))
+	    {
+	      /* Forget constants held in call-used hard registers.  */
+	      for (i = 0; i < reg_avail_infos.length ();)
+		{
+		  if (HARD_REGISTER_NUM_P (reg_avail_infos[i].regno)
+		      && call_used_regs[reg_avail_infos[i].regno])
+		    reg_avail_infos.unordered_remove (i);
+		  else
+		    ++i;
+		}
+	    }
+
+	  cst = get_const (insn);
+	  if (cst == NULL_RTX)
+	    {
+	      note_stores (PATTERN (insn), clean_reg_avail_info,
+			   &reg_avail_infos);
+	      continue;
+	    }
+
+	  dest = SET_DEST (PATTERN (insn));
+	  /* A CONST_INT holds a HOST_WIDE_INT: read it via UINTVAL, not XUINT.  */
+	  value = (unsigned int) UINTVAL (cst);
+
+	  if (addsi3_format_p (insn)
+	      && use_only_p (insn)
+	      && try_rematerialize (insn, value, &reg_avail_infos))
+	    {
+	      delete_insn (insn);
+	      df_insn_rescan_all ();
+	    }
+
+	  note_stores (PATTERN (insn), clean_reg_avail_info, &reg_avail_infos);
+	  reg_avail_info.insn = insn;
+	  reg_avail_info.uint = value;
+	  reg_avail_info.regno = REGNO (dest);
+	  if (dump_file)
+	    fprintf (dump_file, "Find const %08x on %u\n",
+		     reg_avail_info.uint, reg_avail_info.regno);
+	  reg_avail_infos.safe_push (reg_avail_info);
+	}
+    }
+}
+
+static unsigned int
+nds32_const_remater_opt (void)
+{
+  df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
+  df_note_add_problem ();
+  df_insn_rescan_all ();
+  df_analyze ();
+  /* use_only_p and try_rematerialize walk the chains requested above.  */
+  find_common_const ();
+
+  df_insn_rescan_all ();
+  return 0;
+}
+
+const pass_data pass_data_nds32_const_remater_opt =
+{
+  RTL_PASS,				/* type */
+  "const_remater_opt",				/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,	/* todo_flags_finish */
+};
+
+class pass_nds32_const_remater_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_const_remater_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_const_remater_opt, ctxt)
+  {}
+
+  /* opt_pass methods; both ignore their function argument.  */
+  bool gate (function *) { return flag_nds32_const_remater_opt; }
+  unsigned int execute (function *) { return nds32_const_remater_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_const_remater_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_const_remater_opt (ctxt);
+}
+
+/* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-cost.c b/gcc/config/nds32/nds32-cost.c
index e6a29fc..881d086 100644
--- a/gcc/config/nds32/nds32-cost.c
+++ b/gcc/config/nds32/nds32-cost.c
@@ -24,73 +24,447 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
-#include "tm_p.h"
-#include "optabs.h"		/* For GEN_FCN.  */
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
 #include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
 #include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "tree-pass.h"

 /* ------------------------------------------------------------------------ */

-bool
-nds32_rtx_costs_impl (rtx x,
-		      machine_mode mode ATTRIBUTE_UNUSED,
-		      int outer_code,
-		      int opno ATTRIBUTE_UNUSED,
-		      int *total,
-		      bool speed)
-{
-  int code = GET_CODE (x);
+typedef bool (*rtx_cost_func) (rtx, int, int, int, int*);

-  /* According to 'speed', goto suitable cost model section.  */
-  if (speed)
-    goto performance_cost;
-  else
-    goto size_cost;
+struct rtx_cost_model_t {
+  rtx_cost_func speed_prefer;
+  rtx_cost_func size_prefer;
+};

+static rtx_cost_model_t rtx_cost_model;

-performance_cost:
-  /* This is section for performance cost model.  */
+static int insn_size_16bit; /* Initial at nds32_init_rtx_costs.  */
+static const int insn_size_32bit = 4;
+
+static bool
+nds32_rtx_costs_speed_prefer (rtx x ATTRIBUTE_UNUSED,
+			      int code,
+			      int outer_code ATTRIBUTE_UNUSED,
+			      int opno ATTRIBUTE_UNUSED,
+			      int *total)
+{
+  rtx op0;
+  rtx op1;
+  enum machine_mode mode = GET_MODE (x);
+  /* Scale cost by mode size.  */
+  int cost = COSTS_N_INSNS (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode));

-  /* In gcc/rtl.h, the default value of COSTS_N_INSNS(N) is N*4.
-     We treat it as 4-cycle cost for each instruction
-     under performance consideration.  */
   switch (code)
     {
-    case SET:
-      /* For 'SET' rtx, we need to return false
-         so that it can recursively calculate costs.  */
-      return false;
-
     case USE:
       /* Used in combine.c as a marker.  */
       *total = 0;
-      break;
+      return true;
+
+    case CONST_INT:
+      /* When not optimizing for size, we care more about the cost
+	 of hot code, and hot code is often in a loop.  If a constant
+	 operand needs to be forced into a register, we will often be
+	 able to hoist the constant load out of the loop, so the load
+	 should not contribute to the cost.  */
+      if (outer_code == SET || outer_code == PLUS)
+	*total = satisfies_constraint_Is20 (x) ? 0 : 4;
+      else if (outer_code == AND || outer_code == IOR || outer_code == XOR
+	       || outer_code == MINUS)
+	*total = satisfies_constraint_Iu15 (x) ? 0 : 4;
+      else if (outer_code == ASHIFT || outer_code == ASHIFTRT
+	       || outer_code == LSHIFTRT)
+	*total = satisfies_constraint_Iu05 (x) ? 0 : 4;
+      else if (GET_RTX_CLASS (outer_code) == RTX_COMPARE
+	       || GET_RTX_CLASS (outer_code) == RTX_COMM_COMPARE)
+	*total = satisfies_constraint_Is16 (x) ? 0 : 4;
+      else
+	*total = COSTS_N_INSNS (1);
+      return true;
+
+    case CONST:
+    case LO_SUM:
+    case HIGH:
+    case SYMBOL_REF:
+      *total = COSTS_N_INSNS (1);
+      return true;
+
+    case MEM:
+      *total = COSTS_N_INSNS (1);
+      return true;
+
+    case SET:
+      op0 = SET_DEST (x);
+      op1 = SET_SRC (x);
+      mode = GET_MODE (op0);
+      /* Scale cost by mode size.  */
+      cost = COSTS_N_INSNS (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode));
+
+      switch (GET_CODE (op1))
+	{
+	case REG:
+	case SUBREG:
+	  /* Register move and Store instructions.  */
+	  if ((REG_P (op0) || MEM_P (op0))
+	      && GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = cost;
+	  return true;
+
+	case MEM:
+	  /* Load instructions.  */
+	  if (REG_P (op0) && GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = cost;
+	  return true;
+
+	case CONST_INT:
+	  /* movi instruction.  */
+	  if (REG_P (op0) && GET_MODE_SIZE (mode) < GET_MODE_SIZE (DImode))
+	    {
+	      if (satisfies_constraint_Is20 (op1))
+		*total = COSTS_N_INSNS (1) - 1;
+	      else
+		*total = COSTS_N_INSNS (2);
+	    }
+	  else
+	    *total = cost;
+	  return true;
+
+	case CONST:
+	case SYMBOL_REF:
+	case LABEL_REF:
+	  /* la instruction.  */
+	  if (REG_P (op0) && GET_MODE_SIZE (mode) < GET_MODE_SIZE (DImode))
+	    *total = COSTS_N_INSNS (1) - 1;
+	  else
+	    *total = cost;
+	  return true;
+	case VEC_SELECT:
+	  *total = cost;
+	  return true;
+
+	default:
+	  *total = cost;
+	  return true;
+	}
+
+    case PLUS:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (GET_CODE (op0) == MULT || GET_CODE (op0) == LSHIFTRT
+	       || GET_CODE (op1) == MULT || GET_CODE (op1) == LSHIFTRT)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (op1) == CONST_INT
+		&& satisfies_constraint_Is15 (op1))
+		|| REG_P (op1))
+	/* ADD instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* ADD instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case MINUS:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (GET_CODE (op0) == MULT || GET_CODE (op0) == LSHIFTRT
+	       || GET_CODE (op1) == MULT || GET_CODE (op1) == LSHIFTRT)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (op0) == CONST_INT
+		&& satisfies_constraint_Is15 (op0))
+		|| REG_P (op0))
+	/* SUB instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* SUB instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case TRUNCATE:
+      /* TRUNCATE and AND behavior is same. */
+      *total = COSTS_N_INSNS (1);
+      return true;
+
+    case AND:
+    case IOR:
+    case XOR:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+
+      if (NDS32_EXT_DSP_P ())
+	{
+	  /* We prefer (and (ior) (ior)) than (ior (and) (and)) for
+	     synthetize pk** and insb instruction.  */
+	  if (code == AND && GET_CODE (op0) == IOR && GET_CODE (op1) == IOR)
+	    return COSTS_N_INSNS (1);
+
+	  if (code == IOR && GET_CODE (op0) == AND && GET_CODE (op1) == AND)
+	    return COSTS_N_INSNS (10);
+	}
+
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (GET_CODE (op0) == ASHIFT || GET_CODE (op0) == LSHIFTRT)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (op1) == CONST_INT
+	       && satisfies_constraint_Iu15 (op1))
+	       || REG_P (op1))
+	/* AND, OR, XOR instructions */
+	*total = COSTS_N_INSNS (1);
+      else if (code == AND || GET_CODE (op0) == NOT)
+	/* BITC instruction */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* AND, OR, XOR instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;

     case MULT:
+      if (GET_MODE (x) == DImode
+	  || GET_CODE (XEXP (x, 1)) == SIGN_EXTEND
+	  || GET_CODE (XEXP (x, 1)) == ZERO_EXTEND)
+	/* MUL instructions */
+	*total = COSTS_N_INSNS (1);
+      else if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (outer_code == PLUS || outer_code == MINUS)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (XEXP (x, 1)) == CONST_INT
+	       && satisfies_constraint_Iu05 (XEXP (x, 1)))
+	       || REG_P (XEXP (x, 1)))
+	/* MUL instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* MUL instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+
+      if (TARGET_MUL_SLOW)
+	*total += COSTS_N_INSNS (4);
+
+      return true;
+
+    case LSHIFTRT:
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (outer_code == PLUS || outer_code == MINUS
+	       || outer_code == AND || outer_code == IOR
+	       || outer_code == XOR)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (XEXP (x, 1)) == CONST_INT
+	       && satisfies_constraint_Iu05 (XEXP (x, 1)))
+	       || REG_P (XEXP (x, 1)))
+	/* SRL instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* SRL instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case ASHIFT:
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if (outer_code == AND || outer_code == IOR
+	       || outer_code == XOR)
+	{
+	  /* ALU_SHIFT */
+	  if (TARGET_PIPELINE_PANTHER)
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    *total = COSTS_N_INSNS (2);
+	}
+      else if ((GET_CODE (XEXP (x, 1)) == CONST_INT
+	       && satisfies_constraint_Iu05 (XEXP (x, 1)))
+	       || REG_P (XEXP (x, 1)))
+	/* SLL instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* SLL instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case ASHIFTRT:
+    case ROTATERT:
+      if (GET_MODE_SIZE (mode) >= GET_MODE_SIZE (DImode))
+	*total = cost;
+      else if ((GET_CODE (XEXP (x, 1)) == CONST_INT
+	       && satisfies_constraint_Iu05 (XEXP (x, 1)))
+	       || REG_P (XEXP (x, 1)))
+	/* ROTR, SRA instructions */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* ROTR, SRA instructions: IMM out of range.  */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case LT:
+    case LTU:
+      if (outer_code == SET)
+	{
+	  if ((GET_CODE (XEXP (x, 1)) == CONST_INT
+	      && satisfies_constraint_Iu15 (XEXP (x, 1)))
+	      || REG_P (XEXP (x, 1)))
+	    /* SLT, SLTI instructions */
+	    *total = COSTS_N_INSNS (1);
+	  else
+	    /* SLT, SLTI instructions: IMM out of range.  */
+	    *total = COSTS_N_INSNS (2);
+	}
+      else
+	/* branch */
+	*total = COSTS_N_INSNS (2);
+      return true;
+
+    case EQ:
+    case NE:
+    case GE:
+    case LE:
+    case GT:
+      /* branch */
+      *total = COSTS_N_INSNS (2);
+      return true;
+
+    case IF_THEN_ELSE:
+      if (GET_CODE (XEXP (x, 1)) == LABEL_REF)
+	/* branch */
+	*total = COSTS_N_INSNS (2);
+      else
+	/* cmovz, cmovn instructions */
+	*total = COSTS_N_INSNS (1);
+      return true;
+
+    case LABEL_REF:
+      if (outer_code == IF_THEN_ELSE)
+	/* branch */
+	*total = COSTS_N_INSNS (2);
+      else
+	*total = COSTS_N_INSNS (1);
+      return true;
+
+    case ZERO_EXTEND:
+    case SIGN_EXTEND:
+      if (MEM_P (XEXP (x, 0)))
+	/* Using memory access. */
+	*total = COSTS_N_INSNS (1);
+      else
+	/* Zero extend and sign extend instructions.  */
+	*total = COSTS_N_INSNS (1);
+      return true;
+
+    case NEG:
+    case NOT:
       *total = COSTS_N_INSNS (1);
-      break;
+      return true;

     case DIV:
     case UDIV:
     case MOD:
     case UMOD:
-      *total = COSTS_N_INSNS (7);
-      break;
+      *total = COSTS_N_INSNS (20);
+      return true;

-    default:
+    case CALL:
+      *total = COSTS_N_INSNS (2);
+      return true;
+
+    case CLZ:
+    case SMIN:
+    case SMAX:
+    case ZERO_EXTRACT:
+      if (TARGET_EXT_PERF)
+	*total = COSTS_N_INSNS (1);
+      else
+	*total = COSTS_N_INSNS (3);
+      return true;
+    case VEC_SELECT:
       *total = COSTS_N_INSNS (1);
-      break;
-    }
-
-  return true;
-
+      return true;

-size_cost:
-  /* This is section for size cost model.  */
+    default:
+      *total = COSTS_N_INSNS (3);
+      return true;
+    }
+}

+static bool
+nds32_rtx_costs_size_prefer (rtx x,
+			     int code,
+			     int outer_code,
+			     int opno ATTRIBUTE_UNUSED,
+			     int *total)
+{
   /* In gcc/rtl.h, the default value of COSTS_N_INSNS(N) is N*4.
      We treat it as 4-byte cost for each instruction
      under code size consideration.  */
@@ -98,7 +472,7 @@ size_cost:
     {
     case SET:
       /* For 'SET' rtx, we need to return false
-         so that it can recursively calculate costs.  */
+	 so that it can recursively calculate costs.  */
       return false;

     case USE:
@@ -108,92 +482,169 @@ size_cost:

     case CONST_INT:
       /* All instructions involving constant operation
-         need to be considered for cost evaluation.  */
+	 need to be considered for cost evaluation.  */
       if (outer_code == SET)
 	{
 	  /* (set X imm5s), use movi55, 2-byte cost.
 	     (set X imm20s), use movi, 4-byte cost.
 	     (set X BIG_INT), use sethi/ori, 8-byte cost.  */
 	  if (satisfies_constraint_Is05 (x))
-	    *total = COSTS_N_INSNS (1) - 2;
+	    *total = insn_size_16bit;
 	  else if (satisfies_constraint_Is20 (x))
-	    *total = COSTS_N_INSNS (1);
+	    *total = insn_size_32bit;
 	  else
-	    *total = COSTS_N_INSNS (2);
+	    *total = insn_size_32bit * 2;
 	}
       else if (outer_code == PLUS || outer_code == MINUS)
 	{
 	  /* Possible addi333/subi333 or subi45/addi45, 2-byte cost.
 	     General case, cost 1 instruction with 4-byte.  */
 	  if (satisfies_constraint_Iu05 (x))
-	    *total = COSTS_N_INSNS (1) - 2;
+	    *total = insn_size_16bit;
 	  else
-	    *total = COSTS_N_INSNS (1);
+	    *total = insn_size_32bit;
 	}
       else if (outer_code == ASHIFT)
 	{
 	  /* Possible slli333, 2-byte cost.
 	     General case, cost 1 instruction with 4-byte.  */
 	  if (satisfies_constraint_Iu03 (x))
-	    *total = COSTS_N_INSNS (1) - 2;
+	    *total = insn_size_16bit;
 	  else
-	    *total = COSTS_N_INSNS (1);
+	    *total = insn_size_32bit;
 	}
       else if (outer_code == ASHIFTRT || outer_code == LSHIFTRT)
 	{
 	  /* Possible srai45 or srli45, 2-byte cost.
 	     General case, cost 1 instruction with 4-byte.  */
 	  if (satisfies_constraint_Iu05 (x))
-	    *total = COSTS_N_INSNS (1) - 2;
+	    *total = insn_size_16bit;
 	  else
-	    *total = COSTS_N_INSNS (1);
+	    *total = insn_size_32bit;
 	}
       else
 	{
 	  /* For other cases, simply set it 4-byte cost.  */
-	  *total = COSTS_N_INSNS (1);
+	  *total = insn_size_32bit;
 	}
       break;

     case CONST_DOUBLE:
       /* It requires high part and low part processing, set it 8-byte cost.  */
-      *total = COSTS_N_INSNS (2);
+      *total = insn_size_32bit * 2;
+      break;
+
+    case CONST:
+    case SYMBOL_REF:
+      *total = insn_size_32bit * 2;
       break;

     default:
       /* For other cases, generally we set it 4-byte cost
-         and stop resurively traversing.  */
-      *total = COSTS_N_INSNS (1);
+	 and stop resurively traversing.  */
+      *total = insn_size_32bit;
       break;
     }

   return true;
 }

-int
-nds32_address_cost_impl (rtx address,
-			 machine_mode mode ATTRIBUTE_UNUSED,
-			 addr_space_t as ATTRIBUTE_UNUSED,
-			 bool speed)
+void
+nds32_init_rtx_costs (void)
+{
+  /* Cost charged where a 16-bit encoding may be used: 2 when TARGET_16_BIT
+     holds, otherwise 4.  */
+  insn_size_16bit = TARGET_16_BIT ? 2 : 4;
+
+  /* Install the cost callbacks kept in rtx_cost_model.  */
+  rtx_cost_model.size_prefer  = nds32_rtx_costs_size_prefer;
+  rtx_cost_model.speed_prefer = nds32_rtx_costs_speed_prefer;
+}
+
+/* Implementation of the rtx-cost target hook: estimate the cost of X.
+   The work is delegated to the speed or size callback of rtx_cost_model
+   and its result is returned unchanged: 'true' when all subexpressions of
+   X have been processed, 'false' to have the costs of sub-rtx summed.  */
+bool
+nds32_rtx_costs_impl (rtx x,
+		      machine_mode mode ATTRIBUTE_UNUSED,
+		      int outer_code,
+		      int opno,
+		      int *total,
+		      bool speed)
+{
+  int code = GET_CODE (x);
+
+  /* Not optimizing for speed: use the size-oriented model.  */
+  if (!speed)
+    return rtx_cost_model.size_prefer (x, code, outer_code, opno, total);
+
+  return rtx_cost_model.speed_prefer (x, code, outer_code, opno, total);
+}
+
+
+int nds32_address_cost_speed_prefer (rtx address)
 {
   rtx plus0, plus1;
   enum rtx_code code;

   code = GET_CODE (address);

-  /* According to 'speed', goto suitable cost model section.  */
-  if (speed)
-    goto performance_cost;
-  else
-    goto size_cost;
+  switch (code)
+    {
+    case POST_MODIFY:
+    case POST_INC:
+    case POST_DEC:
+      /* We encourage that rtx contains
+	 POST_MODIFY/POST_INC/POST_DEC behavior.  */
+      return COSTS_N_INSNS (1) - 2;
+
+    case SYMBOL_REF:
+      /* We can have gp-relative load/store for symbol_ref.
+	 Cost it as two instructions.  */
+      return COSTS_N_INSNS (2);
+
+    case CONST:
+      /* It is supposed to be the pattern (const (plus symbol_ref const_int)).
+	 Cost it as two instructions.  */
+      return COSTS_N_INSNS (2);
+
+    case REG:
+      /* A plain register costs slightly less than one instruction.  */
+      return COSTS_N_INSNS (1) - 2;
+
+    case PLUS:
+      /* We do not need to check if the address is a legitimate address,
+	 because this hook is never called with an invalid address.
+	 The value of a const_int offset is not examined here; only the
+	 shape of the two operands selects the cost.  */
+      plus0 = XEXP (address, 0);
+      plus1 = XEXP (address, 1);
+
+      if (REG_P (plus0) && CONST_INT_P (plus1))
+	return COSTS_N_INSNS (1) - 2;
+      else if (ARITHMETIC_P (plus0) || ARITHMETIC_P (plus1))
+	return COSTS_N_INSNS (1) - 1;
+      else if (REG_P (plus0) && REG_P (plus1))
+	return COSTS_N_INSNS (1);
+
+      /* For other 'plus' situation, cost it as one instruction.  */
+      return COSTS_N_INSNS (1);

-performance_cost:
-  /* This is section for performance cost model.  */
+    default:
+      break;
+    }

-  /* FALLTHRU, currently we use same cost model as size_cost.  */
+  return COSTS_N_INSNS (4);

-size_cost:
-  /* This is section for size cost model.  */
+}
+
+int nds32_address_cost_speed_fwprop (rtx address)
+{
+  rtx plus0, plus1;
+  enum rtx_code code;
+
+  code = GET_CODE (address);

   switch (code)
     {
@@ -201,18 +652,18 @@ size_cost:
     case POST_INC:
     case POST_DEC:
       /* We encourage that rtx contains
-         POST_MODIFY/POST_INC/POST_DEC behavior.  */
+	 POST_MODIFY/POST_INC/POST_DEC behavior.  */
       return 0;

     case SYMBOL_REF:
       /* We can have gp-relative load/store for symbol_ref.
-         Have it 4-byte cost.  */
-      return COSTS_N_INSNS (1);
+	 Have it 4-byte cost.  */
+      return COSTS_N_INSNS (2);

     case CONST:
       /* It is supposed to be the pattern (const (plus symbol_ref const_int)).
-         Have it 4-byte cost.  */
-      return COSTS_N_INSNS (1);
+	 Have it 4-byte cost.  */
+      return COSTS_N_INSNS (2);

     case REG:
       /* Simply return 4-byte costs.  */
@@ -220,21 +671,25 @@ size_cost:

     case PLUS:
       /* We do not need to check if the address is a legitimate address,
-         because this hook is never called with an invalid address.
-         But we better check the range of
-         const_int value for cost, if it exists.  */
+	 because this hook is never called with an invalid address.
+	 But we better check the range of
+	 const_int value for cost, if it exists.  */
       plus0 = XEXP (address, 0);
       plus1 = XEXP (address, 1);

       if (REG_P (plus0) && CONST_INT_P (plus1))
-        {
+	{
 	  /* If it is possible to be lwi333/swi333 form,
 	     make it 2-byte cost.  */
-	  if (satisfies_constraint_Iu05 (plus1))
+	  if (satisfies_constraint_Iu03 (plus1))
 	    return (COSTS_N_INSNS (1) - 2);
 	  else
 	    return COSTS_N_INSNS (1);
 	}
+      if (ARITHMETIC_P (plus0) || ARITHMETIC_P (plus1))
+	return COSTS_N_INSNS (1) - 2;
+      else if (REG_P (plus0) && REG_P (plus1))
+	return COSTS_N_INSNS (1);

       /* For other 'plus' situation, make it cost 4-byte.  */
       return COSTS_N_INSNS (1);
@@ -246,4 +701,84 @@ size_cost:
   return COSTS_N_INSNS (4);
 }

+
+/* Address cost model used when optimizing for size.  The cost of ADDRESS
+   depends only on its top-level rtx code and, for PLUS, on the shape of
+   its two operands.  POST_* forms are free; address forms not listed
+   here cost four instructions.  */
+int nds32_address_cost_size_prefer (rtx address)
+{
+  rtx base, offset;
+
+  switch (GET_CODE (address))
+    {
+    case POST_MODIFY:
+    case POST_INC:
+    case POST_DEC:
+      /* We encourage that rtx contains POST_MODIFY/POST_INC/POST_DEC
+	 behavior, hence no cost at all.  */
+      return 0;
+
+    case SYMBOL_REF:
+    case CONST:
+      /* SYMBOL_REF can have gp-relative load/store.  CONST is supposed to
+	 be the pattern (const (plus symbol_ref const_int)).  Both are
+	 costed as two instructions.  */
+      return COSTS_N_INSNS (2);
+
+    case REG:
+      /* A plain register address is costed slightly below one
+	 instruction.  */
+      return COSTS_N_INSNS (1) - 1;
+
+    case PLUS:
+      /* Costed below, once the two operands have been fetched.  */
+      break;
+
+    default:
+      /* Every remaining address form is costed as four
+	 instructions.  */
+      return COSTS_N_INSNS (4);
+    }
+
+  /* Only PLUS gets here.  We do not need to check if the address is a
+     legitimate address, because this hook is never called with an invalid
+     address, but the range of a const_int offset matters for the cost.  */
+  base = XEXP (address, 0);
+  offset = XEXP (address, 1);
+
+  if (REG_P (base) && CONST_INT_P (offset))
+    {
+      /* If it is possible to be lwi333/swi333 form (offset accepted by
+	 Iu03), make it cheaper still.  */
+      if (satisfies_constraint_Iu03 (offset))
+	return COSTS_N_INSNS (1) - 2;
+
+      return COSTS_N_INSNS (1) - 1;
+    }
+
+  /* (plus (reg) (mult (reg) (const))) */
+  if (ARITHMETIC_P (base) || ARITHMETIC_P (offset))
+    return COSTS_N_INSNS (1) - 1;
+
+  /* For other 'plus' situation, cost one full instruction.  */
+  return COSTS_N_INSNS (1);
+}
+
+int nds32_address_cost_impl (rtx address,
+			     enum machine_mode mode ATTRIBUTE_UNUSED,
+			     addr_space_t as ATTRIBUTE_UNUSED,
+			     bool speed_p)
+{
+  /* Address-cost hook body: pick the model from SPEED_P; under speed the
+     forward propagation pass (TV_FWPROP) has a model of its own.  */
+  if (!speed_p)
+    return nds32_address_cost_size_prefer (address);
+  /* Do not assume a pass is running: with a null current_pass use the
+     generic speed model instead of dereferencing the pointer.  */
+  if (current_pass && current_pass->tv_id == TV_FWPROP)
+    return nds32_address_cost_speed_fwprop (address);
+  return nds32_address_cost_speed_prefer (address);
+}
+
 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-cprop-acc.c b/gcc/config/nds32/nds32-cprop-acc.c
new file mode 100644
index 0000000..0852095
--- /dev/null
+++ b/gcc/config/nds32/nds32-cprop-acc.c
@@ -0,0 +1,845 @@
+/* Copy propagation on hard registers for accumulate style instruction.
+   Copyright (C) 2000-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "insn-config.h"
+#include "regs.h"
+#include "addresses.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "reload.h"
+#include "hash-set.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "function.h"
+#include "recog.h"
+#include "cfgrtl.h"
+#include "flags.h"
+#include "diagnostic-core.h"
+#include "obstack.h"
+#include "tree-pass.h"
+#include "bitmap.h"
+#include "df.h"
+#include "output.h"
+#include "emit-rtl.h"
+#include <vector>
+
+/* For each move instruction, we have a two-dimensional vector that records
+   which insns need their operands replaced when the move instruction is
+   propagated.  */
+
+typedef std::vector<rtx_insn *> insn_list;
+
+/* Data handed through note_uses to nds32_replace_src_operands_1.  */
+
+struct replace_src_operands_data
+{
+  rtx dst_reg;			/* Destination of the propagated move.  */
+  rtx src_reg;			/* Source of the propagated move.  */
+  unsigned int old_regno;	/* Register number to be replaced.  */
+  unsigned int new_regno;	/* Register number to use instead.  */
+  rtx_insn *insn;		/* Insn the changes are recorded against.  */
+};
+
+/* Tell whether a register set in ORIG_MODE may be accessed in NEW_MODE:
+   never when NEW_MODE is wider.  Adapted from mode_change_ok in regcprop.  */
+
+static bool
+nds32_mode_change_ok (enum machine_mode orig_mode, enum machine_mode new_mode,
+		      unsigned int regno ATTRIBUTE_UNUSED)
+{
+  if (GET_MODE_SIZE (new_mode) > GET_MODE_SIZE (orig_mode))
+    return false;
+#ifdef CANNOT_CHANGE_MODE_CLASS
+  if (REG_CANNOT_CHANGE_MODE_P (regno, orig_mode, new_mode))
+    return false;
+#endif
+
+  return true;
+}
+
+/* Register REGNO was originally set in ORIG_MODE.  It - or a copy of it -
+   was copied in COPY_MODE to COPY_REGNO, and then COPY_REGNO was accessed
+   in NEW_MODE.  Return a fresh NEW_MODE REG for REGNO, or NULL_RTX when the
+   copy was too narrow, the mode change is not allowed, or the adjusted
+   register fails HARD_REGNO_MODE_OK.  Adapted from maybe_mode_change.  */
+
+static rtx
+nds32_mode_change_reg (enum machine_mode orig_mode, enum machine_mode copy_mode,
+		       enum machine_mode new_mode, unsigned int regno,
+		       unsigned int copy_regno ATTRIBUTE_UNUSED)
+{
+  if (GET_MODE_SIZE (copy_mode) < GET_MODE_SIZE (orig_mode)
+      && GET_MODE_SIZE (copy_mode) < GET_MODE_SIZE (new_mode))
+    return NULL_RTX;
+  /* Identical modes need no subreg offset; REGNO is reused as is.  */
+  if (orig_mode == new_mode)
+    return gen_raw_REG (new_mode, regno);
+  else if (nds32_mode_change_ok (orig_mode, new_mode, regno))
+    {
+      int copy_nregs = hard_regno_nregs[copy_regno][copy_mode];
+      int use_nregs = hard_regno_nregs[copy_regno][new_mode];
+      int copy_offset
+	= GET_MODE_SIZE (copy_mode) / copy_nregs * (copy_nregs - use_nregs);
+      int offset
+	= GET_MODE_SIZE (orig_mode) - GET_MODE_SIZE (new_mode) - copy_offset;
+      int byteoffset = offset % UNITS_PER_WORD;
+      int wordoffset = offset - byteoffset;
+      /* Keep only the parts of the offset that apply on big-endian.  */
+      offset = ((WORDS_BIG_ENDIAN ? wordoffset : 0)
+		+ (BYTES_BIG_ENDIAN ? byteoffset : 0));
+      regno += subreg_regno_offset (regno, orig_mode, offset, new_mode);
+      if (HARD_REGNO_MODE_OK (regno, new_mode))
+	return gen_raw_REG (new_mode, regno);
+    }
+  return NULL_RTX;
+}
+
+/* Return true if the pattern of INSN is a single SET that copies one
+   register into another in a mode accepted by can_copy_p.  */
+
+static bool
+nds32_is_reg_mov_p (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+
+  if (GET_CODE (body) != SET)
+    return false;
+
+  rtx dest = SET_DEST (body);
+  rtx source = SET_SRC (body);
+
+  if (!REG_P (dest) || !REG_P (source))
+    return false;
+
+  return can_copy_p (GET_MODE (dest));
+}
+
+
+/* Return the accumulated register if INSN is an accumulate style instruction
+   (one operand is tied to another by its constraint), else NULL_RTX.  */
+
+static rtx
+nds32_is_acc_insn_p (rtx_insn *insn)
+{
+  int i;
+  const operand_alternative *op_alt;
+  rtx pat;
+  /* Only insns of length 4 whose pattern is a plain SET are considered.  */
+  if (get_attr_length (insn) != 4)
+    return NULL_RTX;
+
+  pat = PATTERN (insn);
+  if (GET_CODE (pat) != SET)
+    return NULL_RTX;
+
+  /* Try to get the insn data from recog_data.  */
+  recog_memoized (insn);
+  extract_constrain_insn (insn);
+  /* Transform the constraint strings into a more usable form,
+     recog_op_alt.  */
+  preprocess_constraints (insn);
+  op_alt = which_op_alt ();
+
+  /* Look for an operand tied to another one (matches/matched) where at
+     least one of the pair is not a pure input (OP_IN), and return it.  */
+  for (i = 0; i < recog_data.n_operands; ++i)
+    {
+      int matches = op_alt[i].matches;
+      int matched = op_alt[i].matched;
+      if ((matches >= 0
+	   && (recog_data.operand_type[i] != OP_IN
+	       || recog_data.operand_type[matches] != OP_IN))
+	  || (matched >= 0
+	      && (recog_data.operand_type[i] != OP_IN
+		  || recog_data.operand_type[matched] != OP_IN)))
+	return recog_data.operand[i];
+    }
+
+  return NULL_RTX;
+}
+
+/* Look through the definitions recorded for INSN and return the one whose
+   register number is REGNO, or NULL when INSN has no such definition.
+   Adapted from df_find_def in df-core.  */
+
+static df_ref
+nds32_df_find_regno_def (rtx_insn *insn, unsigned int regno)
+{
+  df_ref candidate;
+
+  FOR_EACH_INSN_DEF (candidate, insn)
+    if (regno == DF_REF_REGNO (candidate))
+      return candidate;
+
+  return NULL;
+}
+
+/* Return true if every use in INSN of a register covered by REG is reached
+   by exactly one definition, located in the insn whose uid is DEF_UID.
+   A use with no, several, or an artificial definition yields false.  */
+static bool
+nds32_is_single_def_p (rtx_insn *insn, rtx reg, unsigned int def_uid)
+{
+  df_ref use;
+
+  FOR_EACH_INSN_USE (use, insn)
+    {
+      df_link *link;
+      unsigned int uid;
+
+      if (DF_REF_REGNO (use) >= REGNO (reg)
+	  && DF_REF_REGNO (use) < END_REGNO (reg))
+	{
+	  link = DF_REF_CHAIN (use);
+	  /* An empty chain means no definition was recorded for this use;
+	     treat it as "not single-def" rather than dereferencing NULL.  */
+	  if (!link || link->next || DF_REF_IS_ARTIFICIAL (link->ref))
+	    return false;
+	  uid = DF_REF_INSN_UID (link->ref);
+	  if (uid != def_uid)
+	    return false;
+	}
+    }
+
+  return true;
+}
+
+/* Return true if there is no definition of REG on any path from the insn
+   whose uid is FROM_UID (called FROM) to insn TO, otherwise return false.
+   This function computes the reaching definitions bitmap just before TO
+   and checks that every ref chained to FROM's uses of REG is still in it.  */
+
+static bool
+nds32_no_define_reg_p (rtx to, rtx reg, unsigned int from_uid)
+{
+  basic_block bb = BLOCK_FOR_INSN (to);
+  struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (bb);
+  bitmap_head rd_local;
+  bool result = true;
+  rtx_insn *insn;
+  df_ref use;
+  df_insn_info *insn_info;
+  /* Start from the block's RD-in set and walk forwards up to TO.  */
+  bitmap_initialize (&rd_local, &bitmap_default_obstack);
+  bitmap_copy (&rd_local, &bb_info->in);
+  df_rd_simulate_artificial_defs_at_top (bb, &rd_local);
+
+  for (insn = BB_HEAD (bb); insn != to; insn = NEXT_INSN (insn))
+    if (INSN_P (insn))
+      df_rd_simulate_one_insn (bb, insn, &rd_local);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "scan reach define:");
+      print_rtl_single (dump_file, to);
+
+      fprintf (dump_file, "bb rd in:\n");
+      dump_bitmap (dump_file, &bb_info->in);
+
+      fprintf (dump_file, "reach def:\n");
+      dump_bitmap (dump_file, &rd_local);
+    }
+  /* Every ref chained to FROM's uses of REG must be in that set.  */
+  insn_info = DF_INSN_UID_GET (from_uid);
+  FOR_EACH_INSN_INFO_USE (use, insn_info)
+    {
+      df_link *link;
+
+      if (DF_REF_REGNO (use) >= REGNO (reg)
+	  && DF_REF_REGNO (use) < END_REGNO (reg))
+	for (link = DF_REF_CHAIN (use); link; link = link->next)
+	  {
+	    if (dump_file)
+	      {
+		fprintf (dump_file, "use ID %d\n", DF_REF_ID (link->ref));
+		if (DF_REF_IS_ARTIFICIAL (link->ref))
+		  fprintf (dump_file, "use ref is artificial\n");
+		else
+		  {
+		    fprintf (dump_file, "use from insn:");
+		    print_rtl_single (dump_file, DF_REF_INSN (link->ref));
+		  }
+	      }
+	    result &=
+	      (bitmap_bit_p (&rd_local, DF_REF_ID (link->ref)))
+	      ? true
+	      : false;
+	  }
+    }
+
+  bitmap_clear (&rd_local);
+  return result;
+}
+
+/* Tell whether REG is dead immediately before INSN: none of the registers
+   it covers is in the live set obtained by simulating INSN's basic block
+   forwards from its LR-in set up to, but not including, INSN.  */
+
+static bool
+nds32_is_dead_reg_p (rtx_insn *insn, rtx reg)
+{
+  basic_block block = BLOCK_FOR_INSN (insn);
+  bitmap live_regs = BITMAP_ALLOC (&reg_obstack);
+  bool dead = true;
+  rtx_insn *cursor;
+  unsigned int regno;
+
+  bitmap_copy (live_regs, DF_LR_IN (block));
+  df_simulate_initialize_forwards (block, live_regs);
+  for (cursor = BB_HEAD (block); cursor != insn; cursor = NEXT_INSN (cursor))
+    df_simulate_one_insn_forwards (block, cursor, live_regs);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "scan live regs:");
+      print_rtl_single (dump_file, insn);
+      fprintf (dump_file, "bb lr in:\n");
+      dump_bitmap (dump_file, DF_LR_IN (block));
+      fprintf (dump_file, "live:\n");
+      dump_bitmap (dump_file, live_regs);
+    }
+
+  /* A single live register in REG's range is enough to keep REG alive.  */
+  for (regno = REGNO (reg); regno < END_REGNO (reg); ++regno)
+    if (bitmap_bit_p (live_regs, regno))
+      dead = false;
+
+  BITMAP_FREE (live_regs);
+  return dead;
+}
+
+/* Return true if START can do propagation.  Notice START may be a move
+   instruction or an accumulate style instruction.
+   MOV_UID is the uid of the beginning move instruction; it is only used
+   by function nds32_no_define_reg_p.
+   DST_REG & SRC_REG are the SET_DEST and SET_SRC of a move instruction,
+   which may be real or unreal (see the recursion below).
+   INDEX selects which of the consecutive hard registers of DST_REG is
+   currently considered; it is also the row of INSN_LISTS that receives
+   the insns using that register.  */
+
+static bool
+nds32_can_cprop_acc_1 (rtx_insn *start, unsigned int mov_uid,
+		       rtx dst_reg, rtx src_reg,
+		       unsigned int index,
+		       std::vector<insn_list> &insn_lists)
+{
+  unsigned int lead_regno = REGNO (dst_reg) + index;
+  unsigned int new_regno = REGNO (src_reg) + index;
+  df_ref def_rec;
+  df_link *link;
+
+  def_rec = nds32_df_find_regno_def (start, lead_regno);
+  gcc_assert (def_rec);
+
+  for (link = DF_REF_CHAIN (def_rec); link; link = link->next)
+    {
+      rtx *use_loc;
+      unsigned int use_regno;
+      enum machine_mode use_mode;
+      rtx_insn *use_insn;
+      rtx acc_reg, new_src;
+
+      if (DF_REF_IS_ARTIFICIAL (link->ref))
+	return false;
+
+      use_loc = DF_REF_LOC (link->ref);
+      gcc_assert (use_loc && REG_P (*use_loc));
+
+      use_regno = REGNO (*use_loc);
+      /* Do not propagate when a use refers to a register whose number is
+	 smaller than that of DST_REG.  */
+      if (use_regno < REGNO (dst_reg))
+	return false;
+
+      /* Registers below LEAD_REGNO were handled by an earlier INDEX.  */
+      if (use_regno < lead_regno)
+	continue;
+
+      /* Do not propagate because not all of the pieces of the copy came
+	 from DST_REG.  */
+      if (END_REGNO (*use_loc) > END_REGNO (dst_reg))
+	return false;
+
+      use_insn = DF_REF_INSN (link->ref);
+      /* Do not propagate since call-used registers can't be replaced.  */
+      if (CALL_P (use_insn))
+	return false;
+
+      /* Do not replace in asms intentionally referencing hard registers.  */
+      if (asm_noperands (PATTERN (use_insn)) >= 0
+	  && use_regno == ORIGINAL_REGNO (*use_loc))
+	return false;
+
+      /* Do not propagate when the register is defined by more than one
+	 instruction.  */
+      if (!nds32_is_single_def_p (use_insn, *use_loc, INSN_UID (start)))
+	return false;
+
+      use_mode = GET_MODE (*use_loc);
+      new_src = nds32_mode_change_reg (GET_MODE (src_reg),
+				       GET_MODE (dst_reg),
+				       use_mode,
+				       new_regno,
+				       use_regno);
+      /* Do not propagate if we can't generate a new register with new mode.  */
+      if (!new_src)
+	return false;
+
+      /* Can not replace DST_REG with SRC_REG when SRC_REG is redefined between
+	 START and use insn of START.  */
+      if (!nds32_no_define_reg_p (use_insn, new_src, mov_uid))
+	return false;
+
+      acc_reg = nds32_is_acc_insn_p (use_insn);
+      /* Handle the accumulate style instruction whose accumulate register
+	 may be replaced.
+	 Also handle the AUTO_INC register, which is another form of
+	 accumulated register.  */
+      if ((acc_reg && rtx_equal_p (acc_reg, *use_loc))
+	  || FIND_REG_INC_NOTE (use_insn, *use_loc))
+	{
+	  unsigned int i, use_nregs;
+
+	  /* ACC_REG can't be replaced unless NEW_SRC is dead before
+	     USE_INSN, because the insn would then overwrite NEW_SRC.  */
+	  if (!nds32_is_dead_reg_p (use_insn, new_src))
+	    return false;
+	  /* Once we confirm that ACC_REG can be replaced, the unreal move
+	     instruction is generated.  For example:
+	     mov   r0, r1	   mov   r0, r1
+	     cmovn r0, r2, r3  ->  cmovn r1, r2, r3
+				   mov   r0, r1
+	     If the unreal move instruction can do propagation, the ACC_REG
+	     can be replaced.  We check it in a recursive way.
+	     NOTE(review): rows restart at 0 here, not at INDEX - confirm.  */
+	  use_nregs = hard_regno_nregs [use_regno][(int) use_mode];
+	  for (i = 0; i < use_nregs; ++i)
+	    if (!nds32_can_cprop_acc_1 (use_insn, mov_uid,
+					*use_loc, new_src,
+					i, insn_lists))
+	      return false;
+	}
+      insn_lists[index].push_back (use_insn);
+    }
+
+  return true;
+}
+
+/* Decide whether the register move MOV can be propagated.  INSN_LISTS gets
+   one row per hard register of MOV's destination; nds32_can_cprop_acc_1
+   checks and fills each row, and the first failing row makes this false.  */
+static bool
+nds32_can_cprop_acc (rtx_insn *mov, std::vector<insn_list> &insn_lists)
+{
+  rtx pattern = PATTERN (mov);
+  rtx dest = SET_DEST (pattern);
+  rtx source = SET_SRC (pattern);
+  unsigned int uid = INSN_UID (mov);
+  unsigned int nregs = hard_regno_nregs[REGNO (dest)][(int) GET_MODE (dest)];
+  unsigned int row = 0;
+
+  insn_lists.resize (nregs);
+  while (row < nregs)
+    {
+      if (!nds32_can_cprop_acc_1 (mov, uid, dest, source, row, insn_lists))
+	return false;
+      ++row;
+    }
+  return true;
+}
+
+/* Replace every register in *LOC whose number is OLD_REGNO with a register
+   numbered from NEW_REGNO.  LOC may be a part of INSN.
+   DST_REG & SRC_REG only supply modes to nds32_mode_change_reg.
+   Each change is recorded with validate_change (in-group) against INSN.  */
+
+static void
+nds32_replace_partial_operands (rtx *loc, rtx dst_reg, rtx src_reg,
+				unsigned int old_regno, unsigned int new_regno,
+				rtx_insn *insn)
+{
+  int i, j;
+  rtx x = *loc;
+  enum rtx_code code;
+  const char *fmt;
+
+  if (!x)
+    return;
+
+  code = GET_CODE (x);
+  fmt = GET_RTX_FORMAT (code);
+
+  if (REG_P (x) && REGNO (x) == old_regno)
+    {
+      rtx new_reg = nds32_mode_change_reg (GET_MODE (src_reg),
+					   GET_MODE (dst_reg),
+					   GET_MODE (x),
+					   new_regno,
+					   old_regno);
+
+      gcc_assert (new_reg);
+      /* Keep the bookkeeping fields of the register being replaced.  */
+      ORIGINAL_REGNO (new_reg) = ORIGINAL_REGNO (x);
+      REG_ATTRS (new_reg) = REG_ATTRS (x);
+      REG_POINTER (new_reg) = REG_POINTER (x);
+
+      /* ??? unshare or not?  */
+      validate_change (insn, loc, new_reg, 1);
+      return;
+    }
+
+  /* Otherwise recurse into every 'e' and 'E' operand of X, last first.  */
+  for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'e')
+	nds32_replace_partial_operands (&XEXP (x, i), dst_reg, src_reg,
+					old_regno, new_regno, insn);
+      else if (fmt[i] == 'E') /* ??? how about V?  */
+	for (j = XVECLEN (x, i) - 1; j >= 0; j--)
+	  nds32_replace_partial_operands (&XVECEXP (x, i, j), dst_reg, src_reg,
+					  old_regno, new_regno, insn);
+    }
+}
+
+/* Rewrite OLD_REGNO as NEW_REGNO in the whole pattern of INSN.  */
+static void
+nds32_replace_all_operands (rtx dst_reg, rtx src_reg,
+			    unsigned int old_regno, unsigned int new_regno,
+			    rtx_insn *insn)
+{
+  rtx *whole_pattern = &PATTERN (insn);
+  nds32_replace_partial_operands (whole_pattern, dst_reg, src_reg,
+				  old_regno, new_regno, insn);
+}
+
+/* note_uses callback for nds32_replace_src_operands: apply the replacement
+   described by DATA (a replace_src_operands_data) inside the use at LOC.  */
+static void
+nds32_replace_src_operands_1 (rtx *loc, void *data)
+{
+  replace_src_operands_data *args
+    = static_cast<replace_src_operands_data *> (data);
+
+  nds32_replace_partial_operands (loc, args->dst_reg, args->src_reg,
+				  args->old_regno, args->new_regno,
+				  args->insn);
+}
+
+/* Rewrite OLD_REGNO as NEW_REGNO only where INSN uses it: the pattern is
+   walked with note_uses, avoiding SET_DESTs.  */
+static void
+nds32_replace_src_operands (rtx dst_reg, rtx src_reg,
+			    unsigned int old_regno, unsigned int new_regno,
+			    rtx_insn *insn)
+{
+  replace_src_operands_data request = {
+    dst_reg, src_reg, old_regno, new_regno, insn
+  };
+
+  note_uses (&PATTERN (insn), nds32_replace_src_operands_1, &request);
+}
+
+/* Replace DST_REG (and each of its consecutive hard registers) by the
+   matching register of SRC_REG in every insn of INSN_LISTS; row I pairs
+   REGNO (DST_REG) + I with REGNO (SRC_REG) + I.  Return whether the queued
+   changes were applied; if so, dataflow information is recomputed.  */
+
+static bool
+nds32_try_replace_operands (rtx dst_reg, rtx src_reg,
+			    std::vector<insn_list> &insn_lists)
+{
+  const unsigned int first_old = REGNO (dst_reg);
+  const unsigned int first_new = REGNO (src_reg);
+
+  for (unsigned int row = 0; row < insn_lists.size (); ++row)
+    {
+      insn_list &users = insn_lists[row];
+      unsigned int old_regno = first_old + row;
+      unsigned int new_regno = first_new + row;
+
+      for (unsigned int k = 0; k < users.size (); ++k)
+	{
+	  rtx_insn *insn = users[k];
+	  rtx acc_reg = nds32_is_acc_insn_p (insn);
+
+	  if (acc_reg && REGNO (acc_reg) == old_regno)
+	    {
+	      /* Replace OP_OUT & OP_INOUT  */
+	      nds32_replace_all_operands (dst_reg, src_reg,
+					  old_regno, new_regno, insn);
+	    }
+	  else
+	    {
+	      /* Replace OP_IN  */
+	      nds32_replace_src_operands (dst_reg, src_reg,
+					  old_regno, new_regno, insn);
+	    }
+	}
+    }
+
+  if (!apply_change_group ())
+    return false;
+
+  df_analyze ();
+  return true;
+}
+
+/* For each move in WORK_LIST accepted by nds32_can_cprop_acc, try to replace
+   operands via nds32_try_replace_operands.  Return how many succeeded.  */
+
+static int
+nds32_do_cprop_acc (auto_vec<rtx_insn *> &work_list)
+{
+  int n_replace = 0;
+  int i;
+  rtx_insn *mov;
+  std::vector<insn_list> insn_lists;
+
+  FOR_EACH_VEC_ELT (work_list, i, mov)
+    {
+      if (nds32_can_cprop_acc (mov, insn_lists))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "\n [CPROP_ACC] insn %d will be cprop. \n",
+		     INSN_UID (mov));
+
+	  if (nds32_try_replace_operands (SET_DEST (PATTERN (mov)),
+					  SET_SRC (PATTERN (mov)),
+					  insn_lists))
+	    n_replace++;
+	}
+      insn_lists.clear ();
+    }
+
+  return n_replace;
+}
+
+/* Return true if the reg-to-reg move MOV (asserted below) passes every
+   check required for propagation, otherwise return false.  */
+
+static bool
+nds32_is_target_mov_p (rtx mov)
+{
+  rtx dst = SET_DEST (PATTERN (mov));
+  rtx src = SET_SRC (PATTERN (mov));
+  unsigned int dst_regno, src_regno;
+  unsigned int dst_nregs, src_nregs;
+  bool dst_is_general, src_is_general;
+
+  gcc_assert (REG_P (dst) && REG_P (src));
+
+  dst_regno = REGNO (dst);
+  src_regno = REGNO (src);
+  dst_nregs = hard_regno_nregs[dst_regno][GET_MODE (dst)];
+  src_nregs = hard_regno_nregs[src_regno][GET_MODE (src)];
+
+  /* Do not propagate to the stack pointer, as that can leave memory accesses
+     with no scheduling dependency on the stack update.
+     Adapted from regcprop.  */
+  if (dst_regno == STACK_POINTER_REGNUM)
+    return false;
+
+  /* Likewise with the frame pointer, if we're using one.
+     Adapted from regcprop.  */
+  if (frame_pointer_needed && dst_regno == HARD_FRAME_POINTER_REGNUM)
+    return false;
+
+  /* Do not propagate to fixed or global registers: patterns may rely on
+     seeing a particular fixed register, and users may expect the chosen
+     global register in asm.
+     Adapted from regcprop.  */
+  if (fixed_regs[dst_regno] || global_regs[dst_regno])
+    return false;
+
+  /* Make sure all the consecutive registers of SET_DEST are defined by
+     SET_SRC, i.e. SET_DEST does not span more registers than SET_SRC.  */
+  if (dst_nregs > src_nregs)
+    return false;
+
+  /* Narrowing on big endian would result in an invalid transformation.  */
+  if (dst_nregs < src_nregs
+      && (GET_MODE_SIZE (GET_MODE (src)) > UNITS_PER_WORD
+	  ? WORDS_BIG_ENDIAN : BYTES_BIG_ENDIAN))
+    return false;
+
+  dst_is_general = in_hard_reg_set_p (reg_class_contents[GENERAL_REGS],
+				      GET_MODE (dst), REGNO (dst));
+  src_is_general = in_hard_reg_set_p (reg_class_contents[GENERAL_REGS],
+				      GET_MODE (src), REGNO (src));
+  /* Both must be in GENERAL_REGS or both outside it; reject a mix.  */
+  if (dst_is_general ^ src_is_general)
+    return false;
+
+  return true;
+}
+
+/* Push onto WORK_LIST each register move that is the sole reaching definition
+   of the accumulator register used by an accumulate-style insn.  */
+
+static void
+nds32_cprop_acc_find_target_mov (auto_vec<rtx_insn *> &work_list)
+{
+  basic_block bb;
+  rtx_insn *insn;
+  rtx acc_reg;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    FOR_BB_INSNS (bb, insn)
+      if (INSN_P (insn))
+	{
+	  acc_reg = nds32_is_acc_insn_p (insn);
+	  if (acc_reg)
+	    {
+	      unsigned int acc_regno;
+	      enum machine_mode acc_mode;
+	      df_ref use;
+	      df_link *link;
+	      rtx_insn *def_insn;
+
+	      if (!single_set (insn) || !REG_P (acc_reg))
+		continue;
+
+	      acc_regno = REGNO (acc_reg);
+	      /* Don't replace in asms intentionally referencing hard regs.  */
+	      if (asm_noperands (PATTERN (insn)) >= 0
+		  && acc_regno == ORIGINAL_REGNO (acc_reg))
+		continue;
+
+	      if (dump_file)
+		fprintf (dump_file,
+			 "\n [CPROP_ACC] "
+			 "RTL_UID %d is an exchangeable ACC insn. \n",
+			 INSN_UID (insn));
+
+	      use = df_find_use (insn, acc_reg);
+	      gcc_assert (use);
+	      link = DF_REF_CHAIN (use);
+	      /* Need exactly one real def; the chain is null if none reaches.  */
+	      if (!link || link->next
+		  || DF_REF_IS_ARTIFICIAL (link->ref))
+		continue;
+
+	      acc_mode = GET_MODE (acc_reg);
+	      def_insn = DF_REF_INSN (link->ref);
+	      if (nds32_is_reg_mov_p (def_insn))
+		{
+		  rtx *loc = DF_REF_LOC (link->ref);
+		  enum machine_mode loc_mode = GET_MODE (*loc);
+
+		  /* If the move instruction can't define the whole accumulator
+		     register, the replacement is invalid.  */
+		  if (loc_mode != acc_mode)
+		    if (hard_regno_nregs[acc_regno][acc_mode]
+			> hard_regno_nregs[acc_regno][loc_mode])
+		      continue;
+
+		  if (nds32_is_target_mov_p (def_insn))
+		    work_list.safe_push (def_insn);
+		}
+	    }
+	}
+}
+
+/* Main entry point of the forward copy propagation pass for accumulate-style
+   instructions.  Returns 0 if no candidate move was found, otherwise 1.  */
+
+static int
+nds32_cprop_acc_opt (void)
+{
+  df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
+  df_note_add_problem ();
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_insn_rescan_all ();
+  df_analyze ();
+
+  auto_vec<rtx_insn *> work_list;
+
+  nds32_cprop_acc_find_target_mov (work_list);
+  if (work_list.is_empty())
+    {
+      if (dump_file)
+	fprintf (dump_file, "\n [CPROP_ACC] The work_list is empty. \n");
+      return 0;
+    }
+
+  if (dump_file)
+    {
+      int i;
+      rtx_insn *mov;
+
+      fprintf (dump_file, "\n [CPROP_ACC] The content of work_list:");
+      FOR_EACH_VEC_ELT (work_list, i, mov)
+	fprintf (dump_file, " %d", INSN_UID (mov));
+      fprintf (dump_file, "\n");
+    }
+
+  compute_bb_for_insn ();
+
+  int n_replace = nds32_do_cprop_acc (work_list);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\n [CPROP_ACC] Result: ");
+      if (n_replace == 0)
+	fprintf (dump_file, "No move can do cprop. \n");
+      else
+	fprintf (dump_file, "Do cprop for %d move. \n", n_replace);
+    }
+
+  work_list.release ();
+  return 1;
+}
+
+const pass_data pass_data_nds32_cprop_acc_opt =
+{
+  RTL_PASS,                                     /* type */
+  "cprop_acc",                                  /* name */
+  OPTGROUP_NONE,                                /* optinfo_flags */
+  TV_MACH_DEP,                                  /* tv_id */
+  0,                                            /* properties_required */
+  0,                                            /* properties_provided */
+  0,                                            /* properties_destroyed */
+  0,                                            /* todo_flags_start */
+  TODO_df_finish,				/* todo_flags_finish */
+};
+
+class pass_nds32_cprop_acc_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_cprop_acc_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_cprop_acc_opt, ctxt)
+  {}
+
+  /* opt_pass methods: gate on optimize > 0 and flag_nds32_cprop_acc.  */
+  bool gate (function *) { return optimize > 0 && flag_nds32_cprop_acc; }
+  unsigned int execute (function *) { return nds32_cprop_acc_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_cprop_acc_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_cprop_acc_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-doubleword.md b/gcc/config/nds32/nds32-doubleword.md
index 23a9f25..7c9dfb9 100644
--- a/gcc/config/nds32/nds32-doubleword.md
+++ b/gcc/config/nds32/nds32-doubleword.md
@@ -23,7 +23,8 @@
 ;; Move DImode/DFmode instructions.
 ;; -------------------------------------------------------------

-
+;; Do *NOT* try to split DI/DFmode before reload, since LRA still seems
+;; buggy for such behavior, at least as of gcc 4.8.2...
 (define_expand "movdi"
   [(set (match_operand:DI 0 "general_operand" "")
 	(match_operand:DI 1 "general_operand" ""))]
@@ -46,149 +47,100 @@


 (define_insn "move_<mode>"
-  [(set (match_operand:DIDF 0 "nonimmediate_operand" "=r, r, r, m")
-	(match_operand:DIDF 1 "general_operand"      " r, i, m, r"))]
-  ""
+  [(set (match_operand:DIDF 0 "nonimmediate_operand" "=r, r,  r, r, Da, m, f, Q, f, *r, *f")
+	(match_operand:DIDF 1 "general_operand"      " r, i, Da, m,  r, r, Q, f, f, *f, *r"))]
+  "register_operand(operands[0], <MODE>mode)
+   || register_operand(operands[1], <MODE>mode)"
 {
-  rtx addr;
-  rtx otherops[5];
-
   switch (which_alternative)
     {
     case 0:
       return "movd44\t%0, %1";
-
     case 1:
       /* reg <- const_int, we ask gcc to split instruction.  */
       return "#";
-
     case 2:
-      /* Refer to nds32_legitimate_address_p() in nds32.c,
-         we only allow "reg", "symbol_ref", "const", and "reg + const_int"
-         as address rtx for DImode/DFmode memory access.  */
-      addr = XEXP (operands[1], 0);
-
-      otherops[0] = gen_rtx_REG (SImode, REGNO (operands[0]));
-      otherops[1] = gen_rtx_REG (SImode, REGNO (operands[0]) + 1);
-      otherops[2] = addr;
-
-      if (REG_P (addr))
-	{
-	  /* (reg) <- (mem (reg)) */
-	  output_asm_insn ("lmw.bi\t%0, [%2], %1, 0", otherops);
-	}
-      else if (GET_CODE (addr) == PLUS)
-	{
-	  /* (reg) <- (mem (plus (reg) (const_int))) */
-	  rtx op0 = XEXP (addr, 0);
-	  rtx op1 = XEXP (addr, 1);
-
-	  if (REG_P (op0))
-	    {
-	      otherops[2] = op0;
-	      otherops[3] = op1;
-	      otherops[4] = gen_int_mode (INTVAL (op1) + 4, SImode);
-	    }
-	  else
-	    {
-	      otherops[2] = op1;
-	      otherops[3] = op0;
-	      otherops[4] = gen_int_mode (INTVAL (op0) + 4, SImode);
-	    }
-
-	  /* To avoid base overwrite when REGNO(%0) == REGNO(%2).  */
-	  if (REGNO (otherops[0]) != REGNO (otherops[2]))
-	    {
-	      output_asm_insn ("lwi\t%0, [%2 + (%3)]", otherops);
-	      output_asm_insn ("lwi\t%1, [%2 + (%4)]", otherops);
-	    }
-	  else
-	    {
-	      output_asm_insn ("lwi\t%1, [%2 + (%4)]", otherops);
-	      output_asm_insn ("lwi\t%0,[ %2 + (%3)]", otherops);
-	    }
-	}
-      else
-	{
-	  /* (reg) <- (mem (symbol_ref ...))
-	     (reg) <- (mem (const ...)) */
-	  output_asm_insn ("lwi.gp\t%0, [ + %2]", otherops);
-	  output_asm_insn ("lwi.gp\t%1, [ + %2 + 4]", otherops);
-	}
-
-      /* We have already used output_asm_insn() by ourself,
-         so return an empty string.  */
-      return "";
-
+      /* The memory format is (mem (reg)),
+	 we can generate 'lmw.bi' instruction.  */
+      return nds32_output_double (operands, true);
     case 3:
-      /* Refer to nds32_legitimate_address_p() in nds32.c,
-         we only allow "reg", "symbol_ref", "const", and "reg + const_int"
-         as address rtx for DImode/DFmode memory access.  */
-      addr = XEXP (operands[0], 0);
-
-      otherops[0] = gen_rtx_REG (SImode, REGNO (operands[1]));
-      otherops[1] = gen_rtx_REG (SImode, REGNO (operands[1]) + 1);
-      otherops[2] = addr;
-
-      if (REG_P (addr))
-	{
-	  /* (mem (reg)) <- (reg) */
-	  output_asm_insn ("smw.bi\t%0, [%2], %1, 0", otherops);
-	}
-      else if (GET_CODE (addr) == PLUS)
-	{
-	  /* (mem (plus (reg) (const_int))) <- (reg) */
-	  rtx op0 = XEXP (addr, 0);
-	  rtx op1 = XEXP (addr, 1);
-
-	  if (REG_P (op0))
-	    {
-	      otherops[2] = op0;
-	      otherops[3] = op1;
-	      otherops[4] = gen_int_mode (INTVAL (op1) + 4, SImode);
-	    }
-	  else
-	    {
-	      otherops[2] = op1;
-	      otherops[3] = op0;
-	      otherops[4] = gen_int_mode (INTVAL (op0) + 4, SImode);
-	    }
-
-	  /* To avoid base overwrite when REGNO(%0) == REGNO(%2).  */
-	  if (REGNO (otherops[0]) != REGNO (otherops[2]))
-	    {
-	      output_asm_insn ("swi\t%0, [%2 + (%3)]", otherops);
-	      output_asm_insn ("swi\t%1, [%2 + (%4)]", otherops);
-	    }
-	  else
-	    {
-	      output_asm_insn ("swi\t%1, [%2 + (%4)]", otherops);
-	      output_asm_insn ("swi\t%0, [%2 + (%3)]", otherops);
-	    }
-	}
-      else
-	{
-	  /* (mem (symbol_ref ...)) <- (reg)
-	     (mem (const ...))      <- (reg) */
-	  output_asm_insn ("swi.gp\t%0, [ + %2]", otherops);
-	  output_asm_insn ("swi.gp\t%1, [ + %2 + 4]", otherops);
-	}
-
-      /* We have already used output_asm_insn() by ourself,
-         so return an empty string.  */
-      return "";
-
+      /* There is no 64-bit load instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 4:
+      /* The memory format is (mem (reg)),
+	 we can generate 'smw.bi' instruction.  */
+      return nds32_output_double (operands, false);
+    case 5:
+      /* There is no 64-bit store instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 6:
+      return nds32_output_float_load (operands);
+    case 7:
+      return nds32_output_float_store (operands);
+    case 8:
+      return "fcpysd\t%0, %1, %1";
+    case 9:
+      return "fmfdr\t%0, %1";
+    case 10:
+      return "fmtdr\t%1, %0";
     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "move,move,move,move")
-   (set_attr "length" "   4,  16,   8,   8")])
+  [(set_attr "type"    "alu,alu,load,load,store,store,fload,fstore,fcpy,fmfdr,fmtdr")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "!TARGET_16_BIT")
+		     (const_int 4)
+		     (const_int 2))
+       ;; Alternative 1
+       (const_int 16)
+       ;; Alternative 2
+       (const_int 4)
+       ;; Alternative 3
+       (const_int 8)
+       ;; Alternative 4
+       (const_int 4)
+       ;; Alternative 5
+       (const_int 8)
+       ;; Alternative 6
+       (const_int 4)
+       ;; Alternative 7
+       (const_int 4)
+       ;; Alternative 8
+       (const_int 4)
+       ;; Alternative 9
+       (const_int 4)
+       ;; Alternative 10
+       (const_int 4)
+     ])
+   (set_attr "feature" " v1, v1,  v1,  v1,   v1,   v1,    fpu,    fpu,    fpu,    fpu,    fpu")])
+
+;; Split a DI/DF reg-reg move into two SImode moves when either GPR is odd.
+(define_split
+  [(set (match_operand:DIDF 0 "register_operand" "")
+	(match_operand:DIDF 1 "register_operand" ""))]
+  "(NDS32_IS_GPR_REGNUM (REGNO (operands[0]))
+    && ((REGNO (operands[0]) & 0x1) == 1))
+   || (NDS32_IS_GPR_REGNUM (REGNO (operands[1]))
+       && ((REGNO (operands[1]) & 0x1) == 1))"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+  {
+     operands[2] = gen_lowpart (SImode, operands[0]);
+     operands[4] = gen_highpart (SImode, operands[0]);
+     operands[3] = gen_lowpart (SImode, operands[1]);
+     operands[5] = gen_highpart (SImode, operands[1]);
+  }
+)

 (define_split
   [(set (match_operand:DIDF 0 "register_operand"     "")
 	(match_operand:DIDF 1 "const_double_operand" ""))]
-  "reload_completed"
+  "flag_pic || reload_completed"
   [(set (match_dup 2) (match_dup 3))
    (set (match_dup 4) (match_dup 5))]
 {
@@ -207,7 +159,12 @@
   /* Actually we would like to create move behavior by ourself.
      So that movsi expander could have chance to split large constant.  */
   emit_move_insn (operands[2], operands[3]);
-  emit_move_insn (operands[4], operands[5]);
+
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (SImode);
+  if ((UINTVAL (operands[3]) & mask) == (UINTVAL (operands[5]) & mask))
+    emit_move_insn (operands[4], operands[2]);
+  else
+    emit_move_insn (operands[4], operands[5]);
   DONE;
 })

@@ -217,7 +174,9 @@
   [(set (match_operand:DIDF 0 "register_operand" "")
 	(match_operand:DIDF 1 "register_operand" ""))]
   "reload_completed
-   && (TARGET_ISA_V2 || !TARGET_16_BIT)"
+   && (TARGET_ISA_V2 || !TARGET_16_BIT)
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[0]))
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[1]))"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))]
 {
@@ -239,6 +198,28 @@
     }
 })

+(define_split  ;; Post-reload split of a GPR <- memory doubleword move.
+  [(set (match_operand:DIDF 0 "nds32_general_register_operand" "")
+	(match_operand:DIDF 1 "memory_operand" ""))]
+  "reload_completed
+   && nds32_split_double_word_load_store_p (operands, true)"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+{
+  nds32_spilt_doubleword (operands, true);
+})
+
+(define_split  ;; Post-reload split of a memory <- GPR doubleword move.
+  [(set (match_operand:DIDF 0  "memory_operand" "")
+	(match_operand:DIDF 1  "nds32_general_register_operand" ""))]
+  "reload_completed
+   && nds32_split_double_word_load_store_p (operands, false)"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))]
+{
+  nds32_spilt_doubleword (operands, false);
+})
+
 ;; -------------------------------------------------------------
 ;; Boolean DImode instructions.
 ;; -------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-dspext.md b/gcc/config/nds32/nds32-dspext.md
new file mode 100644
index 0000000..6ec2137
--- /dev/null
+++ b/gcc/config/nds32/nds32-dspext.md
@@ -0,0 +1,5280 @@
+;; Machine description of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VQIHI 0 "general_operand" "")
+	(match_operand:VQIHI 1 "general_operand" ""))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* mem <- !reg is not handled: load the source into a register first.  */
+  if (MEM_P (operands[0]) && !REG_P (operands[1]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+
+  /* A CONST_VECTOR matching neither CVs2 nor CVhi cannot be moved by a
+     single instruction: move it as an SImode integer, then convert.  */
+  if (GET_CODE (operands[1]) == CONST_VECTOR
+      && !satisfies_constraint_CVs2 (operands[1])
+      && !satisfies_constraint_CVhi (operands[1]))
+    {
+      HOST_WIDE_INT ival = const_vector_to_hwint (operands[1]);
+      rtx tmp_rtx;
+
+      tmp_rtx = can_create_pseudo_p ()
+		? gen_reg_rtx (SImode)
+		: simplify_gen_subreg (SImode, operands[0], <MODE>mode, 0);
+
+      emit_move_insn (tmp_rtx, gen_int_mode (ival, SImode));
+      convert_move (operands[0], tmp_rtx, false);
+      DONE;
+    }
+
+  if (REG_P (operands[0]) && SYMBOLIC_CONST_P (operands[1]))
+    {
+      if (nds32_tls_referenced_p (operands [1]))
+	{
+	  nds32_expand_tls_move (operands);
+	  DONE;
+	}
+      else if (flag_pic)
+	{
+	  nds32_expand_pic_move (operands);
+	  DONE;
+	}
+    }
+})
+
+(define_insn "*mov<mode>"  ;; Needs at least one register operand (see condition).
+  [(set (match_operand:VQIHI 0 "nonimmediate_operand" "=r, r,$U45,$U33,$U37,$U45, m,$  l,$  l,$  l,$  d,  d, r,$   d,    r,    r,    r, *f, *f,  r, *f,  Q, A")
+	(match_operand:VQIHI 1 "nds32_vmove_operand"  " r, r,   l,   l,   l,   d, r, U45, U33, U37, U45,Ufe, m, CVp5, CVs5, CVs2, CVhi, *f,  r, *f,  Q, *f, r"))]
+  "NDS32_EXT_DSP_P ()
+   && (register_operand(operands[0], <MODE>mode)
+       || register_operand(operands[1], <MODE>mode))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "mov55\t%0, %1";
+    case 1:
+      return "ori\t%0, %1, 0";
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+      return nds32_output_16bit_store (operands, <byte>);
+    case 6:
+      return nds32_output_32bit_store (operands, <byte>);
+    case 7:
+    case 8:
+    case 9:
+    case 10:
+    case 11:
+      return nds32_output_16bit_load (operands, <byte>);
+    case 12:
+      return nds32_output_32bit_load (operands, <byte>);
+    case 13:
+      return "movpi45\t%0, %1";
+    case 14:
+      return "movi55\t%0, %1";
+    case 15:
+      return "movi\t%0, %1";
+    case 16:
+      return "sethi\t%0, hi20(%1)";
+    case 17:
+      if (TARGET_FPU_SINGLE)
+	return "fcpyss\t%0, %1, %1";
+      else
+	return "#";  /* Without TARGET_FPU_SINGLE, ask gcc to split.  */
+    case 18:
+      return "fmtsr\t%1, %0";
+    case 19:
+      return "fmfsr\t%0, %1";
+    case 20:
+      return nds32_output_float_load (operands);
+    case 21:
+      return nds32_output_float_store (operands);
+    case 22:
+      return "mtusr\t%1, %0";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,store,store,store,store,store,load,load,load,load,load,load,alu,alu,alu,alu,fcpy,fmtsr,fmfsr,fload,fstore,alu")
+   (set_attr "length"  "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   2,   4,  2,  2,  4,  4,   4,    4,    4,    4,     4,  4")
+   (set_attr "feature" " v1, v1,   v1,   v1,   v1,   v1,   v1,  v1,  v1,  v1,  v1, v3m,  v1, v1, v1, v1, v1, fpu,  fpu,  fpu,  fpu,   fpu, v1")])
+
+(define_expand "movv2si"
+  [(set (match_operand:V2SI 0 "general_operand" "")
+	(match_operand:V2SI 1 "general_operand" ""))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* mem <- !reg is not handled: load the source into a register first.  */
+  if (MEM_P (operands[0]) && !REG_P (operands[1]))
+    operands[1] = force_reg (V2SImode, operands[1]);
+})
+
+(define_insn "*movv2si"
+  [(set (match_operand:V2SI 0 "nonimmediate_operand" "=r, r,  r, r, Da, m, f, Q, f, r, f")
+	(match_operand:V2SI 1 "general_operand"      " r, i, Da, m,  r, r, Q, f, f, f, r"))]
+  "NDS32_EXT_DSP_P ()
+   && (register_operand(operands[0], V2SImode)
+       || register_operand(operands[1], V2SImode))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "movd44\t%0, %1";
+    case 1:
+      /* reg <- immediate: we ask gcc to split the instruction.  */
+      return "#";
+    case 2:
+      /* The memory format is (mem (reg)),
+	 so we can generate an 'lmw.bi' instruction.  */
+      return nds32_output_double (operands, true);
+    case 3:
+      /* There is no 64-bit load instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 4:
+      /* The memory format is (mem (reg)),
+	 so we can generate an 'smw.bi' instruction.  */
+      return nds32_output_double (operands, false);
+    case 5:
+      /* There is no 64-bit store instruction,
+	 so we split this pattern into two SImode patterns.  */
+      return "#";
+    case 6:
+      return nds32_output_float_load (operands);
+    case 7:
+      return nds32_output_float_store (operands);
+    case 8:
+      return "fcpysd\t%0, %1, %1";
+    case 9:
+      return "fmfdr\t%0, %1";
+    case 10:
+      return "fmtdr\t%1, %0";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load,load,store,store,unknown,unknown,unknown,unknown,unknown")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "!TARGET_16_BIT")
+		     (const_int 4)
+		     (const_int 2))
+       ;; Alternative 1
+       (const_int 16)
+       ;; Alternative 2
+       (const_int 4)
+       ;; Alternative 3
+       (const_int 8)
+       ;; Alternative 4
+       (const_int 4)
+       ;; Alternative 5
+       (const_int 8)
+       ;; Alternative 6
+       (const_int 4)
+       ;; Alternative 7
+       (const_int 4)
+       ;; Alternative 8
+       (const_int 4)
+       ;; Alternative 9
+       (const_int 4)
+       ;; Alternative 10
+       (const_int 4)
+     ])
+   (set_attr "feature" " v1, v1,  v1,  v1,   v1,   v1,    fpu,    fpu,    fpu,    fpu,    fpu")])
+
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:VQIHI 0 "general_operand" "")
+	(match_operand:VQIHI 1 "general_operand" ""))]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx addr;
+  if (MEM_P (operands[0]) && !REG_P (operands[1]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  /* Unaligned store if the destination is memory, else unaligned load.  */
+  if (MEM_P (operands[0]))
+    {
+      addr = force_reg (Pmode, XEXP (operands[0], 0));
+      emit_insn (gen_unaligned_store<mode> (addr, operands[1]));
+    }
+  else
+    {
+      addr = force_reg (Pmode, XEXP (operands[1], 0));
+      emit_insn (gen_unaligned_load<mode> (operands[0], addr));
+    }
+  DONE;
+})
+
+(define_expand "unaligned_load<mode>"  ;; operands[1] is the address register.
+  [(set (match_operand:VQIHI 0 "register_operand" "=r")
+	(unspec:VQIHI [(mem:VQIHI (match_operand:SI 1 "register_operand" "r"))] UNSPEC_UALOAD_W))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_ISA_V3M)
+    nds32_expand_unaligned_load (operands, <MODE>mode);
+  else
+    emit_insn (gen_unaligned_load_w<mode> (operands[0], gen_rtx_MEM (<MODE>mode, operands[1])));
+  DONE;
+})
+
+(define_insn "unaligned_load_w<mode>"  ;; Asm comes from nds32_output_lmw_single_word.
+  [(set (match_operand:VQIHI 0 "register_operand"                          "=  r")
+	(unspec:VQIHI [(match_operand:VQIHI 1 "nds32_lmw_smw_base_operand" " Umw")] UNSPEC_UALOAD_W))]
+  "NDS32_EXT_DSP_P ()"
+{
+  return nds32_output_lmw_single_word (operands);
+}
+  [(set_attr "type"   "load")
+   (set_attr "length"    "4")]
+)
+
+(define_expand "unaligned_store<mode>"  ;; operands[0] is the address register.
+  [(set (mem:VQIHI (match_operand:SI 0 "register_operand" "r"))
+	(unspec:VQIHI [(match_operand:VQIHI 1 "register_operand" "r")] UNSPEC_UASTORE_W))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_ISA_V3M)
+    nds32_expand_unaligned_store (operands, <MODE>mode);
+  else
+    emit_insn (gen_unaligned_store_w<mode> (gen_rtx_MEM (<MODE>mode, operands[0]), operands[1]));
+  DONE;
+})
+
+(define_insn "unaligned_store_w<mode>"  ;; Asm comes from nds32_output_smw_single_word.
+  [(set (match_operand:VQIHI 0 "nds32_lmw_smw_base_operand"      "=Umw")
+	(unspec:VQIHI [(match_operand:VQIHI 1 "register_operand" "   r")] UNSPEC_UASTORE_W))]
+  "NDS32_EXT_DSP_P ()"
+{
+  return nds32_output_smw_single_word (operands);
+}
+  [(set_attr "type"   "store")
+   (set_attr "length"     "4")]
+)
+
+(define_insn "<uk>add<mode>3"  ;; Vector add, one insn per all_plus code.
+  [(set (match_operand:VQIHI 0 "register_operand"                 "=r")
+	(all_plus:VQIHI (match_operand:VQIHI 1 "register_operand" " r")
+			(match_operand:VQIHI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<uk>add<bits> %0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "<uk>adddi3"  ;; 64-bit add, one insn per all_plus code.
+  [(set (match_operand:DI 0 "register_operand"              "=r")
+	(all_plus:DI (match_operand:DI 1 "register_operand" " r")
+		     (match_operand:DI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<uk>add64 %0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "raddv4qi3"  ;; Per element: (sext (a) + sext (b)) >> 1, arithmetic.
+  [(set (match_operand:V4QI 0 "register_operand"                  "=r")
+	(truncate:V4QI
+	  (ashiftrt:V4HI
+	    (plus:V4HI (sign_extend:V4HI (match_operand:V4QI 1 "register_operand" " r"))
+		       (sign_extend:V4HI (match_operand:V4QI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "radd8\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+
+(define_insn "uraddv4qi3"  ;; Per element: (zext (a) + zext (b)) >> 1, logical.
+  [(set (match_operand:V4QI 0 "register_operand"                  "=r")
+	(truncate:V4QI
+	  (lshiftrt:V4HI
+	    (plus:V4HI (zero_extend:V4HI (match_operand:V4QI 1 "register_operand" " r"))
+		       (zero_extend:V4HI (match_operand:V4QI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "uradd8\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "raddv2hi3"  ;; Per element: (sext (a) + sext (b)) >> 1, arithmetic.
+  [(set (match_operand:V2HI 0 "register_operand"                                  "=r")
+	(truncate:V2HI
+	  (ashiftrt:V2SI
+	    (plus:V2SI (sign_extend:V2SI (match_operand:V2HI 1 "register_operand" " r"))
+		       (sign_extend:V2SI (match_operand:V2HI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "radd16\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "uraddv2hi3"  ;; Per element: (zext (a) + zext (b)) >> 1, logical.
+  [(set (match_operand:V2HI 0 "register_operand"                                  "=r")
+	(truncate:V2HI
+	  (lshiftrt:V2SI
+	    (plus:V2SI (zero_extend:V2SI (match_operand:V2HI 1 "register_operand" " r"))
+		       (zero_extend:V2SI (match_operand:V2HI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "uradd16\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "radddi3"  ;; 64-bit: (sext (a) + sext (b)) >> 1, arithmetic.
+  [(set (match_operand:DI 0 "register_operand"            "=r")
+	(truncate:DI
+	  (ashiftrt:TI
+	    (plus:TI (sign_extend:TI (match_operand:DI 1 "register_operand" " r"))
+		     (sign_extend:TI (match_operand:DI 2 "register_operand" " r")))
+	  (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "radd64\t%0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+
+(define_insn "uradddi3"  ;; 64-bit: (zext (a) + zext (b)) >> 1, logical.
+  [(set (match_operand:DI 0 "register_operand"            "=r")
+	(truncate:DI
+	  (lshiftrt:TI
+	    (plus:TI (zero_extend:TI (match_operand:DI 1 "register_operand" " r"))
+		     (zero_extend:TI (match_operand:DI 2 "register_operand" " r")))
+	  (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "uradd64\t%0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "<uk>sub<mode>3"  ;; Vector subtract, one insn per all_minus code.
+  [(set (match_operand:VQIHI 0 "register_operand"                  "=r")
+	(all_minus:VQIHI (match_operand:VQIHI 1 "register_operand" " r")
+			 (match_operand:VQIHI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<uk>sub<bits> %0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "<uk>subdi3"  ;; 64-bit subtract, one insn per all_minus code.
+  [(set (match_operand:DI 0 "register_operand"               "=r")
+	(all_minus:DI (match_operand:DI 1 "register_operand" " r")
+		      (match_operand:DI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<uk>sub64 %0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
+(define_insn "rsubv4qi3"  ;; Per element: (sext (a) - sext (b)) >> 1, arithmetic.
+  [(set (match_operand:V4QI 0 "register_operand"                                   "=r")
+	(truncate:V4QI
+	  (ashiftrt:V4HI
+	    (minus:V4HI (sign_extend:V4HI (match_operand:V4QI 1 "register_operand" " r"))
+			(sign_extend:V4HI (match_operand:V4QI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "rsub8\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+(define_insn "ursubv4qi3"  ;; Per element: (zext (a) - zext (b)) >> 1, logical.
+  [(set (match_operand:V4QI 0 "register_operand"                                   "=r")
+	(truncate:V4QI
+	  (lshiftrt:V4HI
+	    (minus:V4HI (zero_extend:V4HI (match_operand:V4QI 1 "register_operand" " r"))
+			(zero_extend:V4HI (match_operand:V4QI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "ursub8\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+(define_insn "rsubv2hi3"  ;; Per element: (sext (a) - sext (b)) >> 1, arithmetic.
+  [(set (match_operand:V2HI 0 "register_operand"                                   "=r")
+	(truncate:V2HI
+	  (ashiftrt:V2SI
+	    (minus:V2SI (sign_extend:V2SI (match_operand:V2HI 1 "register_operand" " r"))
+			(sign_extend:V2SI (match_operand:V2HI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "rsub16\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+(define_insn "ursubv2hi3"  ;; Per element: (zext (a) - zext (b)) >> 1, logical.
+  [(set (match_operand:V2HI 0 "register_operand"                                   "=r")
+	(truncate:V2HI
+	  (lshiftrt:V2SI
+	    (minus:V2SI (zero_extend:V2SI (match_operand:V2HI 1 "register_operand" " r"))
+			(zero_extend:V2SI (match_operand:V2HI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "ursub16\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+(define_insn "rsubdi3"  ;; 64-bit: (sext (a) - sext (b)) >> 1, arithmetic.
+  [(set (match_operand:DI 0 "register_operand"                   "=r")
+	(truncate:DI
+	  (ashiftrt:TI
+	    (minus:TI (sign_extend:TI (match_operand:DI 1 "register_operand" " r"))
+		      (sign_extend:TI (match_operand:DI 2 "register_operand" " r")))
+	  (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "rsub64\t%0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")])
+
+
+(define_insn "ursubdi3" ;; Unsigned halving subtract on DI: (zext op1 - zext op2) >> 1 (logical) formed in TImode, truncated to DI; emits ursub64.
+  [(set (match_operand:DI 0 "register_operand"                   "=r")
+	(truncate:DI
+	  (lshiftrt:TI
+	    (minus:TI (zero_extend:TI (match_operand:DI 1 "register_operand" " r"))
+		      (zero_extend:TI (match_operand:DI 2 "register_operand" " r")))
+	  (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "ursub64\t%0, %1, %2"
+  [(set_attr "type"    "dalu64")
+   (set_attr "length"  "4")])
+
+(define_expand "cras16_1" ;; Expander for cras16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_cras16_1_be (operands[0], operands[1], operands[2])
+	     : gen_cras16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "cras16_1_le" ;; Little endian: elt0 = op1[0] - op2[1], elt1 = op2[0] + op1[1]; emits cras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "cras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "cras16_1_be" ;; Big endian: elt1 = op1[1] - op2[0], elt0 = op2[1] + op1[0]; emits cras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "cras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "kcras16_1" ;; Expander for kcras16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_kcras16_1_be (operands[0], operands[1], operands[2])
+	     : gen_kcras16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "kcras16_1_le" ;; Little endian, signed saturating (ss_minus/ss_plus): elt0 = op1[0] - op2[1], elt1 = op2[0] + op1[1]; emits kcras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (ss_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (ss_plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "kcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "kcras16_1_be" ;; Big endian, signed saturating (ss_minus/ss_plus): elt1 = op1[1] - op2[0], elt0 = op2[1] + op1[0]; emits kcras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (ss_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (ss_plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "kcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "ukcras16_1" ;; Expander for ukcras16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_ukcras16_1_be (operands[0], operands[1], operands[2])
+	     : gen_ukcras16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "ukcras16_1_le" ;; Little endian, unsigned saturating (us_minus/us_plus): elt0 = op1[0] - op2[1], elt1 = op2[0] + op1[1]; emits ukcras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (us_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (us_plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "ukcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "ukcras16_1_be" ;; Big endian, unsigned saturating (us_minus/us_plus): elt1 = op1[1] - op2[0], elt0 = op2[1] + op1[0]; emits ukcras16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (us_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (us_plus:HI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "ukcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "crsa16_1" ;; Expander for crsa16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_crsa16_1_be (operands[0], operands[1], operands[2])
+	     : gen_crsa16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "crsa16_1_le" ;; Little endian: elt1 = op1[1] - op2[0], elt0 = op1[0] + op2[1]; emits crsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "crsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "crsa16_1_be" ;; Big endian: elt0 = op1[0] - op2[1], elt1 = op1[1] + op2[0]; emits crsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "crsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "kcrsa16_1" ;; Expander for kcrsa16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_kcrsa16_1_be (operands[0], operands[1], operands[2])
+	     : gen_kcrsa16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "kcrsa16_1_le" ;; Little endian, signed saturating (ss_minus/ss_plus): elt1 = op1[1] - op2[0], elt0 = op1[0] + op2[1]; emits kcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (ss_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (ss_plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "kcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "kcrsa16_1_be" ;; Big endian, signed saturating (ss_minus/ss_plus): elt0 = op1[0] - op2[1], elt1 = op1[1] + op2[0]; emits kcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (ss_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (ss_plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "kcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "ukcrsa16_1" ;; Expander for ukcrsa16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_ukcrsa16_1_be (operands[0], operands[1], operands[2])
+	     : gen_ukcrsa16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "ukcrsa16_1_le" ;; Little endian, unsigned saturating (us_minus/us_plus): elt1 = op1[1] - op2[0], elt0 = op1[0] + op2[1]; emits ukcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (us_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (us_plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "ukcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "ukcrsa16_1_be" ;; Big endian, unsigned saturating (us_minus/us_plus): elt0 = op1[0] - op2[1], elt1 = op1[1] + op2[0]; emits ukcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (us_minus:HI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand" " r")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (us_plus:HI
+	      (vec_select:HI
+		(match_dup 1)
+		(parallel [(const_int 1)]))
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "ukcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "rcras16_1" ;; Expander for rcras16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_rcras16_1_be (operands[0], operands[1], operands[2])
+	     : gen_rcras16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "rcras16_1_le" ;; Little endian, signed halving: elt0 = (sext op1[0] - sext op2[1]) >> 1, elt1 = (sext op2[0] + sext op1[1]) >> 1; emits rcras16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(minus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 0)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(plus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 0)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "rcras16_1_be" ;; Big endian, signed halving: elt1 = (sext op1[1] - sext op2[0]) >> 1, elt0 = (sext op2[1] + sext op1[0]) >> 1; emits rcras16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(minus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 1)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(plus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 1)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "urcras16_1" ;; Expander for urcras16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_urcras16_1_be (operands[0], operands[1], operands[2])
+	     : gen_urcras16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "urcras16_1_le" ;; Little endian, unsigned halving: elt0 = (zext op1[0] - zext op2[1]) >> 1, elt1 = (zext op2[0] + zext op1[1]) >> 1; emits urcras16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(minus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 0)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(plus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 0)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "urcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "urcras16_1_be" ;; Big endian, unsigned halving: elt1 = (zext op1[1] - zext op2[0]) >> 1, elt0 = (zext op2[1] + zext op1[0]) >> 1; emits urcras16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(minus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 1)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(plus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 1)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "urcras16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "rcrsa16_1" ;; Expander for rcrsa16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_rcrsa16_1_be (operands[0], operands[1], operands[2])
+	     : gen_rcrsa16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "rcrsa16_1_le" ;; Little endian, signed halving: elt1 = (sext op1[1] - sext op2[0]) >> 1, elt0 = (sext op1[0] + sext op2[1]) >> 1; emits rcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+	        (minus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 1)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(plus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 0)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "rcrsa16_1_be" ;; Big endian, signed halving: elt0 = (sext op1[0] - sext op2[1]) >> 1, elt1 = (sext op1[1] + sext op2[0]) >> 1; emits rcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+	        (minus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 0)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (ashiftrt:SI
+		(plus:SI
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 1)])))
+		  (sign_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "urcrsa16_1" ;; Expander for urcrsa16 on V2HI operands 0 (dest), 1 and 2.
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* The _le and _be insns number the halfword elements differently.  */
+  emit_insn (TARGET_BIG_ENDIAN
+	     ? gen_urcrsa16_1_be (operands[0], operands[1], operands[2])
+	     : gen_urcrsa16_1_le (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "urcrsa16_1_le" ;; Little endian, unsigned halving: elt1 = (zext op1[1] - zext op2[0]) >> 1, elt0 = (zext op1[0] + zext op2[1]) >> 1; emits urcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+	        (minus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 1)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(plus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 0)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "urcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_insn "urcrsa16_1_be" ;; Big endian, unsigned halving: elt0 = (zext op1[0] - zext op2[1]) >> 1, elt1 = (zext op1[1] + zext op2[0]) >> 1; emits urcrsa16.
+  [(set (match_operand:V2HI 0 "register_operand"           "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+	        (minus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 1 "register_operand" " r")
+		      (parallel [(const_int 0)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand" " r")
+		      (parallel [(const_int 1)]))))
+		(const_int 1))))
+	  (vec_duplicate:V2HI
+	    (truncate:HI
+	      (lshiftrt:SI
+		(plus:SI
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 1)
+		      (parallel [(const_int 1)])))
+		  (zero_extend:SI
+		    (vec_select:HI
+		      (match_dup 2)
+		      (parallel [(const_int 0)]))))
+		(const_int 1))))
+	  (const_int 1)))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "urcrsa16\t%0, %1, %2"
+  [(set_attr "type" "dalu")]
+)
+
+(define_expand "<shift>v2hi3" ;; V2HI shift expander, instantiated once per code in the "shifts" iterator.
+  [(set (match_operand:V2HI 0 "register_operand"                  "")
+	(shifts:V2HI (match_operand:V2HI 1 "register_operand"     "")
+		     (match_operand:SI   2 "nds32_rimm4u_operand" "")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (operands[2] == const0_rtx) /* A zero shift count is just a copy of operand 1.  */
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+}) ;; Any other count falls through and the RTL template above is emitted.
+
+(define_insn "*ashlv2hi3" ;; V2HI left shift: slli16 for an Iu04 immediate count, sll16 for a register count.
+  [(set (match_operand:V2HI 0 "register_operand"                "=   r, r")
+	(ashift:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+		     (match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r")))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   slli16\t%0, %1, %2
+   sll16\t%0, %1, %2"
+  [(set_attr "type"   "dalu,dalu")
+   (set_attr "length" "   4,   4")])
+
+(define_insn "kslli16" ;; V2HI signed-saturating left shift (ss_ashift): kslli16 for an Iu04 immediate count, ksll16 for a register count.
+  [(set (match_operand:V2HI 0 "register_operand"                   "=   r, r")
+	(ss_ashift:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+			(match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r")))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   kslli16\t%0, %1, %2
+   ksll16\t%0, %1, %2"
+  [(set_attr "type"   "dalu,dalu")
+   (set_attr "length" "   4,   4")])
+
+(define_insn "*ashrv2hi3" ;; V2HI arithmetic right shift: srai16 for an Iu04 immediate count, sra16 for a register count.
+  [(set (match_operand:V2HI 0 "register_operand"                  "=   r, r")
+	(ashiftrt:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+		       (match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r")))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   srai16\t%0, %1, %2
+   sra16\t%0, %1, %2"
+  [(set_attr "type"   "dalu,dalu")
+   (set_attr "length" "   4,   4")])
+
+(define_insn "sra16_round" ;; V2HI arithmetic right shift wrapped in UNSPEC_ROUND; emits the ".u" forms srai16.u (immediate) / sra16.u (register).
+  [(set (match_operand:V2HI 0 "register_operand"                                "=   r, r")
+	(unspec:V2HI [(ashiftrt:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+				     (match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r"))]
+		     UNSPEC_ROUND))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   srai16.u\t%0, %1, %2
+   sra16.u\t%0, %1, %2"
+  [(set_attr "type"   "daluround,daluround")
+   (set_attr "length" "         4,       4")])
+
+(define_insn "*lshrv2hi3" ;; V2HI logical right shift: srli16 for an Iu04 immediate count, srl16 for a register count.
+  [(set (match_operand:V2HI 0 "register_operand"                  "=   r, r")
+	(lshiftrt:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+		       (match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r")))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   srli16\t%0, %1, %2
+   srl16\t%0, %1, %2"
+  [(set_attr "type"   "dalu,dalu")
+   (set_attr "length" "   4,   4")])
+
+(define_insn "srl16_round" ;; V2HI logical right shift wrapped in UNSPEC_ROUND; emits the ".u" forms srli16.u (immediate) / srl16.u (register).
+  [(set (match_operand:V2HI 0 "register_operand"                                "=   r, r")
+	(unspec:V2HI [(lshiftrt:V2HI (match_operand:V2HI 1 "register_operand"   "    r, r")
+				     (match_operand:SI 2 "nds32_rimm4u_operand" " Iu04, r"))]
+		     UNSPEC_ROUND))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   srli16.u\t%0, %1, %2
+   srl16.u\t%0, %1, %2"
+  [(set_attr "type"   "daluround,daluround")
+   (set_attr "length" "        4,        4")])
+
+(define_insn "kslra16" ;; Shift direction follows the sign of SImode op2: negative -> arithmetic right shift of op1 by -op2, otherwise left shift by op2.
+  [(set (match_operand:V2HI 0 "register_operand"                  "=r")
+	(if_then_else:V2HI
+	  (lt:SI (match_operand:SI 2 "register_operand"           " r")
+		 (const_int 0))
+	  (ashiftrt:V2HI (match_operand:V2HI 1 "register_operand" " r")
+			 (neg:SI (match_dup 2)))
+	  (ashift:V2HI (match_dup 1)
+		       (match_dup 2))))]
+  "NDS32_EXT_DSP_P ()"
+  "kslra16\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+(define_insn "kslra16_round" ;; Same shape as kslra16, but the right-shift arm is wrapped in UNSPEC_ROUND; emits kslra16.u.
+  [(set (match_operand:V2HI 0 "register_operand"                  "=r")
+	(if_then_else:V2HI
+	  (lt:SI (match_operand:SI 2 "register_operand"           " r")
+		 (const_int 0))
+	  (unspec:V2HI [(ashiftrt:V2HI (match_operand:V2HI 1 "register_operand" " r")
+				       (neg:SI (match_dup 2)))]
+		       UNSPEC_ROUND)
+	  (ashift:V2HI (match_dup 1)
+		       (match_dup 2))))]
+  "NDS32_EXT_DSP_P ()"
+  "kslra16.u\t%0, %1, %2"
+  [(set_attr "type"    "daluround")
+   (set_attr "length"  "4")])
+
+(define_insn "cmpeq<bits>" ;; Vector "equal" compare for each VQIHI mode, modelled as UNSPEC_VEC_COMPARE around eq; result goes to an SImode register.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(unspec:SI [(eq:SI (match_operand:VQIHI 1 "register_operand" " r")
+			   (match_operand:VQIHI 2 "register_operand" " r"))]
+		   UNSPEC_VEC_COMPARE))]
+  "NDS32_EXT_DSP_P ()"
+  "cmpeq<bits>\t%0, %1, %2"
+  [(set_attr "type"    "dcmp")
+   (set_attr "length"  "4")])
+
+(define_insn "scmplt<bits>" ;; Vector signed "less than" compare (lt) for each VQIHI mode, modelled as UNSPEC_VEC_COMPARE; SImode result.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(unspec:SI [(lt:SI (match_operand:VQIHI 1 "register_operand" " r")
+			   (match_operand:VQIHI 2 "register_operand" " r"))]
+		   UNSPEC_VEC_COMPARE))]
+  "NDS32_EXT_DSP_P ()"
+  "scmplt<bits>\t%0, %1, %2"
+  [(set_attr "type"    "dcmp")
+   (set_attr "length"  "4")])
+
+(define_insn "scmple<bits>" ;; Vector signed "less or equal" compare (le) for each VQIHI mode, modelled as UNSPEC_VEC_COMPARE; SImode result.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(unspec:SI [(le:SI (match_operand:VQIHI 1 "register_operand" " r")
+			   (match_operand:VQIHI 2 "register_operand" " r"))]
+		   UNSPEC_VEC_COMPARE))]
+  "NDS32_EXT_DSP_P ()"
+  "scmple<bits>\t%0, %1, %2"
+  [(set_attr "type"    "dcmp")
+   (set_attr "length"  "4")])
+
+(define_insn "ucmplt<bits>" ;; Vector unsigned "less than" compare (ltu) for each VQIHI mode, modelled as UNSPEC_VEC_COMPARE; SImode result.
+  [(set (match_operand:SI 0 "register_operand"                        "=r")
+	(unspec:SI [(ltu:SI (match_operand:VQIHI 1 "register_operand" " r")
+			    (match_operand:VQIHI 2 "register_operand" " r"))]
+		   UNSPEC_VEC_COMPARE))]
+  "NDS32_EXT_DSP_P ()"
+  "ucmplt<bits>\t%0, %1, %2"
+  [(set_attr "type"    "dcmp")
+   (set_attr "length"  "4")])
+
+(define_insn "ucmple<bits>" ;; Vector unsigned "less or equal" compare (leu) for each VQIHI mode, modelled as UNSPEC_VEC_COMPARE; SImode result.
+  [(set (match_operand:SI 0 "register_operand"                        "=r")
+	(unspec:SI [(leu:SI (match_operand:VQIHI 1 "register_operand" " r")
+			    (match_operand:VQIHI 2 "register_operand" " r"))]
+		   UNSPEC_VEC_COMPARE))]
+  "NDS32_EXT_DSP_P ()"
+  "ucmple<bits>\t%0, %1, %2"
+  [(set_attr "type"    "dcmp")
+   (set_attr "length"  "4")])
+
+(define_insn "sclip16" ;; sclip16 on a V2HI register with an Iu04 immediate; opaque to the optimizers (UNSPEC_CLIPS).
+  [(set (match_operand:V2HI 0 "register_operand"                "=   r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand"  "    r")
+		      (match_operand:SI 2 "nds32_imm4u_operand" " Iu04")]
+		     UNSPEC_CLIPS))]
+  "NDS32_EXT_DSP_P ()"
+  "sclip16\t%0, %1, %2"
+  [(set_attr "type"    "dclip")
+   (set_attr "length"  "4")])
+
+(define_insn "uclip16" ;; uclip16 on a V2HI register with an Iu04 immediate; opaque to the optimizers (UNSPEC_CLIP).
+  [(set (match_operand:V2HI 0 "register_operand"                "=   r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand"  "    r")
+		      (match_operand:SI 2 "nds32_imm4u_operand" " Iu04")]
+		     UNSPEC_CLIP))]
+  "NDS32_EXT_DSP_P ()"
+  "uclip16\t%0, %1, %2"
+  [(set_attr "type"    "dclip")
+   (set_attr "length"  "4")])
+
+(define_insn "khm16" ;; khm16 on two V2HI registers; represented only as UNSPEC_KHM and scheduled as type "dmul".
+  [(set (match_operand:V2HI 0 "register_operand"                "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand"  " r")
+		      (match_operand:V2HI 2 "register_operand" "  r")]
+		     UNSPEC_KHM))]
+  "NDS32_EXT_DSP_P ()"
+  "khm16\t%0, %1, %2"
+  [(set_attr "type"    "dmul")
+   (set_attr "length"  "4")])
+
+(define_insn "khmx16" ;; khmx16 on two V2HI registers; represented only as UNSPEC_KHMX and scheduled as type "dmul".
+  [(set (match_operand:V2HI 0 "register_operand"                "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand"  " r")
+		      (match_operand:V2HI 2 "register_operand" "  r")]
+		     UNSPEC_KHMX))]
+  "NDS32_EXT_DSP_P ()"
+  "khmx16\t%0, %1, %2"
+  [(set_attr "type"    "dmul")
+   (set_attr "length"  "4")])
+
+(define_expand "vec_setv4qi" ;; Store QImode operand 1 into element (operand 2) of V4QI operand 0.
+  [(match_operand:V4QI 0 "register_operand" "")
+   (match_operand:QI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  HOST_WIDE_INT pos = INTVAL (operands[2]);
+  if (pos < 0 || pos > 3) /* V4QI only has elements 0..3.  */
+    gcc_unreachable ();
+  HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << pos; /* One-hot vec_merge mask: 1, 2, 4 or 8.  */
+  emit_insn (gen_vec_setv4qi_internal (operands[0], operands[1],
+				       operands[0], GEN_INT (elem)));
+  DONE;
+})
+
+(define_expand "insb" ;; op0 = V4QI op1 with the byte selected by constant op3 (0..3) replaced from SImode register op2.
+  [(match_operand:V4QI 0 "register_operand" "")
+   (match_operand:V4QI 1 "register_operand" "")
+   (match_operand:SI 2 "register_operand" "")
+   (match_operand:SI 3 "const_int_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  HOST_WIDE_INT byte_pos = INTVAL (operands[3]);
+  if (byte_pos < 0 || byte_pos > 3)
+    gcc_unreachable ();
+
+  /* Narrow the SImode source register to the QImode value that
+     gets inserted (convert_move with unsignedp == false).  */
+  rtx byte_val = gen_reg_rtx (QImode);
+  convert_move (byte_val, operands[2], false);
+
+  /* Big endian needs the index reversed.  */
+  HOST_WIDE_INT lane = TARGET_BIG_ENDIAN ? 4 - byte_pos - 1 : byte_pos;
+  rtx lane_mask = gen_int_mode (1 << lane, SImode);
+
+  /* operands[1] supplies the bytes that are left untouched.  */
+  emit_insn (gen_vec_setv4qi_internal (operands[0], byte_val,
+				       operands[1], lane_mask));
+  DONE;
+})
+
+(define_expand "insvsi" ;; Bit-field insert expander for SImode; only 8-bit wide fields are accepted.
+  [(set (zero_extract:SI (match_operand:SI 0 "register_operand" "")
+			 (match_operand:SI 1 "const_int_operand" "")
+			 (match_operand:SI 2 "nds32_insv_operand" ""))
+	(match_operand:SI 3 "register_operand" ""))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (INTVAL (operands[1]) != 8) /* operands[1] is the field width.  */
+    FAIL;
+}
+  [(set_attr "type"    "dinsb")
+   (set_attr "length"  "4")])
+
+
+(define_insn "insvsi_internal" ;; Write register op2 into the 8-bit field of op0 ("+r": read and written) at constant position op1; emits insb.
+  [(set (zero_extract:SI (match_operand:SI 0 "register_operand"   "+r")
+			 (const_int 8)
+			 (match_operand:SI 1 "nds32_insv_operand"  "i"))
+	(match_operand:SI 2                  "register_operand"    "r"))]
+  "NDS32_EXT_DSP_P ()"
+  "insb\t%0, %2, %v1" ;; NOTE(review): %v is a target operand modifier defined outside this view; presumably it prints op1 as a byte index -- confirm.
+  [(set_attr "type"    "dinsb")
+   (set_attr "length"  "4")])
+
+(define_insn "insvsiqi_internal" ;; Like insvsi_internal, but the inserted value is a zero-extended QImode register (op2); emits insb.
+  [(set (zero_extract:SI (match_operand:SI 0 "register_operand"   "+r")
+			 (const_int 8)
+			 (match_operand:SI 1 "nds32_insv_operand"  "i"))
+	(zero_extend:SI (match_operand:QI 2 "register_operand"    "r")))]
+  "NDS32_EXT_DSP_P ()"
+  "insb\t%0, %2, %v1"
+  [(set_attr "type"    "dinsb")
+   (set_attr "length"  "4")])
+
+;; Intermediate pattern used to synthesize insvsiqi_internal:
+;; v0 = ((v1 & 0xff) << 8)
+(define_insn_and_split "and0xff_s8" ;; Matches (op1 << 8) & 0xff00; never output as-is ("#"), always split before reload.
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(and:SI (ashift:SI (match_operand:SI 1 "register_operand" "r")
+			   (const_int 8))
+		(const_int 65280)))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (SImode);
+  emit_insn (gen_ashlsi3 (tmp, operands[1], gen_int_mode (8, SImode)));
+  emit_insn (gen_andsi3 (operands[0], tmp, gen_int_mode (0xffff, SImode))); /* Low 8 bits are zero after the shift, so 0xffff keeps the same bits as 0xff00.  */
+  DONE;
+})
+
+;; v0 = (v1 & 0xff00ffff) | ((v2 << 16) & 0xff0000)
+(define_insn_and_split "insbsi2" ;; Matches a mask-and-or that replaces bits 16..23 of op1 with the low byte of op2; split before reload into a field insert.
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand" "0")
+			(const_int -16711681))
+		(and:SI (ashift:SI (match_operand:SI 2 "register_operand" "r")
+				   (const_int 16))
+			(const_int 16711680))))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (SImode);
+  emit_move_insn (tmp, operands[1]); /* Work on a copy of op1.  */
+  emit_insn (gen_insvsi_internal (tmp, gen_int_mode(16, SImode), operands[2])); /* 8-bit field at position 16.  */
+  emit_move_insn (operands[0], tmp);
+  DONE;
+})
+
+;; v0 = (v1 & 0xff00ffff) | v2
+(define_insn_and_split "ior_and0xff00ffff_reg" ;; Split before reload into an AND of op1 with 0xff00ffff followed by an IOR with op2.
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand" "r")
+			(const_int -16711681))
+		(match_operand:SI 2 "register_operand" "r")))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (SImode);
+  emit_insn (gen_andsi3 (tmp, operands[1], gen_int_mode (0xff00ffff, SImode))); /* Same constant as -16711681 in the pattern.  */
+  emit_insn (gen_iorsi3 (operands[0], tmp, operands[2]));
+  DONE;
+})
+
+(define_insn "vec_setv4qi_internal" ;; Replace one byte of V4QI op2 (tied to op0) with QImode op1; op3 is the one-hot element mask.
+  [(set (match_operand:V4QI 0 "register_operand"          "=   r,    r,    r,    r")
+	(vec_merge:V4QI
+	  (vec_duplicate:V4QI
+	    (match_operand:QI 1 "register_operand"        "    r,    r,    r,    r"))
+	  (match_operand:V4QI 2 "register_operand"        "    0,    0,    0,    0")
+	  (match_operand:SI 3 "nds32_imm_1_2_4_8_operand" " Iv01, Iv02, Iv04, Iv08")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* One template per constraint alternative (Iv01, Iv02, Iv04, Iv08
+     in that order).  The byte index handed to insb counts up with
+     the alternative on little endian and down from 3 on big
+     endian.  */
+  static const char *const le_pats[] = { "insb\t%0, %1, 0",
+					 "insb\t%0, %1, 1",
+					 "insb\t%0, %1, 2",
+					 "insb\t%0, %1, 3" };
+  static const char *const be_pats[] = { "insb\t%0, %1, 3",
+					 "insb\t%0, %1, 2",
+					 "insb\t%0, %1, 1",
+					 "insb\t%0, %1, 0" };
+
+  if (!TARGET_BIG_ENDIAN)
+    return le_pats[which_alternative];
+  return be_pats[which_alternative];
+}
+  [(set_attr "type"    "dinsb")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_setv4qi_internal_vec"  ;; as vec_setv4qi_internal, but the byte is lane 0 of a V4QI
+  [(set (match_operand:V4QI 0 "register_operand"          "=   r,    r,    r,    r")
+	(vec_merge:V4QI
+	  (vec_duplicate:V4QI
+	    (vec_select:QI
+	      (match_operand:V4QI 1 "register_operand"    "    r,    r,    r,    r")
+	      (parallel [(const_int 0)])))
+	  (match_operand:V4QI 2 "register_operand"        "    0,    0,    0,    0")
+	  (match_operand:SI 3 "nds32_imm_1_2_4_8_operand" " Iv01, Iv02, Iv04, Iv08")))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"  ;; little-endian only
+  "@
+   insb\t%0, %1, 0
+   insb\t%0, %1, 1
+   insb\t%0, %1, 2
+   insb\t%0, %1, 3"
+  [(set_attr "type"    "dinsb")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_mergev4qi_and_cv0_1"  ;; lane 0 of operand 1, all other lanes zero
+  [(set (match_operand:V4QI 0 "register_operand"       "=$l,r")
+	(vec_merge:V4QI
+	  (vec_duplicate:V4QI
+	    (vec_select:QI
+	      (match_operand:V4QI 1 "register_operand" "  l,r")
+	      (parallel [(const_int 0)])))
+	  (const_vector:V4QI [
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)])
+	  (const_int 1)))]  ;; mask 1: only lane 0 comes from the duplicated byte
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeb33\t%0, %1
+   zeb\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergev4qi_and_cv0_2"  ;; operand-swapped form: lane 0 of operand 1, other lanes zero
+  [(set (match_operand:V4QI 0 "register_operand"       "=$l,r")
+	(vec_merge:V4QI
+	  (const_vector:V4QI [
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)])
+	  (vec_duplicate:V4QI
+	    (vec_select:QI
+	      (match_operand:V4QI 1 "register_operand" "  l,r")
+	      (parallel [(const_int 0)])))
+	  (const_int 14)))]  ;; mask 0b1110: lanes 1..3 from the zero vector, lane 0 from the byte
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeb33\t%0, %1
+   zeb\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergeqi_and_cv0_1"  ;; QI register in lane 0, all other lanes zero
+  [(set (match_operand:V4QI 0 "register_operand"                     "=$l,r")
+	(vec_merge:V4QI
+	  (vec_duplicate:V4QI (match_operand:QI 1 "register_operand" "  l,r"))
+	  (const_vector:V4QI [
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)])
+	  (const_int 1)))]  ;; mask 1: only lane 0 comes from the duplicated byte
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeb33\t%0, %1
+   zeb\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergeqi_and_cv0_2"  ;; operand-swapped form: QI register in lane 0, other lanes zero
+  [(set (match_operand:V4QI 0 "register_operand"                     "=$l,r")
+	(vec_merge:V4QI
+	  (const_vector:V4QI [
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)
+	    (const_int 0)])
+	  (vec_duplicate:V4QI (match_operand:QI 1 "register_operand" "  l,r"))
+	  (const_int 14)))]  ;; mask 0b1110: lanes 1..3 from the zero vector, lane 0 from the byte
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeb33\t%0, %1
+   zeb\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_expand "vec_setv2hi"  ;; set lane operand 2 of V2HI operand 0 to HI operand 1
+  [(match_operand:V2HI 0 "register_operand" "")
+   (match_operand:HI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  HOST_WIDE_INT pos = INTVAL (operands[2]);
+  if (pos < 0 || pos > 1)  /* V2HI has exactly two lanes: 0 and 1 */
+    gcc_unreachable ();
+  HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << pos;  /* lane index -> vec_merge mask 1 or 2 */
+  emit_insn (gen_vec_setv2hi_internal (operands[0], operands[1],
+				       operands[0], GEN_INT (elem)));
+  DONE;
+})
+
+(define_insn "vec_setv2hi_internal"  ;; replace one halfword lane of a V2HI with an HI register
+  [(set (match_operand:V2HI 0 "register_operand"      "=   r,    r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (match_operand:HI 1 "register_operand"    "    r,    r"))
+	  (match_operand:V2HI 2 "register_operand"    "    r,    r")
+	  (match_operand:SI 3 "nds32_imm_1_2_operand" " Iv01, Iv02")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Alternative 0 is merge mask 1 and alternative 1 is merge mask 2.
+     The pack instruction and its source order for each alternative
+     depend on endianness.  */
+  static const char *const pack_be[] = { "pkbb16\t%0, %1, %2",
+					 "pktb16\t%0, %2, %1" };
+  static const char *const pack_le[] = { "pktb16\t%0, %2, %1",
+					 "pkbb16\t%0, %1, %2" };
+
+  if (TARGET_BIG_ENDIAN)
+    return pack_be[which_alternative];
+
+  return pack_le[which_alternative];
+}
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_mergev2hi_and_cv0_1"  ;; lane 0 of operand 1, lane 1 zero
+  [(set (match_operand:V2HI 0 "register_operand"       "=$l,r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 1 "register_operand" "  l,r")
+	      (parallel [(const_int 0)])))
+	  (const_vector:V2HI [
+	    (const_int 0)
+	    (const_int 0)])
+	  (const_int 1)))]  ;; mask 1: lane 0 from the halfword, lane 1 from the zero vector
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeh33\t%0, %1
+   zeh\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergev2hi_and_cv0_2"  ;; operand-swapped form of vec_mergev2hi_and_cv0_1
+  [(set (match_operand:V2HI 0 "register_operand"       "=$l,r")
+	(vec_merge:V2HI
+	  (const_vector:V2HI [
+	    (const_int 0)
+	    (const_int 0)])
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 1 "register_operand" "  l,r")
+	      (parallel [(const_int 0)])))
+	  (const_int 2)))]  ;; mask 2: lane 1 from the zero vector, lane 0 from the halfword
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeh33\t%0, %1
+   zeh\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergehi_and_cv0_1"  ;; HI register in lane 0, lane 1 zero
+  [(set (match_operand:V2HI 0 "register_operand"                     "=$l,r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI (match_operand:HI 1 "register_operand" "  l,r"))
+	  (const_vector:V2HI [
+	    (const_int 0)
+	    (const_int 0)])
+	  (const_int 1)))]  ;; mask 1: lane 0 from the halfword, lane 1 from the zero vector
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeh33\t%0, %1
+   zeh\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_mergehi_and_cv0_2"  ;; operand-swapped form of vec_mergehi_and_cv0_1
+  [(set (match_operand:V2HI 0 "register_operand"                     "=$l,r")
+	(vec_merge:V2HI
+	  (const_vector:V2HI [
+	    (const_int 0)
+	    (const_int 0)])
+	  (vec_duplicate:V2HI (match_operand:HI 1 "register_operand" "  l,r"))
+	  (const_int 2)))]  ;; mask 2: lane 1 from the zero vector, lane 0 from the halfword
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   zeh33\t%0, %1
+   zeh\t%0, %1"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_expand "pkbb"
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Emit vec_mergevv with the merge mask and the two lane selectors
+     chosen by endianness:
+       big-endian:    mask 1, operand 1 lane 1, operand 2 lane 1
+       little-endian: mask 2, operand 1 lane 0, operand 2 lane 0
+     This text sits in a patch hunk, so the line count is kept fixed.  */
+  rtx merge_mask = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 2);
+  rtx lane_of_op1 = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  rtx lane_of_op2 = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  emit_insn (gen_vec_mergevv (operands[0], operands[1], operands[2],
+			      merge_mask, lane_of_op1, lane_of_op2));
+  DONE;
+})
+
+(define_insn "pkbbsi_1"  ;; v0 = (v1 & 0xffff) | (v2 << 16)
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand" "r")
+			(const_int 65535))
+		(ashift:SI (match_operand:SI 2 "register_operand" "r")
+			   (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+  "pkbb16\t%0, %2, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pkbbsi_2"  ;; v0 = (v2 << 16) | (v1 & 0xffff), the IOR operands of pkbbsi_1 swapped
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI	(ashift:SI (match_operand:SI 2 "register_operand" "r")
+			   (const_int 16))
+		(and:SI (match_operand:SI 1 "register_operand" "r")
+			(const_int 65535))))]
+  "NDS32_EXT_DSP_P ()"
+  "pkbb16\t%0, %2, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pkbbsi_3"  ;; v0 = zero_extend (v1:HI) | (v2 << 16)
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (zero_extend:SI	(match_operand:HI 1 "register_operand" "r"))
+		(ashift:SI (match_operand:SI 2 "register_operand" "r")
+			   (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+  "pkbb16\t%0, %2, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pkbbsi_4"  ;; v0 = (v2 << 16) | zero_extend (v1:HI), the IOR operands of pkbbsi_3 swapped
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI	(ashift:SI (match_operand:SI 2 "register_operand" "r")
+			   (const_int 16))
+		(zero_extend:SI (match_operand:HI 1 "register_operand" "r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "pkbb16\t%0, %2, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+;; v0 = (v1 & 0xffff0000) | (v2 & 0xffff)
+(define_insn "pktbsi_1"  ;; v0 = (v1 & 0xffff0000) | zero_extend (v2:HI)
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand" "r")
+			(const_int -65536))  ;; 0xffff0000
+		(zero_extend:SI (match_operand:HI 2 "register_operand" "r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "pktb16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pktbsi_2"  ;; v0 = (v1 & 0xffff0000) | (v2 & 0xffff)
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand" "r")
+			(const_int -65536))  ;; 0xffff0000
+		(and:SI (match_operand:SI 2 "register_operand" "r")
+			(const_int 65535))))]
+  "NDS32_EXT_DSP_P ()"
+  "pktb16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")  ;; same type as the other pktb16 patterns
+   (set_attr "length"  "4")])
+
+(define_insn "pktbsi_3"  ;; overwrite bits 0..15 of operand 0 with operand 1
+  [(set (zero_extract:SI (match_operand:SI 0 "register_operand" "+r")
+			 (const_int 16 )  ;; field width
+			 (const_int 0))  ;; field position
+	(match_operand:SI 1 "register_operand"                  " r"))]
+  "NDS32_EXT_DSP_P ()"
+  "pktb16\t%0, %0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pktbsi_4"  ;; as pktbsi_3, with a zero-extended HI source
+  [(set (zero_extract:SI (match_operand:SI 0 "register_operand" "+r")
+			 (const_int 16 )  ;; field width
+			 (const_int 0))  ;; field position
+	(zero_extend:SI (match_operand:HI 1 "register_operand"  " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "pktb16\t%0, %0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "pkttsi"  ;; v0 = (v1 & 0xffff0000) | (v2 >> 16), logical shift
+  [(set (match_operand:SI 0 "register_operand"                      "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "register_operand"      " r")
+			(const_int -65536))  ;; 0xffff0000
+		(lshiftrt:SI (match_operand:SI 2 "register_operand" " r")
+			     (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+  "pktt16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "pkbt"
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Emit vec_mergevv with the merge mask and the two lane selectors
+     chosen by endianness:
+       big-endian:    mask 1, operand 1 lane 1, operand 2 lane 0
+       little-endian: mask 2, operand 1 lane 0, operand 2 lane 1
+     This text sits in a patch hunk, so the line count is kept fixed.  */
+  rtx merge_mask = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 2);
+  rtx lane_of_op1 = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  rtx lane_of_op2 = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+  emit_insn (gen_vec_mergevv (operands[0], operands[1], operands[2],
+			      merge_mask, lane_of_op1, lane_of_op2));
+  DONE;
+})
+
+(define_expand "pktt"
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Emit vec_mergevv with the merge mask and the two lane selectors
+     chosen by endianness:
+       big-endian:    mask 1, operand 1 lane 0, operand 2 lane 0
+       little-endian: mask 2, operand 1 lane 1, operand 2 lane 1
+     This text sits in a patch hunk, so the line count is kept fixed.  */
+  rtx merge_mask = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 2);
+  rtx lane_of_op1 = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+  rtx lane_of_op2 = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+  emit_insn (gen_vec_mergevv (operands[0], operands[1], operands[2],
+			      merge_mask, lane_of_op1, lane_of_op2));
+  DONE;
+})
+
+(define_expand "pktb"
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Emit vec_mergevv with the merge mask and the two lane selectors
+     chosen by endianness:
+       big-endian:    mask 1, operand 1 lane 0, operand 2 lane 1
+       little-endian: mask 2, operand 1 lane 1, operand 2 lane 0
+     This text sits in a patch hunk, so the line count is kept fixed.  */
+  rtx merge_mask = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 2);
+  rtx lane_of_op1 = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+  rtx lane_of_op2 = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  emit_insn (gen_vec_mergevv (operands[0], operands[1], operands[2],
+			      merge_mask, lane_of_op1, lane_of_op2));
+  DONE;
+})
+
+(define_insn "vec_mergerr"  ;; build a V2HI from two HI registers
+  [(set (match_operand:V2HI 0 "register_operand"      "=   r,    r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (match_operand:HI 1 "register_operand"    "    r,    r"))
+	  (vec_duplicate:V2HI
+	    (match_operand:HI 2 "register_operand"    "    r,    r"))
+	  (match_operand:SI 3 "nds32_imm_1_2_operand" " Iv01, Iv02")))]  ;; merge mask selects the alternative
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   pkbb16\t%0, %2, %1
+   pkbb16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+
+(define_insn "vec_merge"  ;; take one halfword lane from each of two V2HI registers
+  [(set (match_operand:V2HI 0 "register_operand"      "=   r,    r")
+	(vec_merge:V2HI
+	  (match_operand:V2HI 1 "register_operand"    "    r,    r")
+	  (match_operand:V2HI 2 "register_operand"    "    r,    r")
+	  (match_operand:SI 3 "nds32_imm_1_2_operand" " Iv01, Iv02")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Alternative 0 is merge mask 1 and alternative 1 is merge mask 2.
+     Both use pktb16; only the source register order differs, and it
+     is mirrored between big- and little-endian.  */
+  static const char *const pktb_be[] = { "pktb16\t%0, %1, %2",
+					 "pktb16\t%0, %2, %1" };
+  static const char *const pktb_le[] = { "pktb16\t%0, %2, %1",
+					 "pktb16\t%0, %1, %2" };
+
+  if (TARGET_BIG_ENDIAN)
+    return pktb_be[which_alternative];
+
+  return pktb_le[which_alternative];
+}
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_mergerv"  ;; merge an HI register with a selected lane of a V2HI
+  [(set (match_operand:V2HI 0 "register_operand"                     "=   r,    r,    r,    r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (match_operand:HI 1 "register_operand"                   "    r,    r,    r,    r"))
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 2 "register_operand"               "    r,    r,    r,    r")
+	      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iv00, Iv01, Iv00, Iv01")])))
+	  (match_operand:SI 3 "nds32_imm_1_2_operand"                " Iv01, Iv01, Iv02, Iv02")))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"  ;; alternatives: (lane, mask) = (0,1) (1,1) (0,2) (1,2)
+  "@
+   pkbb16\t%0, %2, %1
+   pktb16\t%0, %2, %1
+   pkbb16\t%0, %1, %2
+   pkbt16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_mergevr"  ;; merge a selected lane of a V2HI with an HI register
+  [(set (match_operand:V2HI 0 "register_operand"                      "=   r,    r,    r,    r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 1 "register_operand"                "    r,    r,    r,    r")
+	       (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iv00, Iv01, Iv00, Iv01")])))
+	  (vec_duplicate:V2HI
+	    (match_operand:HI 2 "register_operand"                    "    r,    r,    r,    r"))
+	  (match_operand:SI 3 "nds32_imm_1_2_operand"                 " Iv01, Iv01, Iv02, Iv02")))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"  ;; alternatives: (lane, mask) = (0,1) (1,1) (0,2) (1,2)
+  "@
+   pkbb16\t%0, %2, %1
+   pkbt16\t%0, %2, %1
+   pkbb16\t%0, %1, %2
+   pktb16\t%0, %1, %2"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_mergevv"  ;; merge a selected lane of operand 1 with a selected lane of operand 2
+  [(set (match_operand:V2HI 0 "register_operand"                     "=   r,    r,    r,    r,    r,    r,    r,    r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 1 "register_operand"               "    r,    r,    r,    r,    r,    r,    r,    r")
+	      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iv00, Iv00, Iv01, Iv01, Iv00, Iv00, Iv01, Iv01")])))
+	  (vec_duplicate:V2HI
+	    (vec_select:HI
+	      (match_operand:V2HI 2 "register_operand"               "    r,    r,    r,    r,    r,    r,    r,    r")
+	      (parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iv00, Iv01, Iv01, Iv00, Iv00, Iv01, Iv01, Iv00")])))
+	  (match_operand:SI 3 "nds32_imm_1_2_operand"                " Iv01, Iv01, Iv01, Iv01, Iv02, Iv02, Iv02, Iv02")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)  /* one template per constraint alternative, in the order listed above */
+    {
+      const char *pats[] = { "pktt16\t%0, %1, %2",  /* alternatives 0..3: merge mask 1 */
+			     "pktb16\t%0, %1, %2",
+			     "pkbb16\t%0, %1, %2",
+			     "pkbt16\t%0, %1, %2",
+			     "pktt16\t%0, %2, %1",  /* alternatives 4..7: merge mask 2 */
+			     "pkbt16\t%0, %2, %1",
+			     "pkbb16\t%0, %2, %1",
+			     "pktb16\t%0, %2, %1" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "pkbb16\t%0, %2, %1",  /* alternatives 0..3: merge mask 1 */
+			     "pktb16\t%0, %2, %1",
+			     "pktt16\t%0, %2, %1",
+			     "pkbt16\t%0, %2, %1",
+			     "pkbb16\t%0, %1, %2",  /* alternatives 4..7: merge mask 2 */
+			     "pkbt16\t%0, %1, %2",
+			     "pktt16\t%0, %1, %2",
+			     "pktb16\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "vec_extractv4qi"  ;; extract byte lane operand 2 from V4QI operand 1
+  [(set (match_operand:QI 0 "register_operand" "")
+	(vec_select:QI
+	  (match_operand:V4QI 1          "nonimmediate_operand" "")
+	  (parallel [(match_operand:SI 2 "const_int_operand" "")])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  if (INTVAL (operands[2]) != 0  /* V4QI has lanes 0..3 only */
+      && INTVAL (operands[2]) != 1
+      && INTVAL (operands[2]) != 2
+      && INTVAL (operands[2]) != 3)
+    gcc_unreachable ();
+
+  if (INTVAL (operands[2]) != 0 && MEM_P (operands[1]))  /* only the lane-0 insn accepts a memory source */
+    FAIL;
+})
+
+(define_insn "vec_extractv4qi0"  ;; lane 0 of a V4QI register or memory operand into a QI register
+  [(set (match_operand:QI 0 "register_operand"         "=l,r,r")
+	(vec_select:QI
+	  (match_operand:V4QI 1 "nonimmediate_operand" " l,r,m")
+	  (parallel [(const_int 0)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "zeb33\t%0, %1";
+    case 1:  /* any general registers */
+      return "zeb\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load (operands, 1);
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load")  ;; per alternative, as in vec_extractv2hi0
+   (set_attr "length"  "  2,  4,   4")])  ;; zeb33 carries length 2 in the other patterns of this file
+
+(define_insn "vec_extractv4qi0_ze"  ;; lane 0 of a V4QI, zero-extended to SI
+  [(set (match_operand:SI 0 "register_operand"         "=l,r,r")
+	(zero_extend:SI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "nonimmediate_operand" " l,r,m")
+	    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "zeb33\t%0, %1";
+    case 1:  /* any general registers */
+      return "zeb\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load (operands, 1);
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load")  ;; per alternative, as in vec_extractv2hi0_ze
+   (set_attr "length"  "  2,  4,   4")])  ;; zeb33 carries length 2 in the other patterns of this file
+
+(define_insn "vec_extractv4qi0_se"  ;; lane 0 of a V4QI, sign-extended to SI
+  [(set (match_operand:SI 0 "register_operand"         "=l,r,r")
+	(sign_extend:SI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "nonimmediate_operand" " l,r,m")
+	    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "seb33\t%0, %1";
+    case 1:  /* any general registers */
+      return "seb\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load_se (operands, 1);
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load")  ;; per alternative, as in vec_extractv2hi0_se
+   (set_attr "length"  "  2,  4,   4")])  ;; NOTE(review): assumes seb33 is 16-bit like zeb33 - confirm
+
+(define_insn_and_split "vec_extractv4qi1"  ;; lane 1: rotate it into lane 0, then extract lane 0
+  [(set (match_operand:QI 0 "register_operand" "=r")
+	(vec_select:QI
+	  (match_operand:V4QI 1 "register_operand" " r")
+	  (parallel [(const_int 1)])))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (V4QImode);  /* rotated copy of operand 1 */
+  emit_insn (gen_rotrv4qi_1 (tmp, operands[1]));
+  emit_insn (gen_vec_extractv4qi0 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "vec_extractv4qi2"  ;; lane 2: rotate it into lane 0, then extract lane 0
+  [(set (match_operand:QI 0 "register_operand" "=r")
+	(vec_select:QI
+	  (match_operand:V4QI 1 "register_operand" " r")
+	  (parallel [(const_int 2)])))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (V4QImode);  /* rotated copy of operand 1 */
+  emit_insn (gen_rotrv4qi_2 (tmp, operands[1]));
+  emit_insn (gen_vec_extractv4qi0 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "vec_extractv4qi3"  ;; lane 3: rotate it into lane 0, then extract lane 0
+  [(set (match_operand:QI 0 "register_operand" "=r")
+	(vec_select:QI
+	  (match_operand:V4QI 1 "register_operand" " r")
+	  (parallel [(const_int 3)])))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (V4QImode);  /* rotated copy of operand 1 */
+  emit_insn (gen_rotrv4qi_3 (tmp, operands[1]));
+  emit_insn (gen_vec_extractv4qi0 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "vec_extractv4qi3_se"  ;; lane 3 sign-extended to SI: arithmetic shift right by 24
+  [(set (match_operand:SI 0 "register_operand"       "=$d,r")
+	(sign_extend:SI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" "  0,r")
+	    (parallel [(const_int 3)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   srai45\t%0, 24
+   srai\t%0, %1, 24"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_extractv4qi3_ze"  ;; lane 3 zero-extended to SI: logical shift right by 24
+  [(set (match_operand:SI 0 "register_operand"       "=$d,r")
+	(zero_extend:SI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" "  0,r")
+	    (parallel [(const_int 3)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   srli45\t%0, 24
+   srli\t%0, %1, 24"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn_and_split "vec_extractv4qihi0"  ;; lane 0 sign-extended to HI: extract, then extendqihi2
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" " r")
+	    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (QImode);  /* the extracted byte */
+  emit_insn (gen_vec_extractv4qi0 (tmp, operands[1]));
+  emit_insn (gen_extendqihi2 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "vec_extractv4qihi1"  ;; lane 1 sign-extended to HI: extract, then extendqihi2
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" " r")
+	    (parallel [(const_int 1)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (QImode);  /* the extracted byte */
+  emit_insn (gen_vec_extractv4qi1 (tmp, operands[1]));
+  emit_insn (gen_extendqihi2 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "vec_extractv4qihi2"  ;; lane 2 sign-extended to HI: extract, then extendqihi2
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" " r")
+	    (parallel [(const_int 2)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (QImode);  /* the extracted byte */
+  emit_insn (gen_vec_extractv4qi2 (tmp, operands[1]));
+  emit_insn (gen_extendqihi2 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "vec_extractv4qihi3"  ;; lane 3 sign-extended to HI: extract, then extendqihi2
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(sign_extend:HI
+	  (vec_select:QI
+	    (match_operand:V4QI 1 "register_operand" " r")
+	    (parallel [(const_int 3)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"  ;; NOTE(review): split condition omits the endian test
+  [(const_int 1)]
+{
+  rtx tmp = gen_reg_rtx (QImode);  /* the extracted byte */
+  emit_insn (gen_vec_extractv4qi3 (tmp, operands[1]));
+  emit_insn (gen_extendqihi2 (operands[0], tmp));
+  DONE;
+}
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")])
+
+(define_expand "vec_extractv2hi"  ;; extract halfword lane operand 2 from V2HI operand 1
+  [(set (match_operand:HI 0 "register_operand" "")
+	(vec_select:HI
+	  (match_operand:V2HI 1          "nonimmediate_operand" "")
+	  (parallel [(match_operand:SI 2 "const_int_operand" "")])))]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (INTVAL (operands[2]) != 0  /* V2HI has lanes 0 and 1 only */
+      && INTVAL (operands[2]) != 1)
+    gcc_unreachable ();
+
+  if (MEM_P (operands[1]) && INTVAL (operands[2]) != (TARGET_BIG_ENDIAN ? 1 : 0))  /* memory source: lane 0 on LE, lane 1 on BE */
+    FAIL;
+})
+
+(define_insn "vec_extractv2hi0"  ;; lane 0 of a V2HI register or memory operand into an HI register
+  [(set (match_operand:HI 0 "register_operand"         "=$l,r,r")
+	(vec_select:HI
+	  (match_operand:V2HI 1 "nonimmediate_operand" "  l,r,m")
+	  (parallel [(const_int 0)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "seh33\t%0, %1";
+    case 1:  /* any general registers */
+      return "seh\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load_se (operands, 2);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load")
+   (set_attr "length"  "  2,  4,   4")])
+
+(define_insn "vec_extractv2hi0_ze"  ;; lane 0 of a V2HI, zero-extended to SI
+  [(set (match_operand:SI 0 "register_operand"         "=$l, r,$  l, *r")
+        (zero_extend:SI
+	  (vec_select:HI
+	    (match_operand:V2HI 1 "nonimmediate_operand" "  l, r, U33,  m")
+	    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "zeh33\t%0, %1";
+    case 1:  /* any general registers */
+      return "zeh\t%0, %1";
+    case 2:  /* memory matching constraint U33 */
+      return nds32_output_16bit_load (operands, 2);
+    case 3:  /* any other memory source */
+      return nds32_output_32bit_load (operands, 2);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"   "alu,alu,load,load")
+   (set_attr "length" "  2,  4,   2,   4")])
+
+(define_insn "vec_extractv2hi0_se"  ;; lane 0 of a V2HI, sign-extended to SI
+  [(set (match_operand:SI 0 "register_operand"         "=$l, r, r")
+        (sign_extend:SI
+	  (vec_select:HI
+	    (match_operand:V2HI 1 "nonimmediate_operand" "  l,r,m")
+	    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "seh33\t%0, %1";
+    case 1:  /* any general registers */
+      return "seh\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load_se (operands, 2);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"   "alu,alu,load")
+   (set_attr "length" "  2,  4,   4")])
+
+(define_insn "vec_extractv2hi0_be"  ;; big-endian lane 0: arithmetic shift right by 16
+  [(set (match_operand:HI 0 "register_operand"     "=$d,r")
+	(vec_select:HI
+	  (match_operand:V2HI 1 "register_operand" "  0,r")
+	  (parallel [(const_int 0)])))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "@
+   srai45\t%0, 16
+   srai\t%0, %1, 16"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_extractv2hi1"  ;; little-endian lane 1: arithmetic shift right by 16
+  [(set (match_operand:HI 0 "register_operand"     "=$d,r")
+	(vec_select:HI
+	  (match_operand:V2HI 1 "register_operand" "  0,r")
+	  (parallel [(const_int 1)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   srai45\t%0, 16
+   srai\t%0, %1, 16"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_extractv2hi1_se"  ;; lane 1 sign-extended to SI: arithmetic shift right by 16
+  [(set (match_operand:SI 0 "register_operand"     "=$d,r")
+	(sign_extend:SI
+	  (vec_select:HI
+	    (match_operand:V2HI 1 "register_operand" "  0,r")
+	    (parallel [(const_int 1)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   srai45\t%0, 16
+   srai\t%0, %1, 16"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_extractv2hi1_ze"  ;; lane 1 zero-extended to SI: logical shift right by 16
+  [(set (match_operand:SI 0 "register_operand"     "=$d,r")
+	(zero_extend:SI
+	  (vec_select:HI
+	    (match_operand:V2HI 1 "register_operand" "  0,r")
+	    (parallel [(const_int 1)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "@
+   srli45\t%0, 16
+   srli\t%0, %1, 16"
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")])  ;; first alternative is given length 2, second length 4
+
+(define_insn "vec_extractv2hi1_be"  ;; big-endian lane 1 of a V2HI register or memory operand
+  [(set (match_operand:HI 0 "register_operand"         "=$l,r,r")
+	(vec_select:HI
+	  (match_operand:V2HI 1 "nonimmediate_operand" "  l,r,m")
+	  (parallel [(const_int 1)])))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+{
+  switch (which_alternative)
+    {
+    case 0:  /* both operands in class l */
+      return "seh33\t%0, %1";
+    case 1:  /* any general registers */
+      return "seh\t%0, %1";
+    case 2:  /* memory source */
+      return nds32_output_32bit_load_se (operands, 2);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,load")
+   (set_attr "length"  "  2,  4,   4")])
+
+(define_insn "<su>mul16"  ;; lane-wise widening multiply: V2HI x V2HI -> V2SI
+  [(set (match_operand:V2SI 0 "register_operand"                         "=r")
+	(mult:V2SI (extend:V2SI (match_operand:V2HI 1 "register_operand" "%r"))  ;; % marks operands 1 and 2 commutative
+		   (extend:V2SI (match_operand:V2HI 2 "register_operand" " r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>mul16\t%0, %1, %2"
+  [(set_attr "type"   "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>mulx16"  ;; crossed widening multiply of the halfword lanes of operands 1 and 2
+  [(set (match_operand:V2SI 0 "register_operand"         "=r")
+	(vec_merge:V2SI
+	  (vec_duplicate:V2SI
+	    (mult:SI  ;; op1 lane 0 * op2 lane 1
+	      (extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 1 "register_operand" " r")
+		  (parallel [(const_int 0)])))
+	      (extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 2 "register_operand" " r")
+		  (parallel [(const_int 1)])))))
+	  (vec_duplicate:V2SI
+	    (mult:SI  ;; op1 lane 1 * op2 lane 0
+	      (extend:SI
+		(vec_select:HI
+		  (match_dup 1)
+		  (parallel [(const_int 1)])))
+	      (extend:SI
+		(vec_select:HI
+		  (match_dup 2)
+		  (parallel [(const_int 0)])))))
+	  (const_int 1)))]  ;; mask 1: result lane 0 is the first product, lane 1 the second
+  "NDS32_EXT_DSP_P ()"
+  "<su>mulx16\t%0, %1, %2"
+  [(set_attr "type"    "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "rotrv2hi_1"  ;; swap the two halfword lanes with a 16-bit rotate
+  [(set (match_operand:V2HI 0 "register_operand"    "=r")
+	(vec_select:V2HI
+	   (match_operand:V2HI 1 "register_operand" " r")
+	   (parallel [(const_int 1) (const_int 0)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 16"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv2hi_1_be"  ;; big-endian: swap the two halfword lanes with a 16-bit rotate
+  [(set (match_operand:V2HI 0 "register_operand"    "=r")
+	(vec_select:V2HI
+	   (match_operand:V2HI 1 "register_operand" " r")
+	   (parallel [(const_int 1) (const_int 0)])))]  ;; a half-word rotate swaps lanes; [0 1] would be the identity
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 16"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_1"  ;; result lanes = source lanes 1,2,3,0, done with an 8-bit rotate
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 1) (const_int 2) (const_int 3) (const_int 0)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 8"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_1_be"  ;; big-endian 8-bit rotate right: result lanes = source lanes 3,0,1,2
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 3) (const_int 0) (const_int 1) (const_int 2)])))]  ;; lane 0 is the most significant byte
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 8"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_2"  ;; result lanes = source lanes 2,3,0,1, done with a 16-bit rotate
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 2) (const_int 3) (const_int 0) (const_int 1)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 16"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_2_be"  ;; big-endian 16-bit rotate: result lanes = source lanes 2,3,0,1
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 2) (const_int 3) (const_int 0) (const_int 1)])))]  ;; a half-word rotate swaps the byte pairs
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 16"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_3"  ;; result lanes = source lanes 3,0,1,2, done with a 24-bit rotate
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 3) (const_int 0) (const_int 1) (const_int 2)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 24"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "rotrv4qi_3_be"  ;; big-endian 24-bit rotate right: result lanes = source lanes 1,2,3,0
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 1) (const_int 2) (const_int 3) (const_int 0)])))]  ;; lane 0 is the most significant byte
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "rotri\t%0, %1, 24"
+  [(set_attr "type"   "alu")
+   (set_attr "length"  "4")])
+
+(define_insn "v4qi_dup_10"  ;; duplicate byte lanes 0,1 into lanes 2,3
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 0) (const_int 1) (const_int 0) (const_int 1)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "pkbb16\t%0, %1, %1"
+  [(set_attr "type"    "dpack")  ;; mnemonic spelled pkbb16 like every other pack pattern in this file
+   (set_attr "length"  "4")])
+
+(define_insn "v4qi_dup_32"  ;; duplicate byte lanes 2,3 into lanes 0,1
+  [(set (match_operand:V4QI 0 "register_operand"    "=r")
+	(vec_select:V4QI
+	   (match_operand:V4QI 1 "register_operand" " r")
+	   (parallel [(const_int 2) (const_int 3) (const_int 2) (const_int 3)])))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "pktt16\t%0, %1, %1"
+  [(set_attr "type"    "dpack")  ;; mnemonic spelled pktt16 like every other pack pattern in this file
+   (set_attr "length"  "4")])
+
+(define_expand "vec_unpacks_lo_v4qi"  ;; V4QI operand 1 -> V2HI operand 0; little-endian only.
+  [(match_operand:V2HI 0 "register_operand" "=r")
+   (match_operand:V4QI 1 "register_operand" " r")]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+{
+  emit_insn (gen_sunpkd810 (operands[0], operands[1]));  /* Reuse the sunpkd810 expander.  */
+  DONE;
+})
+
+(define_expand "sunpkd810"  ;; Dispatches to the big- or little-endian sunpkd810 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_sunpkd810_imp_be (operands[0], operands[1])
+	     : gen_sunpkd810_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific sunpkd810 pattern.  */
+  DONE;
+})
+
+(define_insn "<zs>unpkd810_imp"  ;; LE: result elt 1 <- ext (src elt 1), elt 0 <- ext (src elt 0).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd810\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd810_imp_inv"  ;; LE: same selection as unpkd810_imp, vec_merge arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd810\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd810_imp_be"  ;; BE: result elt 0 <- ext (src elt 2), elt 1 <- ext (src elt 3).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 2)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 3)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd810\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd810_imp_inv_be"  ;; BE: same selection as unpkd810_imp_be, arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 3)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 2)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd810\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "sunpkd820"  ;; Dispatches to the big- or little-endian sunpkd820 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_sunpkd820_imp_be (operands[0], operands[1])
+	     : gen_sunpkd820_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific sunpkd820 pattern.  */
+  DONE;
+})
+
+(define_insn "<zs>unpkd820_imp"  ;; LE: result elt 1 <- ext (src elt 2), elt 0 <- ext (src elt 0).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 2)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd820\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd820_imp_inv"  ;; LE: same selection as unpkd820_imp, vec_merge arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 2)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd820\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd820_imp_be"  ;; BE: result elt 0 <- ext (src elt 1), elt 1 <- ext (src elt 3).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 3)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd820\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd820_imp_inv_be"  ;; BE: same selection as unpkd820_imp_be, arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 3)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd820\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "sunpkd830"  ;; Dispatches to the big- or little-endian sunpkd830 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_sunpkd830_imp_be (operands[0], operands[1])
+	     : gen_sunpkd830_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific sunpkd830 pattern.  */
+  DONE;
+})
+
+(define_insn "<zs>unpkd830_imp"  ;; LE: result elt 1 <- ext (src elt 3), elt 0 <- ext (src elt 0).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 3)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd830\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd830_imp_inv"  ;; LE: same selection as unpkd830_imp, vec_merge arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 3)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd830\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd830_imp_be"  ;; BE: result elt 0 <- ext (src elt 0), elt 1 <- ext (src elt 3).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 3)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd830\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd830_imp_inv_be"  ;; BE: same selection as unpkd830_imp_be, arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 3)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd830\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "sunpkd831"  ;; Dispatches to the big- or little-endian sunpkd831 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_sunpkd831_imp_be (operands[0], operands[1])
+	     : gen_sunpkd831_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific sunpkd831 pattern.  */
+  DONE;
+})
+
+(define_insn "<zs>unpkd831_imp"  ;; LE: result elt 1 <- ext (src elt 3), elt 0 <- ext (src elt 1).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 3)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 1)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd831\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd831_imp_inv"  ;; LE: same selection as unpkd831_imp, vec_merge arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 3)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<zs>unpkd831\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd831_imp_be"  ;; BE: result elt 0 <- ext (src elt 0), elt 1 <- ext (src elt 2).
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 0)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 2)]))))
+	  (const_int 1)))]  ;; Mask bit 0: elt 0 from the first arm, elt 1 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd831\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_insn "<zs>unpkd831_imp_inv_be"  ;; BE: same selection as unpkd831_imp_be, arms swapped.
+  [(set (match_operand:V2HI 0 "register_operand"                     "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_operand:V4QI 1 "register_operand"             " r")
+		(parallel [(const_int 2)]))))
+	  (vec_duplicate:V2HI
+	    (extend:HI
+	      (vec_select:QI
+		(match_dup 1)
+		(parallel [(const_int 0)]))))
+	  (const_int 2)))]  ;; Mask bit 1: elt 1 from the first arm, elt 0 from the second.
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "<zs>unpkd831\t%0, %1"
+  [(set_attr "type"    "dpack")
+   (set_attr "length"  "4")])
+
+(define_expand "zunpkd810"  ;; Dispatches to the big- or little-endian zunpkd810 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_zunpkd810_imp_be (operands[0], operands[1])
+	     : gen_zunpkd810_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific zunpkd810 pattern.  */
+  DONE;
+})
+
+(define_expand "zunpkd820"  ;; Dispatches to the big- or little-endian zunpkd820 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_zunpkd820_imp_be (operands[0], operands[1])
+	     : gen_zunpkd820_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific zunpkd820 pattern.  */
+  DONE;
+})
+
+(define_expand "zunpkd830"  ;; Dispatches to the big- or little-endian zunpkd830 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_zunpkd830_imp_be (operands[0], operands[1])
+	     : gen_zunpkd830_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific zunpkd830 pattern.  */
+  DONE;
+})
+
+(define_expand "zunpkd831"  ;; Dispatches to the big- or little-endian zunpkd831 insn.
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand:V4QI 1 "register_operand")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_zunpkd831_imp_be (operands[0], operands[1])
+	     : gen_zunpkd831_imp (operands[0], operands[1]));
+  emit_insn (pat);  /* Endian-specific zunpkd831 pattern.  */
+  DONE;
+})
+
+(define_expand "smbb"  ;; Expands through mulhisi3v with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Both multiplicands use the same V2HI element: index 1 on big-endian
+     targets, index 0 otherwise.  */
+  rtx sel = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+
+  emit_insn (gen_mulhisi3v (operands[0], operands[1], operands[2],
+			    sel, sel));
+  DONE;
+})
+
+(define_expand "smbt"  ;; Expands through mulhisi3v with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Element indices for operands 1 and 2: (1, 0) on big-endian targets,
+     (0, 1) otherwise.  */
+  rtx sel_a = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  rtx sel_b = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+  emit_insn (gen_mulhisi3v (operands[0], operands[1], operands[2],
+			    sel_a, sel_b));
+  DONE;
+})
+
+(define_expand "smtt"  ;; Expands through mulhisi3v with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Both multiplicands use the same V2HI element: index 0 on big-endian
+     targets, index 1 otherwise.  */
+  rtx sel = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+
+  emit_insn (gen_mulhisi3v (operands[0], operands[1], operands[2],
+			    sel, sel));
+  DONE;
+})
+
+(define_insn "mulhisi3v"  ;; SImode product of one sign-extended HI element from each V2HI operand.
+  [(set (match_operand:SI 0 "register_operand"                         "=   r,    r,    r,    r")
+	(mult:SI
+	  (sign_extend:SI
+	     (vec_select:HI
+	       (match_operand:V2HI 1 "register_operand"                "    r,    r,    r,    r")
+	       (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand"  " Iv00, Iv00, Iv01, Iv01")])))
+	  (sign_extend:SI (vec_select:HI
+	       (match_operand:V2HI 2 "register_operand"                "    r,    r,    r,    r")
+	       (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand"  " Iv00, Iv01, Iv01, Iv00")])))))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Templates indexed by [big-endian][which_alternative]; alternatives are
+     the (operand 3, operand 4) index pairs (0,0), (0,1), (1,1), (1,0).  */
+  static const char *const pats[2][4] =
+    {
+      /* Little-endian.  */
+      { "smbb\t%0, %1, %2",
+	"smbt\t%0, %1, %2",
+	"smtt\t%0, %1, %2",
+	"smbt\t%0, %2, %1" },
+      /* Big-endian.  */
+      { "smtt\t%0, %1, %2",
+	"smbt\t%0, %2, %1",
+	"smbb\t%0, %1, %2",
+	"smbt\t%0, %1, %2" }
+    };
+  return pats[TARGET_BIG_ENDIAN ? 1 : 0][which_alternative];
+}
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_expand "kmabb"  ;; Expands through kma_internal with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Both multiplicands use the same V2HI element: index 1 on big-endian
+     targets, index 0 otherwise.  */
+  rtx sel = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+
+  /* The addend, operands[1], is passed last, matching operand 5 of
+     kma_internal.  */
+  emit_insn (gen_kma_internal (operands[0], operands[2], operands[3],
+			       sel, sel, operands[1]));
+  DONE;
+})
+
+(define_expand "kmabt"  ;; Expands through kma_internal with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Element indices for operands 2 and 3: (1, 0) on big-endian targets,
+     (0, 1) otherwise.  */
+  rtx sel_a = GEN_INT (TARGET_BIG_ENDIAN ? 1 : 0);
+  rtx sel_b = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+
+  /* The addend, operands[1], is passed last (operand 5 of kma_internal).  */
+  emit_insn (gen_kma_internal (operands[0], operands[2], operands[3],
+			       sel_a, sel_b, operands[1]));
+  DONE;
+})
+
+(define_expand "kmatt"  ;; Expands through kma_internal with endian-adjusted element indices.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Both multiplicands use the same V2HI element: index 0 on big-endian
+     targets, index 1 otherwise.  */
+  rtx sel = GEN_INT (TARGET_BIG_ENDIAN ? 0 : 1);
+
+  /* The addend, operands[1], is passed last, matching operand 5 of
+     kma_internal.  */
+  emit_insn (gen_kma_internal (operands[0], operands[2], operands[3],
+			       sel, sel, operands[1]));
+  DONE;
+})
+
+(define_insn "kma_internal"  ;; ss_plus of an HI-element product and operand 5 (tied to operand 0).
+  [(set (match_operand:SI 0 "register_operand"                          "=    r,    r,    r,    r")
+	(ss_plus:SI
+	  (mult:SI
+	    (sign_extend:SI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand"                "    r,    r,    r,    r")
+	        (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand"  " Iv00, Iv00, Iv01, Iv01")])))
+	    (sign_extend:SI
+	      (vec_select:HI
+	        (match_operand:V2HI 2 "register_operand"                "    r,    r,    r,    r")
+	        (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand"  " Iv00, Iv01, Iv01, Iv00")]))))
+	  (match_operand:SI 5 "register_operand"                        "     0,    0,    0,    0")))]
+  "NDS32_EXT_DSP_P ()"
+{
+  /* Templates indexed by [big-endian][which_alternative]; alternatives are
+     the (operand 3, operand 4) index pairs (0,0), (0,1), (1,1), (1,0).  */
+  static const char *const pats[2][4] =
+    {
+      /* Little-endian.  */
+      { "kmabb\t%0, %1, %2",
+	"kmabt\t%0, %1, %2",
+	"kmatt\t%0, %1, %2",
+	"kmabt\t%0, %2, %1" },
+      /* Big-endian.  */
+      { "kmatt\t%0, %1, %2",
+	"kmabt\t%0, %2, %1",
+	"kmabb\t%0, %1, %2",
+	"kmabt\t%0, %1, %2" }
+    };
+  return pats[TARGET_BIG_ENDIAN ? 1 : 0][which_alternative];
+}
+  [(set_attr "type"    "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "smds"  ;; Dispatches to smds_be or smds_le according to endianness.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_smds_be (operands[0], operands[1], operands[2])
+	     : gen_smds_le (operands[0], operands[1], operands[2]));
+  emit_insn (pat);  /* Endian-specific smds expansion.  */
+  DONE;
+})
+
+(define_expand "smds_le"  ;; op0 = elt1 (op1) * elt1 (op2) - elt0 (op1) * elt0 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 1)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 0)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smds expander picks this for little-endian.
+{
+})
+
+(define_expand "smds_be"  ;; op0 = elt0 (op1) * elt0 (op2) - elt1 (op1) * elt1 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 0)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 1)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smds expander picks this for big-endian.
+{
+})
+
+(define_expand "smdrs"  ;; Dispatches to smdrs_be or smdrs_le according to endianness.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_smdrs_be (operands[0], operands[1], operands[2])
+	     : gen_smdrs_le (operands[0], operands[1], operands[2]));
+  emit_insn (pat);  /* Endian-specific smdrs expansion.  */
+  DONE;
+})
+
+(define_expand "smdrs_le"  ;; op0 = elt0 (op1) * elt0 (op2) - elt1 (op1) * elt1 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 0)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 1)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smdrs expander picks this for little-endian.
+{
+})
+
+(define_expand "smdrs_be"  ;; op0 = elt1 (op1) * elt1 (op2) - elt0 (op1) * elt0 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 1)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 0)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smdrs expander picks this for big-endian.
+{
+})
+
+(define_expand "smxdsv"  ;; Dispatches to smxdsv_be or smxdsv_le according to endianness.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:V2HI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  rtx pat = (TARGET_BIG_ENDIAN
+	     ? gen_smxdsv_be (operands[0], operands[1], operands[2])
+	     : gen_smxdsv_le (operands[0], operands[1], operands[2]));
+  emit_insn (pat);  /* Endian-specific smxdsv expansion.  */
+  DONE;
+})
+
+
+(define_expand "smxdsv_le"  ;; op0 = elt1 (op1) * elt0 (op2) - elt0 (op1) * elt1 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 0)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 1)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smxdsv expander picks this for little-endian.
+{
+})
+
+(define_expand "smxdsv_be"  ;; op0 = elt0 (op1) * elt1 (op2) - elt1 (op1) * elt0 (op2), HI elements sign-extended.
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" " r")
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" " r")
+			      (parallel [(const_int 1)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 0)]))))))]
+  "NDS32_EXT_DSP_P ()"  ;; No endianness test here; the smxdsv expander picks this for big-endian.
+{
+})
+
+(define_insn "smal1"  ;; smal form: op1 + sign_extend:DI (SImode elt0 * elt1 of op2).
+  [(set (match_operand:DI 0 "register_operand"             "=r")
+	(plus:DI (match_operand:DI 1 "register_operand"    " r")
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 2 "register_operand" " r")
+		  (parallel [(const_int 0)])))
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_dup 2)
+		  (parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal2"  ;; smal form: op1 + DImode product of sign-extended elt0 and elt1 of op2.
+  [(set (match_operand:DI 0 "register_operand"           "=r")
+	(plus:DI (match_operand:DI 1 "register_operand"  " r")
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal3"  ;; As smal1 with the multiplicand elements in the order elt1, elt0.
+  [(set (match_operand:DI 0 "register_operand"             "=r")
+	(plus:DI (match_operand:DI 1 "register_operand"    " r")
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 2 "register_operand" " r")
+		  (parallel [(const_int 1)])))
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_dup 2)
+		  (parallel [(const_int 0)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal4"  ;; As smal2 with the multiplicand elements in the order elt1, elt0.
+  [(set (match_operand:DI 0 "register_operand"           "=r")
+	(plus:DI (match_operand:DI 1 "register_operand"  " r")
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal5"  ;; As smal1 with the plus operands swapped: the product comes first.
+  [(set (match_operand:DI 0 "register_operand"             "=r")
+	(plus:DI
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 2 "register_operand" " r")
+		  (parallel [(const_int 0)])))
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_dup 2)
+		  (parallel [(const_int 1)])))))
+	  (match_operand:DI 1 "register_operand"           " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal6"  ;; As smal2 with the plus operands swapped: the product comes first.
+  [(set (match_operand:DI 0 "register_operand"           "=r")
+	(plus:DI
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 0)])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 1)]))))
+	  (match_operand:DI 1 "register_operand"         " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal7"  ;; As smal3 with the plus operands swapped: the product comes first.
+  [(set (match_operand:DI 0 "register_operand"             "=r")
+	(plus:DI
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_operand:V2HI 2 "register_operand" " r")
+		  (parallel [(const_int 1)])))
+	      (sign_extend:SI
+		(vec_select:HI
+		  (match_dup 2)
+		  (parallel [(const_int 0)])))))
+	  (match_operand:DI 1 "register_operand"           " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"    "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smal8"  ;; As smal4 with the plus operands swapped: the product comes first.
+  [(set (match_operand:DI 0 "register_operand"           "=r")
+	(plus:DI
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand" " r")
+		(parallel [(const_int 1)])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_dup 2)
+		(parallel [(const_int 0)]))))
+	  (match_operand:DI 1 "register_operand"         " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "smal\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+;; We need this dummy pattern for smal; it always splits into two SImode insns.
+(define_insn_and_split "extendsidi2"
+  [(set (match_operand:DI 0 "register_operand" "")
+	(sign_extend:DI (match_operand:SI 1 "nds32_move_operand" "")))]
+  "NDS32_EXT_DSP_P ()"
+  "#"
+  "NDS32_EXT_DSP_P ()"
+  [(const_int 0)]
+{
+  /* Low and high parts of the DImode destination.  */
+  rtx lo = nds32_di_low_part_subreg (operands[0]);
+  rtx hi = nds32_di_high_part_subreg (operands[0]);
+
+  /* Copy the source, then shift it right arithmetically by 31 into HI.  */
+  emit_move_insn (lo, operands[1]);
+  emit_insn (gen_ashrsi3 (hi, lo, GEN_INT (31)));
+  DONE;
+}
+  [(set_attr "type"   "alu")
+   (set_attr "length"   "4")])
+
+;; We need this dummy pattern for usmar64/usmsr64; it always splits into two moves.
+(define_insn_and_split "zero_extendsidi2"
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI (match_operand:SI 1 "nds32_move_operand" "")))]
+  "NDS32_EXT_DSP_P ()"
+  "#"
+  "NDS32_EXT_DSP_P ()"
+  [(const_int 0)]
+{
+  /* Low and high parts of the DImode destination.  */
+  rtx lo = nds32_di_low_part_subreg (operands[0]);
+  rtx hi = nds32_di_high_part_subreg (operands[0]);
+
+  /* The low part receives the source; the high part is set to zero.  */
+  emit_move_insn (lo, operands[1]);
+  emit_move_insn (hi, const0_rtx);
+  DONE;
+}
+  [(set_attr "type"   "alu")
+   (set_attr "length"   "4")])
+
+(define_insn_and_split "extendhidi2"  ;; HI -> DI sign extension, always split.
+  [(set (match_operand:DI 0 "register_operand" "")
+	(sign_extend:DI (match_operand:HI 1 "nonimmediate_operand" "")))]
+  "NDS32_EXT_DSP_P ()"
+  "#"
+  "NDS32_EXT_DSP_P ()"
+  [(const_int 0)]
+{
+  /* Low and high parts of the DImode destination.  */
+  rtx lo = nds32_di_low_part_subreg (operands[0]);
+  rtx hi = nds32_di_high_part_subreg (operands[0]);
+
+  /* Extend the HImode source into the low part with extendhisi2, then
+     shift the low part right arithmetically by 31 into the high part.  */
+  emit_insn (gen_extendhisi2 (lo, operands[1]));
+  emit_insn (gen_ashrsi3 (hi, lo, GEN_INT (31)));
+  DONE;
+}
+  [(set_attr "type"   "alu")
+   (set_attr "length"   "4")])
+
+(define_insn "extendqihi2"  ;; QI -> HI sign extension between registers.
+  [(set (match_operand:HI 0 "register_operand"                 "=r")
+	(sign_extend:HI (match_operand:QI 1 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "sunpkd820\t%0, %1"  ;; Emitted as a sunpkd820 instruction.
+  [(set_attr "type"       "dpack")
+   (set_attr "length"     "4")])
+
+(define_insn "smulsi3_highpart"  ;; Upper 32 bits of the signed 64-bit product op1 * op2.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (sign_extend:DI (match_operand:SI 1 "register_operand" " r"))
+	      (sign_extend:DI (match_operand:SI 2 "register_operand" " r")))
+	    (const_int 32))))]
+  "NDS32_EXT_DSP_P ()"
+  "smmul\t%0, %1, %2"  ;; Emitted as smmul.
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "smmul_round"  ;; As smulsi3_highpart, with the product wrapped in UNSPEC_ROUND.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (unspec:DI [(mult:DI
+		  	  (sign_extend:DI (match_operand:SI 1 "register_operand" " r"))
+			  (sign_extend:DI (match_operand:SI 2 "register_operand" " r")))]
+		       UNSPEC_ROUND)
+	    (const_int 32))))]
+  "NDS32_EXT_DSP_P ()"
+  "smmul.u\t%0, %1, %2"  ;; Emitted as the .u form of smmul.
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "kmmac"  ;; op0 = ss_plus (op1, upper 32 bits of signed op2 * op3).
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(ss_plus:SI (match_operand:SI 1 "register_operand"             " 0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (mult:DI
+		(sign_extend:DI (match_operand:SI 2 "register_operand" " r"))
+		(sign_extend:DI (match_operand:SI 3 "register_operand" " r")))
+	      (const_int 32)))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmmac\t%0, %2, %3"  ;; Operand 1 is tied to operand 0, so it does not appear here.
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmmac_round"  ;; As kmmac, with the product wrapped in UNSPEC_ROUND.
+  [(set (match_operand:SI 0 "register_operand"                                     "=r")
+	(ss_plus:SI (match_operand:SI 1 "register_operand"                         " 0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (unspec:DI [(mult:DI
+			    (sign_extend:DI (match_operand:SI 2 "register_operand" " r"))
+			    (sign_extend:DI (match_operand:SI 3 "register_operand" " r")))]
+			 UNSPEC_ROUND)
+	      (const_int 32)))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmmac.u\t%0, %2, %3"  ;; Operand 1 is tied to operand 0, so it does not appear here.
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmmsb"  ;; op0 = ss_minus (op1, upper 32 bits of signed op2 * op3).
+  [(set (match_operand:SI 0 "register_operand"                         "=r")
+	(ss_minus:SI (match_operand:SI 1 "register_operand"            " 0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (mult:DI
+		(sign_extend:DI (match_operand:SI 2 "register_operand" " r"))
+		(sign_extend:DI (match_operand:SI 3 "register_operand" " r")))
+	      (const_int 32)))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmmsb\t%0, %2, %3"  ;; Operand 1 is tied to operand 0, so it does not appear here.
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmmsb_round"  ;; As kmmsb, with the product wrapped in UNSPEC_ROUND.
+  [(set (match_operand:SI 0 "register_operand"                                     "=r")
+	(ss_minus:SI (match_operand:SI 1 "register_operand"                        " 0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (unspec:DI [(mult:DI
+			    (sign_extend:DI (match_operand:SI 2 "register_operand" " r"))
+			    (sign_extend:DI (match_operand:SI 3 "register_operand" " r")))]
+			 UNSPEC_ROUND)
+	      (const_int 32)))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmmsb.u\t%0, %2, %3"  ;; Operand 1 is tied to operand 0, so it does not appear here.
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kwmmul"  ;; Upper 32 bits of ss_mult:DI of the doubled, sign-extended op1 and op2.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (ss_mult:DI
+	      (mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand" " r")) (const_int 2))
+	      (mult:DI (sign_extend:DI (match_operand:SI 2 "register_operand" " r")) (const_int 2)))
+	    (const_int 32))))]
+  "NDS32_EXT_DSP_P ()"
+  "kwmmul\t%0, %1, %2"  ;; Emitted as kwmmul.
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "kwmmul_round"  ;; Same RTL as kwmmul with the ss_mult wrapped in UNSPEC_ROUND; emits kwmmul.u.
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (unspec:DI [
+	      (ss_mult:DI
+		(mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand" " r")) (const_int 2))
+		(mult:DI (sign_extend:DI (match_operand:SI 2 "register_operand" " r")) (const_int 2)))]
+	      UNSPEC_ROUND)
+	    (const_int 32))))]
+  "NDS32_EXT_DSP_P ()"
+  "kwmmul.u\t%0, %1, %2"
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_expand "smmwb"  ;; Emits smulhisi3_highpart_1 (op0, op1, op2, lane); lane is 1 on big-endian, 0 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smulhisi3_highpart_1 (operands[0], operands[1], operands[2], GEN_INT (1)));
+  else
+    emit_insn (gen_smulhisi3_highpart_1 (operands[0], operands[1], operands[2], GEN_INT (0)));
+  DONE;
+})
+
+(define_expand "smmwt"  ;; Emits smulhisi3_highpart_1 (op0, op1, op2, lane); lane is 0 on big-endian, 1 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smulhisi3_highpart_1 (operands[0], operands[1], operands[2], GEN_INT (0)));
+  else
+    emit_insn (gen_smulhisi3_highpart_1 (operands[0], operands[1], operands[2], GEN_INT (1)));
+  DONE;
+})
+
+(define_insn "smulhisi3_highpart_1"  ;; op0 = truncate ((sext (op1) * sext (op2[op3])) >> 16); op3 is the V2HI lane index (0 or 1).
+  [(set (match_operand:SI 0 "register_operand"                           "=   r,    r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (sign_extend:DI (match_operand:SI 1 "register_operand"     "    r,    r"))
+	      (sign_extend:DI
+	        (vec_select:HI
+		  (match_operand:V2HI 2 "register_operand"               "    r,    r")
+		  (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iv00, Iv01")]))))
+	    (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternative 0 is lane 0, alternative 1 is lane 1; the mnemonic for each lane flips with endianness.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "smmwt\t%0, %1, %2",
+			     "smmwb\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "smmwb\t%0, %1, %2",
+			     "smmwt\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_insn "smulhisi3_highpart_2"  ;; Commuted form of smulhisi3_highpart_1: op1 is the V2HI source (lane op3), op2 the SI source.
+  [(set (match_operand:SI 0 "register_operand"                           "=   r,    r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (sign_extend:DI
+	        (vec_select:HI
+		  (match_operand:V2HI 1 "register_operand"               "    r,    r")
+		  (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iv00, Iv01")])))
+	      (sign_extend:DI (match_operand:SI 2 "register_operand"     "    r,    r")))
+	    (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* As in smulhisi3_highpart_1, print the SI source first and the V2HI source second: here that is %2, %1.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "smmwt\t%0, %2, %1",
+			     "smmwb\t%0, %2, %1" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "smmwb\t%0, %2, %1",
+			     "smmwt\t%0, %2, %1" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_expand "smmwb_round"  ;; Emits smmw_round_internal (op0, op1, op2, lane); lane is 1 on big-endian, 0 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smmw_round_internal (operands[0], operands[1], operands[2], GEN_INT (1)));
+  else
+    emit_insn (gen_smmw_round_internal (operands[0], operands[1], operands[2], GEN_INT (0)));
+  DONE;
+})
+
+(define_expand "smmwt_round"  ;; Emits smmw_round_internal (op0, op1, op2, lane); lane is 0 on big-endian, 1 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smmw_round_internal (operands[0], operands[1], operands[2], GEN_INT (0)));
+  else
+    emit_insn (gen_smmw_round_internal (operands[0], operands[1], operands[2], GEN_INT (1)));
+  DONE;
+})
+
+(define_insn "smmw_round_internal"  ;; op0 = truncate (UNSPEC_ROUND (sext (op1) * sext (op2[op3])) >> 16); op3 is the V2HI lane index.
+  [(set (match_operand:SI 0 "register_operand"                           "=   r,    r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (unspec:DI
+	      [(mult:DI
+		 (sign_extend:DI (match_operand:SI 1 "register_operand"     "    r,    r"))
+		 (sign_extend:DI
+		   (vec_select:HI
+		     (match_operand:V2HI 2 "register_operand"               "    r,    r")
+		     (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iv00, Iv01")]))))]
+	      UNSPEC_ROUND)
+	    (const_int 16))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternative 0 is lane 0, alternative 1 is lane 1; the mnemonic for each lane flips with endianness.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "smmwt.u\t%0, %1, %2",
+			     "smmwb.u\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "smmwb.u\t%0, %1, %2",
+			     "smmwt.u\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmul")
+   (set_attr "length"   "4")])
+
+(define_expand "kmmawb"  ;; Emits kmmaw_internal (op0, op2, op3, lane, op1); lane is 1 on big-endian, 0 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")  ;; passed as the insn's operand 4 (the ss_plus addend)
+   (match_operand:SI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_kmmaw_internal (operands[0], operands[2], operands[3], GEN_INT (1), operands[1]));
+  else
+    emit_insn (gen_kmmaw_internal (operands[0], operands[2], operands[3], GEN_INT (0), operands[1]));
+  DONE;
+})
+
+(define_expand "kmmawt"  ;; Emits kmmaw_internal (op0, op2, op3, lane, op1); lane is 0 on big-endian, 1 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")  ;; passed as the insn's operand 4 (the ss_plus addend)
+   (match_operand:SI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_kmmaw_internal (operands[0], operands[2], operands[3], GEN_INT (0), operands[1]));
+  else
+    emit_insn (gen_kmmaw_internal (operands[0], operands[2], operands[3], GEN_INT (1), operands[1]));
+  DONE;
+})
+
+(define_insn "kmmaw_internal"  ;; op0 = ss_plus (op4, truncate ((sext (op1) * sext (op2[op3])) >> 16)); op4 is tied to op0.
+  [(set (match_operand:SI 0 "register_operand"                               "=   r,    r")
+	(ss_plus:SI
+	  (match_operand:SI 4 "register_operand"                             "    0,    0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (mult:DI
+		(sign_extend:DI (match_operand:SI 1 "register_operand"       "    r,    r"))
+		  (sign_extend:DI
+		    (vec_select:HI
+		      (match_operand:V2HI 2 "register_operand"               "    r,    r")
+		      (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iv00, Iv01")]))))
+	      (const_int 16)))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternative 0 is lane 0, alternative 1 is lane 1; the mnemonic for each lane flips with endianness.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "kmmawt\t%0, %1, %2",
+			     "kmmawb\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "kmmawb\t%0, %1, %2",
+			     "kmmawt\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "kmmawb_round"  ;; Emits kmmaw_round_internal (op0, op2, op3, lane, op1); lane is 1 on big-endian, 0 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")  ;; passed as the insn's operand 4 (the ss_plus addend)
+   (match_operand:SI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_kmmaw_round_internal (operands[0], operands[2], operands[3], GEN_INT (1), operands[1]));
+  else
+    emit_insn (gen_kmmaw_round_internal (operands[0], operands[2], operands[3], GEN_INT (0), operands[1]));
+  DONE;
+}
+  [(set_attr "type"     "dmac")  ;; same type as kmmawt_round and kmmaw_round_internal
+   (set_attr "length"   "4")])
+
+(define_expand "kmmawt_round"  ;; Emits kmmaw_round_internal (op0, op2, op3, lane, op1); lane is 0 on big-endian, 1 otherwise.
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")  ;; passed as the insn's operand 4 (the ss_plus addend)
+   (match_operand:SI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_kmmaw_round_internal (operands[0], operands[2], operands[3], GEN_INT (0), operands[1]));
+  else
+    emit_insn (gen_kmmaw_round_internal (operands[0], operands[2], operands[3], GEN_INT (1), operands[1]));
+  DONE;
+}
+  [(set_attr "type"     "dmac")  ;; attribute vector attached to the expander itself
+   (set_attr "length"   "4")])
+
+
+(define_insn "kmmaw_round_internal"  ;; op0 = ss_plus (op4, truncate (UNSPEC_ROUND (sext (op1) * sext (op2[op3])) >> 16)); op4 tied to op0.
+  [(set (match_operand:SI 0 "register_operand"                                "=   r,    r")
+	(ss_plus:SI
+	  (match_operand:SI 4 "register_operand"                              "    0,    0")
+	  (truncate:SI
+	    (lshiftrt:DI
+	      (unspec:DI
+		[(mult:DI
+		   (sign_extend:DI (match_operand:SI 1 "register_operand"     "    r,    r"))
+		   (sign_extend:DI
+		     (vec_select:HI
+		       (match_operand:V2HI 2 "register_operand"               "    r,    r")
+		       (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iv00, Iv01")]))))]
+		UNSPEC_ROUND)
+	      (const_int 16)))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternative 0 is lane 0, alternative 1 is lane 1; the mnemonic for each lane flips with endianness.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "kmmawt.u\t%0, %1, %2",
+			     "kmmawb.u\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "kmmawb.u\t%0, %1, %2",
+			     "kmmawt.u\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "smalbb"  ;; Emits smaddhidi (op0, op2, op3, op1, i, j); lanes (i, j) are (1, 1) on big-endian, (0, 0) otherwise.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")  ;; passed as the insn's operand 3 (the DI addend)
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (1), GEN_INT (1)));
+  else
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (0), GEN_INT (0)));
+  DONE;
+})
+
+(define_expand "smalbt"  ;; Emits smaddhidi (op0, op2, op3, op1, i, j); lanes (i, j) are (1, 0) on big-endian, (0, 1) otherwise.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")  ;; passed as the insn's operand 3 (the DI addend)
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (1), GEN_INT (0)));
+  else
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (0), GEN_INT (1)));
+  DONE;
+})
+
+(define_expand "smaltt"  ;; Emits smaddhidi (op0, op2, op3, op1, i, j); lanes (i, j) are (0, 0) on big-endian, (1, 1) otherwise.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")  ;; passed as the insn's operand 3 (the DI addend)
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (0), GEN_INT (0)));
+  else
+    emit_insn (gen_smaddhidi (operands[0], operands[2],
+			      operands[3], operands[1],
+			      GEN_INT (1), GEN_INT (1)));
+  DONE;
+})
+
+(define_insn "smaddhidi"  ;; op0 = op3 + sext (op1[op4]) * sext (op2[op5]) in DImode; op3 is tied to op0.
+  [(set (match_operand:DI 0 "register_operand"                         "=   r,    r,    r,    r")
+	(plus:DI
+	  (match_operand:DI 3 "register_operand"                       "    0,    0,    0,    0")
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand"               "    r,    r,    r,    r")
+		(parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iv00, Iv00, Iv01, Iv01")])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand"               "    r,    r,    r,    r")
+		(parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iv00, Iv01, Iv01, Iv00")]))))))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternatives are lane pairs (op4, op5) = (0,0), (0,1), (1,1), (1,0); mixed pairs print smalbt, swapping %1/%2 as needed.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "smaltt\t%0, %1, %2",
+			     "smalbt\t%0, %2, %1",
+			     "smalbb\t%0, %1, %2",
+			     "smalbt\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "smalbb\t%0, %1, %2",
+			     "smalbt\t%0, %1, %2",
+			     "smaltt\t%0, %1, %2",
+			     "smalbt\t%0, %2, %1" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smaddhidi2"  ;; Same as smaddhidi with the plus operands in the other order: product first, addend op3 second.
+  [(set (match_operand:DI 0 "register_operand"                         "=   r,    r,    r,    r")
+	(plus:DI
+	  (mult:DI
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 1 "register_operand"               "    r,    r,    r,    r")
+		(parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iv00, Iv00, Iv01, Iv01")])))
+	    (sign_extend:DI
+	      (vec_select:HI
+		(match_operand:V2HI 2 "register_operand"               "    r,    r,    r,    r")
+		(parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iv00, Iv01, Iv01, Iv00")]))))
+	  (match_operand:DI 3 "register_operand"                       "    0,    0,    0,    0")))]
+  "NDS32_EXT_DSP_P ()"
+{ /* Alternatives are lane pairs (op4, op5) = (0,0), (0,1), (1,1), (1,0); mixed pairs print smalbt, swapping %1/%2 as needed.  */
+  if (TARGET_BIG_ENDIAN)
+    {
+      const char *pats[] = { "smaltt\t%0, %1, %2",
+			     "smalbt\t%0, %2, %1",
+			     "smalbb\t%0, %1, %2",
+			     "smalbt\t%0, %1, %2" };
+      return pats[which_alternative];
+    }
+  else
+    {
+      const char *pats[] = { "smalbb\t%0, %1, %2",
+			     "smalbt\t%0, %1, %2",
+			     "smaltt\t%0, %1, %2",
+			     "smalbt\t%0, %2, %1" };
+      return pats[which_alternative];
+    }
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "smalda1"  ;; Emits smalda1_be on big-endian targets, smalda1_le otherwise, with operands unchanged.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smalda1_be (operands[0], operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_smalda1_le (operands[0], operands[1], operands[2], operands[3]));
+  DONE;
+})
+
+(define_expand "smalds1"  ;; Emits smalds1_be on big-endian targets, smalds1_le otherwise, with operands unchanged.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smalds1_be (operands[0], operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_smalds1_le (operands[0], operands[1], operands[2], operands[3]));
+  DONE;
+})
+
+(define_insn "smalda1_le"  ;; op0 = op1 + sext (op2[1] * op3[1] + op2[0] * op3[0]); HI lanes are sign-extended to SI first.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (plus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 1)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 0)]))))))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "smalda\t%0, %2, %3"
+  [(set_attr "type"    "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smalds1_le"  ;; op0 = op1 + sext (op2[1] * op3[1] - op2[0] * op3[0]); HI lanes are sign-extended to SI first.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 1)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 0)]))))))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "smalds\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smalda1_be"  ;; Big-endian form: op0 = op1 + sext (op2[0] * op3[0] + op2[1] * op3[1]).
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (plus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 0)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 1)]))))))))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "smalda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smalds1_be"  ;; Big-endian form: op0 = op1 + sext (op2[0] * op3[0] - op2[1] * op3[1]).
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 0)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 1)]))))))))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "smalds\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "smaldrs3"  ;; Emits smaldrs3_be on big-endian targets, smaldrs3_le otherwise, with operands unchanged.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smaldrs3_be (operands[0], operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_smaldrs3_le (operands[0], operands[1], operands[2], operands[3]));
+  DONE;
+})
+
+(define_insn "smaldrs3_le"  ;; op0 = op1 + sext (op2[0] * op3[0] - op2[1] * op3[1]); HI lanes are sign-extended to SI first.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 0)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 1)]))))))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "smaldrs\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smaldrs3_be"  ;; Big-endian form: op0 = op1 + sext (op2[1] * op3[1] - op2[0] * op3[0]).
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 1)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 0)]))))))))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "smaldrs\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_expand "smalxda1"  ;; Emits smalxda1_be on big-endian targets, smalxda1_le otherwise, with operands unchanged.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smalxda1_be (operands[0], operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_smalxda1_le (operands[0], operands[1], operands[2], operands[3]));
+  DONE;
+})
+
+(define_expand "smalxds1"  ;; Emits smalxds1_be on big-endian targets, smalxds1_le otherwise, with operands unchanged.
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:DI 1 "register_operand" "")
+   (match_operand:V2HI 2 "register_operand" "")
+   (match_operand:V2HI 3 "register_operand" "")]
+  "NDS32_EXT_DSP_P ()"
+{
+  if (TARGET_BIG_ENDIAN)
+    emit_insn (gen_smalxds1_be (operands[0], operands[1], operands[2], operands[3]));
+  else
+    emit_insn (gen_smalxds1_le (operands[0], operands[1], operands[2], operands[3]));
+  DONE;
+})
+
+(define_insn "smalxd<add_sub>1_le"  ;; op0 = op1 + sext (op2[1] * op3[0] <plus_minus> op2[0] * op3[1]); plus_minus is presumably a code iterator defined elsewhere.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (plus_minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 0)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 1)]))))))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "smalxd<add_sub>\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+
+(define_insn "smalxd<add_sub>1_be"  ;; Big-endian form: op0 = op1 + sext (op2[0] * op3[1] <plus_minus> op2[1] * op3[0]).
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"                           " 0")
+	  (sign_extend:DI
+	    (plus_minus:SI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 0)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 1)]))))
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 2)
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_dup 3)
+				  (parallel [(const_int 0)]))))))))]
+  "NDS32_EXT_DSP_P () && TARGET_BIG_ENDIAN"
+  "smalxd<add_sub>\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smslda1"  ;; op0 = (op1 - sext (op2[1] * op3[1])) - sext (op2[0] * op3[0]); op1 is tied to op0.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(minus:DI
+	  (minus:DI
+	    (match_operand:DI 1 "register_operand"                           " 0")
+	    (sign_extend:DI
+	      (mult:SI
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 2 "register_operand" " r")
+				  (parallel [(const_int 1)])))
+		(sign_extend:SI (vec_select:HI
+				  (match_operand:V2HI 3 "register_operand" " r")
+				  (parallel [(const_int 1)]))))))
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 0)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smslda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "smslxda1"  ;; op0 = (op1 - sext (op2[1] * op3[0])) - sext (op2[0] * op3[1]); op1 is tied to op0.
+  [(set (match_operand:DI 0 "register_operand"                             "=r")
+	(minus:DI
+	  (minus:DI
+	    (match_operand:DI 1 "register_operand"                           " 0")
+	      (sign_extend:DI
+		(mult:SI
+		  (sign_extend:SI (vec_select:HI
+				    (match_operand:V2HI 2 "register_operand" " r")
+				    (parallel [(const_int 1)])))
+		  (sign_extend:SI (vec_select:HI
+				    (match_operand:V2HI 3 "register_operand" " r")
+				    (parallel [(const_int 0)]))))))
+	  (sign_extend:DI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "smslxda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+;; mada patterns, used to synthesize smalda.
+(define_insn_and_split "mada1"  ;; op0 = op1[op3] * op2[op4] + op1[op5] * op2[op6]; never output directly (template "#").
+  [(set (match_operand:SI 0 "register_operand"                          "=r")
+	(plus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" "r")
+			      (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" "r")
+			      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iu01")]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(match_operand:SI 6 "nds32_imm_0_1_operand" " Iu01")]))))))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 1)]
+{ /* Split into two mulhisi3v products held in fresh pseudos, then add them with addsi3.  */
+  rtx result0 = gen_reg_rtx (SImode);
+  rtx result1 = gen_reg_rtx (SImode);
+  emit_insn (gen_mulhisi3v (result0, operands[1], operands[2],
+			    operands[3], operands[4]));
+  emit_insn (gen_mulhisi3v (result1, operands[1], operands[2],
+			    operands[5], operands[6]));
+  emit_insn (gen_addsi3 (operands[0], result0, result1));
+  DONE;
+})
+
+(define_insn_and_split "mada2"  ;; op0 = op1[op3] * op2[op4] + op2[op5] * op1[op6]; second product has its factors commuted vs. mada1.
+  [(set (match_operand:SI 0 "register_operand"                          "=r")
+	(plus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" "r")
+			      (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" "r")
+			      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iu01")]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(match_operand:SI 6 "nds32_imm_0_1_operand" " Iu01")]))))))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 1)]
+{ /* op6 indexes op1 and op5 indexes op2 here, so the second mulhisi3v receives them as (op6, op5).  */
+  rtx result0 = gen_reg_rtx (SImode);
+  rtx result1 = gen_reg_rtx (SImode);
+  emit_insn (gen_mulhisi3v (result0, operands[1], operands[2],
+			    operands[3], operands[4]));
+  emit_insn (gen_mulhisi3v (result1, operands[1], operands[2],
+			    operands[6], operands[5]));
+  emit_insn (gen_addsi3 (operands[0], result0, result1));
+  DONE;
+})
+
+;; sms patterns, used to synthesize smalds.
+(define_insn_and_split "sms1"  ;; op0 = op1[op3] * op2[op4] - op1[op5] * op2[op6] on sign-extended HI lanes.
+  [(set (match_operand:SI 0 "register_operand"                                       "=   r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand"               "    r")
+			      (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand"               "    r")
+			      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iu01")]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(match_operand:SI 6 "nds32_imm_0_1_operand" " Iu01")]))))))]
+  "NDS32_EXT_DSP_P ()
+   && (!reload_completed
+       || !nds32_need_split_sms_p (operands[3], operands[4],
+				   operands[5], operands[6]))"
+
+{ /* The assembly text is chosen by nds32_output_sms from the four lane indices.  */
+  return nds32_output_sms (operands[3], operands[4],
+			   operands[5], operands[6]);
+}
+  "NDS32_EXT_DSP_P ()
+   && !reload_completed
+   && nds32_need_split_sms_p (operands[3], operands[4],
+			      operands[5], operands[6])"
+  [(const_int 1)]
+{ /* Before reload, lane combinations flagged by nds32_need_split_sms_p are expanded by nds32_split_sms.  */
+  nds32_split_sms (operands[0], operands[1], operands[2],
+		   operands[3], operands[4],
+		   operands[5], operands[6]);
+  DONE;
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn_and_split "sms2"  ;; op0 = op1[op3] * op2[op4] - op2[op5] * op1[op6]; second product has its factors commuted vs. sms1.
+  [(set (match_operand:SI 0 "register_operand"                                       "=   r")
+	(minus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand"               "    r")
+			      (parallel [(match_operand:SI 3 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand"               "    r")
+			      (parallel [(match_operand:SI 4 "nds32_imm_0_1_operand" " Iu01")]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(match_operand:SI 5 "nds32_imm_0_1_operand" " Iu01")])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(match_operand:SI 6 "nds32_imm_0_1_operand" " Iu01")]))))))]
+  "NDS32_EXT_DSP_P ()
+   && (!reload_completed
+       || !nds32_need_split_sms_p (operands[3], operands[4],
+				   operands[6], operands[5]))"
+{ /* op6 indexes op1 and op5 indexes op2 here, so the helpers receive them as (op6, op5).  */
+  return nds32_output_sms (operands[3], operands[4],
+			   operands[6], operands[5]);
+}
+  "NDS32_EXT_DSP_P ()
+   && !reload_completed
+   && nds32_need_split_sms_p (operands[3], operands[4],
+			      operands[6], operands[5])"
+  [(const_int 1)]
+{ /* Before reload, lane combinations flagged by nds32_need_split_sms_p are expanded by nds32_split_sms.  */
+  nds32_split_sms (operands[0], operands[1], operands[2],
+		   operands[3], operands[4],
+		   operands[6], operands[5]);
+  DONE;
+}
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmda"  ;; op0 = ss_plus (op1[1] * op2[1], op1[0] * op2[0]) on sign-extended HI lanes.
+  [(set (match_operand:SI 0 "register_operand"                          "=r")
+	(ss_plus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" "r")
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" "r")
+			      (parallel [(const_int 1)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 0)]))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmda\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmxda"  ;; op0 = ss_plus (op1[1] * op2[0], op1[0] * op2[1]) on sign-extended HI lanes.
+  [(set (match_operand:SI 0 "register_operand"                          "=r")
+	(ss_plus:SI
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 1 "register_operand" "r")
+			      (parallel [(const_int 1)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_operand:V2HI 2 "register_operand" "r")
+			      (parallel [(const_int 0)]))))
+	  (mult:SI
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 1)
+			      (parallel [(const_int 0)])))
+	    (sign_extend:SI (vec_select:HI
+			      (match_dup 2)
+			      (parallel [(const_int 1)]))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmxda\t%0, %1, %2"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmada"  ;; op0 = ss_plus (op1, ss_plus (op2[1] * op3[1], op2[0] * op3[0])); op1 is tied to op0.
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_plus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 1)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 0)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmada\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmada2"  ;; Same as kmada with the two lane products in the opposite order (lane 0 first); also emits kmada.
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_plus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 0)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmada\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmaxda"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_plus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 0)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmaxda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmads"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_minus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 1)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 0)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmads\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmadrs"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_minus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 0)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmadrs\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmaxds"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_plus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_minus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 0)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmaxds\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmsda"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_minus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_minus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 1)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 0)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmsda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmsxda"
+  [(set (match_operand:SI 0 "register_operand"                           "=r")
+	(ss_minus:SI
+	  (match_operand:SI 1 "register_operand"                         " 0")
+	  (ss_minus:SI
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 2 "register_operand" " r")
+				(parallel [(const_int 1)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_operand:V2HI 3 "register_operand" " r")
+				(parallel [(const_int 0)]))))
+	    (mult:SI
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 2)
+				(parallel [(const_int 0)])))
+	      (sign_extend:SI (vec_select:HI
+				(match_dup 3)
+				(parallel [(const_int 1)])))))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmsxda\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+;; smax[8|16] and umax[8|16]
+(define_insn "<opcode><mode>3"
+  [(set (match_operand:VQIHI 0 "register_operand"               "=r")
+	(sumax:VQIHI (match_operand:VQIHI 1 "register_operand" " r")
+		     (match_operand:VQIHI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<opcode><bits>\t%0, %1, %2"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+;; smin[8|16] and umin[8|16]
+(define_insn "<opcode><mode>3"
+  [(set (match_operand:VQIHI 0 "register_operand"              "=r")
+	(sumin:VQIHI (match_operand:VQIHI 1 "register_operand" " r")
+		     (match_operand:VQIHI 2 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "<opcode><bits>\t%0, %1, %2"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn "<opcode><mode>3_bb"
+  [(set (match_operand:<VELT> 0 "register_operand"                    "=r")
+	(sumin_max:<VELT> (vec_select:<VELT>
+			    (match_operand:VQIHI 1 "register_operand" " r")
+			    (parallel [(const_int 0)]))
+			  (vec_select:<VELT>
+			    (match_operand:VQIHI 2 "register_operand" " r")
+			    (parallel [(const_int 0)]))))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "<opcode><bits>\t%0, %1, %2"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn_and_split "<opcode><mode>3_tt"
+  [(set (match_operand:<VELT> 0 "register_operand"                    "=r")
+	(sumin_max:<VELT> (vec_select:<VELT>
+			    (match_operand:VQIHI 1 "register_operand" " r")
+			    (parallel [(const_int 1)]))
+			  (vec_select:<VELT>
+			    (match_operand:VQIHI 2 "register_operand" " r")
+			    (parallel [(const_int 1)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 0)]
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_<opcode><mode>3 (tmp, operands[1], operands[2]));
+  emit_insn (gen_rotr<mode>_1 (tmp, tmp));
+  emit_move_insn (operands[0], simplify_gen_subreg (<VELT>mode, tmp, <MODE>mode, 0));
+  DONE;
+}
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn_and_split "<opcode>v4qi3_22"
+  [(set (match_operand:QI 0 "register_operand"                   "=r")
+	(sumin_max:QI (vec_select:QI
+			(match_operand:V4QI 1 "register_operand" " r")
+			(parallel [(const_int 2)]))
+		      (vec_select:QI
+			(match_operand:V4QI 2 "register_operand" " r")
+			(parallel [(const_int 2)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 0)]
+{
+  rtx tmp = gen_reg_rtx (V4QImode);
+  emit_insn (gen_<opcode>v4qi3 (tmp, operands[1], operands[2]));
+  emit_insn (gen_rotrv4qi_2 (tmp, tmp));
+  emit_move_insn (operands[0], simplify_gen_subreg (QImode, tmp, V4QImode, 0));
+  DONE;
+}
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn_and_split "<opcode>v4qi3_33"
+  [(set (match_operand:QI 0 "register_operand"                   "=r")
+	(sumin_max:QI (vec_select:QI
+			(match_operand:V4QI 1 "register_operand" " r")
+			(parallel [(const_int 3)]))
+		      (vec_select:QI
+			(match_operand:V4QI 2 "register_operand" " r")
+			(parallel [(const_int 3)]))))]
+  "NDS32_EXT_DSP_P () && !reload_completed && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 0)]
+{
+  rtx tmp = gen_reg_rtx (V4QImode);
+  emit_insn (gen_<opcode>v4qi3 (tmp, operands[1], operands[2]));
+  emit_insn (gen_rotrv4qi_3 (tmp, tmp));
+  emit_move_insn (operands[0], simplify_gen_subreg (QImode, tmp, V4QImode, 0));
+  DONE;
+}
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn_and_split "<opcode>v2hi3_bbtt"
+  [(set (match_operand:V2HI 0 "register_operand"                         "=r")
+	(vec_merge:V2HI
+	  (vec_duplicate:V2HI
+	    (sumin_max:HI (vec_select:HI
+			    (match_operand:V2HI 1 "register_operand" " r")
+			    (parallel [(const_int 1)]))
+			  (vec_select:HI
+			    (match_operand:V2HI 2 "register_operand" " r")
+			    (parallel [(const_int 1)]))))
+	  (vec_duplicate:V2HI
+	    (sumin_max:HI (vec_select:HI
+			    (match_dup:V2HI 1)
+			    (parallel [(const_int 0)]))
+			  (vec_select:HI
+			    (match_dup:V2HI 2)
+			    (parallel [(const_int 0)]))))
+	  (const_int 2)))]
+  "NDS32_EXT_DSP_P () && !TARGET_BIG_ENDIAN"
+  "#"
+  "NDS32_EXT_DSP_P ()"
+  [(const_int 0)]
+{ /* Elements 1 and 0 are both combined, so emit the whole-vector V2HI insn.  */
+  emit_insn (gen_<opcode>v2hi3 (operands[0], operands[1], operands[2]));
+  DONE;
+}
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_expand "abs<mode>2"
+  [(set (match_operand:VQIHI 0 "register_operand"                "=r")
+	(ss_abs:VQIHI (match_operand:VQIHI 1 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P () && TARGET_HW_ABS && !flag_wrapv"
+{
+})
+
+(define_insn "kabs<mode>2"
+  [(set (match_operand:VQIHI 0 "register_operand"                "=r")
+	(ss_abs:VQIHI (match_operand:VQIHI 1 "register_operand" " r")))]
+  "NDS32_EXT_DSP_P ()"
+  "kabs<bits>\t%0, %1"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn "<su>mar64_1"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>mar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>mar64_2"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(plus:DI
+	  (mult:DI
+	    (extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))
+	  (match_operand:DI 1 "register_operand"     " 0")))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>mar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>mar64_3"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(plus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (extend:DI
+	    (mult:SI
+	      (match_operand:SI 2 "register_operand" " r")
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>mar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>mar64_4"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(plus:DI
+	  (extend:DI
+	  (mult:SI
+	      (match_operand:SI 2 "register_operand" " r")
+	      (match_operand:SI 3 "register_operand" " r")))
+	  (match_operand:DI 1 "register_operand"     " 0")))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>mar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>msr64"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(minus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>msr64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "<su>msr64_2"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(minus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (extend:DI
+	    (mult:SI
+	      (match_operand:SI 2 "register_operand" " r")
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "<su>msr64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+;; kmar64, kmsr64, ukmar64 and ukmsr64
+(define_insn "kmar64_1"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(ss_plus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (sign_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (sign_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmar64_2"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(ss_plus:DI
+	  (mult:DI
+	    (sign_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (sign_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))
+	  (match_operand:DI 1 "register_operand"     " 0")))]
+  "NDS32_EXT_DSP_P ()"
+  "kmar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "kmsr64"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(ss_minus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (sign_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (sign_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "kmsr64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "ukmar64_1"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(us_plus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (zero_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (zero_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "ukmar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "ukmar64_2"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(us_plus:DI
+	  (mult:DI
+	    (zero_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (zero_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))
+	  (match_operand:DI 1 "register_operand"     " 0")))]
+  "NDS32_EXT_DSP_P ()"
+  "ukmar64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "ukmsr64"
+  [(set (match_operand:DI 0 "register_operand"       "=r")
+	(us_minus:DI
+	  (match_operand:DI 1 "register_operand"     " 0")
+	  (mult:DI
+	    (zero_extend:DI
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (zero_extend:DI
+	      (match_operand:SI 3 "register_operand" " r")))))]
+  "NDS32_EXT_DSP_P ()"
+  "ukmsr64\t%0, %2, %3"
+  [(set_attr "type"     "dmac")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick1"
+  [(set (match_operand:SI 0 "register_operand"       "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand" " r")
+	      (match_operand:SI 3 "register_operand" " r"))
+	    (and:SI
+	      (match_operand:SI 2 "register_operand" " r")
+	      (not:SI (match_dup 3)))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %1, %2, %3"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick2"
+  [(set (match_operand:SI 0 "register_operand"       "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand" " r")
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (and:SI
+	      (not:SI (match_dup 2))
+	      (match_operand:SI 3 "register_operand" " r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %1, %3, %2"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick3"
+  [(set (match_operand:SI 0 "register_operand"       "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand" " r")
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (and:SI
+	      (match_operand:SI 3 "register_operand" " r")
+	      (not:SI (match_dup 1)))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %2, %3, %1"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick4"
+  [(set (match_operand:SI 0 "register_operand"       "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand" " r")
+	      (match_operand:SI 2 "register_operand" " r"))
+	    (and:SI
+	      (not:SI (match_dup 1))
+	      (match_operand:SI 3 "register_operand" " r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %2, %3, %1"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick5"
+  [(set (match_operand:SI 0 "register_operand"               "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand"         " r")
+	      (not:SI (match_operand:SI 2 "register_operand" " r")))
+	    (and:SI
+	      (match_operand:SI 3 "register_operand"         " r")
+	      (match_dup 2))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %3, %1, %2"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick6"
+  [(set (match_operand:SI 0 "register_operand"               "=r")
+	  (ior:SI
+	    (and:SI
+	      (not:SI (match_operand:SI 1 "register_operand" " r"))
+	      (match_operand:SI 2 "register_operand"         " r"))
+	    (and:SI
+	      (match_operand:SI 3 "register_operand" " r")
+	      (match_dup 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %3, %2, %1"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick7"
+  [(set (match_operand:SI 0 "register_operand"               "=r")
+	  (ior:SI
+	    (and:SI
+	      (match_operand:SI 1 "register_operand"         " r")
+	      (not:SI (match_operand:SI 2 "register_operand" " r")))
+	    (and:SI
+	      (match_dup 2)
+	      (match_operand:SI 3 "register_operand"         " r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %3, %1, %2"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "bpick8"
+  [(set (match_operand:SI 0 "register_operand"               "=r")
+	  (ior:SI
+	    (and:SI
+	      (not:SI (match_operand:SI 1 "register_operand" " r"))
+	      (match_operand:SI 2 "register_operand"         " r"))
+	    (and:SI
+	      (match_dup 1)
+	      (match_operand:SI 3 "register_operand"         " r"))))]
+  "NDS32_EXT_DSP_P ()"
+  "bpick\t%0, %3, %2, %1"
+  [(set_attr "type"     "dbpick")
+   (set_attr "length"   "4")])
+
+(define_insn "sraiu"
+  [(set (match_operand:SI 0 "register_operand"                              "=   r, r")
+	(unspec:SI [(ashiftrt:SI (match_operand:SI 1 "register_operand"     "    r, r")
+				 (match_operand:SI 2 "nds32_rimm5u_operand" " Iu05, r"))]
+		    UNSPEC_ROUND))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   srai.u\t%0, %1, %2
+   sra.u\t%0, %1, %2"
+  [(set_attr "type"   "daluround")
+   (set_attr "length" "4")])
+
+(define_insn "kssl"
+  [(set (match_operand:SI 0 "register_operand"                   "=   r, r")
+	(ss_ashift:SI (match_operand:SI 1 "register_operand"     "    r, r")
+		      (match_operand:SI 2 "nds32_rimm5u_operand" " Iu05, r")))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   kslli\t%0, %1, %2
+   ksll\t%0, %1, %2"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")])
+
+(define_insn "kslraw_round"
+  [(set (match_operand:SI 0 "register_operand"                  "=r")
+	(if_then_else:SI
+	  (lt:SI (match_operand:SI 2 "register_operand"        " r")
+		 (const_int 0))
+	  (unspec:SI [(ashiftrt:SI (match_operand:SI 1 "register_operand" " r")
+				   (neg:SI (match_dup 2)))]
+		     UNSPEC_ROUND)
+	  (ss_ashift:SI (match_dup 1)
+			(match_dup 2))))]
+  "NDS32_EXT_DSP_P ()"
+  "kslraw.u\t%0, %1, %2"
+  [(set_attr "type"    "daluround")
+   (set_attr "length"  "4")])
+
+(define_insn_and_split "<shift>di3"
+  [(set (match_operand:DI 0 "register_operand" "")
+	(shift_rotate:DI (match_operand:DI 1 "register_operand" "")
+			 (match_operand:SI 2 "nds32_rimm6u_operand" "")))]
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  "#"
+  "NDS32_EXT_DSP_P () && !reload_completed"
+  [(const_int 0)]
+{
+  if (REGNO (operands[0]) == REGNO (operands[1]))
+    {
+      rtx tmp = gen_reg_rtx (DImode);
+      nds32_split_<code>di3 (tmp, operands[1], operands[2]);
+      emit_move_insn (operands[0], tmp);
+    }
+  else
+    nds32_split_<code>di3 (operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_insn "sclip32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_CLIPS_OV))]
+  "NDS32_EXT_DSP_P ()"
+  "sclip32\t%0, %1, %2"
+  [(set_attr "type"   "dclip")
+   (set_attr "length" "4")]
+)
+
+(define_insn "uclip32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_CLIP_OV))]
+  "NDS32_EXT_DSP_P ()"
+  "uclip32\t%0, %1, %2"
+  [(set_attr "type"   "dclip")
+   (set_attr "length" "4")]
+)
+
+(define_insn "bitrev"
+  [(set (match_operand:SI 0 "register_operand"                 "=r,    r")
+	(unspec:SI [(match_operand:SI 1 "register_operand"     " r,    r")
+		    (match_operand:SI 2 "nds32_rimm5u_operand" " r, Iu05")]
+		   UNSPEC_BITREV))]
+  "NDS32_EXT_DSP_P ()" ; gated like the other "dalu" patterns in this file
+  "@
+   bitrev\t%0, %1, %2
+   bitrevi\t%0, %1, %2"
+  [(set_attr "type"   "dalu")
+   (set_attr "length" "4")]
+)
+
+;; wext, wexti: low 32 bits of DImode operand 1 right-shifted by operand 2.
+(define_insn "<su>wext"
+  [(set (match_operand:SI 0 "register_operand"                "=r,   r")
+	(truncate:SI
+	  (shiftrt:DI
+	    (match_operand:DI 1 "register_operand"            " r,   r")
+	    (match_operand:SI 2 "nds32_rimm5u_operand"        " r,Iu05"))))]
+  "NDS32_EXT_DSP_P ()"
+  "@
+   wext\t%0, %1, %2
+   wexti\t%0, %1, %2"
+  [(set_attr "type"     "dwext")
+   (set_attr "length"   "4")])
+
+;; raddw and rsubw: signed 32-bit add/sub done in 64 bits, then halved (>> 1).
+(define_insn "r<opcode>si3"
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (ashiftrt:DI
+	    (plus_minus:DI
+	      (sign_extend:DI (match_operand:SI 1 "register_operand" " r"))
+	      (sign_extend:DI (match_operand:SI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "r<opcode>w\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
+
+;; uraddw and ursubw: unsigned 32-bit add/sub done in 64 bits, then halved.
+(define_insn "ur<opcode>si3"
+  [(set (match_operand:SI 0 "register_operand"                       "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (plus_minus:DI
+	      (zero_extend:DI (match_operand:SI 1 "register_operand" " r"))
+	      (zero_extend:DI (match_operand:SI 2 "register_operand" " r")))
+	    (const_int 1))))]
+  "NDS32_EXT_DSP_P ()"
+  "ur<opcode>w\t%0, %1, %2"
+  [(set_attr "type"    "dalu")
+   (set_attr "length"  "4")])
diff --git a/gcc/config/nds32/nds32-e8.md b/gcc/config/nds32/nds32-e8.md
new file mode 100644
index 0000000..1f24b5c
--- /dev/null
+++ b/gcc/config/nds32/nds32-e8.md
@@ -0,0 +1,329 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define E8 pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_e8_machine")
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;; II - Instruction Issue / Address Generation
+;; EX - Instruction Execution
+;; EXD - Pseudo Stage / Load Data Completion
+
+(define_cpu_unit "e8_ii" "nds32_e8_machine")
+(define_cpu_unit "e8_ex" "nds32_e8_machine")
+
+(define_insn_reservation "nds_e8_unknown" 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_misc" 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_alu" 1
+  (and (eq_attr "type" "alu")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_load" 1
+  (and (match_test "nds32::load_single_p (insn)")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_store" 1
+  (and (match_test "nds32::store_single_p (insn)")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_1" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_2" 1
+  (and (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ii+e8_ex, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_3" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*2, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_4" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*3, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_5" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*4, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_6" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*5, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_7" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*6, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_8" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*7, e8_ex")
+
+(define_insn_reservation "nds_e8_load_multiple_12" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*11, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_1" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_2" 1
+  (and (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ii+e8_ex, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_3" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*2, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_4" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*3, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_5" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*4, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_6" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*5, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_7" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*6, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_8" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*7, e8_ex")
+
+(define_insn_reservation "nds_e8_store_multiple_12" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*11, e8_ex")
+
+(define_insn_reservation "nds_e8_mul_fast" 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "e8")))
+  "e8_ii, e8_ex")
+
+(define_insn_reservation "nds_e8_mul_slow" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "e8")))
+  "e8_ii, e8_ex*16")
+
+(define_insn_reservation "nds_e8_mac_fast" 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "e8")))
+  "e8_ii, e8_ii+e8_ex, e8_ex")
+
+(define_insn_reservation "nds_e8_mac_slow" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "e8")))
+  "e8_ii, (e8_ii+e8_ex)*16, e8_ex")
+
+(define_insn_reservation "nds_e8_div" 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, (e8_ii+e8_ex)*36, e8_ex")
+
+(define_insn_reservation "nds_e8_branch" 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "e8"))
+  "e8_ii, e8_ex")
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD
+;;     Load data from the memory and produce the loaded data. The result is
+;;     ready at EXD.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at EXD.
+;;   ADDR_OUT
+;;     Most load/store instructions can produce an address output if updating
+;;     the base register is required. The result is ready at EX, which is
+;;     produced by ALU.
+;;   ALU, MOVD44, MUL, MAC
+;;     The result is ready at EX.
+;;   DIV_Rs
+;;     A division instruction saves the quotient result to Rt and saves the
+;;     remainder result to Rs. The instruction is separated into two micro-
+;;     operations. The first micro-operation writes to Rt, and the second
+;;     one writes to Rs. Each of the results is ready at EX.
+;;
+;; Consumers (RHS)
+;;   ALU, MUL, DIV
+;;     Require operands at EX.
+;;   ADDR_IN_MOP(N)
+;;     N denotes that the address input is required by the N-th
+;;     micro-operation. Such an operand is required at II.
+;;   ST
+;;     A store instruction requires its data at EX.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at EX.
+;;   BR_COND
+;;     If a branch instruction is conditional, its input data is required at EX.
+
+;; LD -> ADDR_IN_MOP(1)
+(define_bypass 2	; Latency of 2 from the producer to the listed consumers.
+  "nds_e8_load"
+  "nds_e8_branch,\
+   nds_e8_load, nds_e8_store,\
+   nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds32_e8_load_to_ii_p"	; Guard: the bypass is ignored if this returns 0.
+)
+
+;; LD -> ALU, MUL, MAC, DIV, BR_COND, ST, SMW(N, 1)
+(define_bypass 2	; Latency of 2 from the producer to the listed consumers.
+  "nds_e8_load"
+  "nds_e8_alu,\
+   nds_e8_mul_fast, nds_e8_mul_slow,\
+   nds_e8_mac_fast, nds_e8_mac_slow,\
+   nds_e8_div,\
+   nds_e8_branch,\
+   nds_e8_store,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds32_e8_load_to_ex_p"	; Guard: the bypass is ignored if this returns 0.
+)
+
+;; ALU, MOVD44, MUL, MAC, DIV_Rs, LD_bi, ADDR_OUT -> ADDR_IN_MOP(1)
+(define_bypass 2	; Latency of 2 from the producers to the listed consumers.
+  "nds_e8_alu,\
+   nds_e8_mul_fast, nds_e8_mul_slow,\
+   nds_e8_mac_fast, nds_e8_mac_slow,\
+   nds_e8_div,\
+   nds_e8_load, nds_e8_store,\
+   nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds_e8_branch,\
+   nds_e8_load, nds_e8_store,\
+   nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds32_e8_ex_to_ii_p"	; Guard: the bypass is ignored if this returns 0.
+)
+
+;; LMW(N, N) -> ADDR_IN_MOP(1)
+(define_bypass 2	; Latency of 2 from the producers to the listed consumers.
+  "nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12"
+  "nds_e8_branch,\
+   nds_e8_load, nds_e8_store,\
+   nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds32_e8_last_load_to_ii_p"	; Guard: bypass is ignored if this returns 0.
+)
+
+;; LMW(N, N) -> ALU, MUL, MAC, DIV, BR_COND, ST, SMW(N, 1)
+(define_bypass 2	; Latency of 2 from the producers to the listed consumers.
+  "nds_e8_load_multiple_1,nds_e8_load_multiple_2, nds_e8_load_multiple_3,\
+   nds_e8_load_multiple_4,nds_e8_load_multiple_5, nds_e8_load_multiple_6,\
+   nds_e8_load_multiple_7,nds_e8_load_multiple_8, nds_e8_load_multiple_12"
+  "nds_e8_alu,\
+   nds_e8_mul_fast, nds_e8_mul_slow,\
+   nds_e8_mac_fast, nds_e8_mac_slow,\
+   nds_e8_div,\
+   nds_e8_branch,\
+   nds_e8_store,\
+   nds_e8_store_multiple_1,nds_e8_store_multiple_2, nds_e8_store_multiple_3,\
+   nds_e8_store_multiple_4,nds_e8_store_multiple_5, nds_e8_store_multiple_6,\
+   nds_e8_store_multiple_7,nds_e8_store_multiple_8, nds_e8_store_multiple_12"
+  "nds32_e8_last_load_to_ex_p"	; Guard: bypass is ignored if this returns 0.
+)
diff --git a/gcc/config/nds32/nds32-elf.opt b/gcc/config/nds32/nds32-elf.opt
new file mode 100644
index 0000000..afe6aad
--- /dev/null
+++ b/gcc/config/nds32/nds32-elf.opt
@@ -0,0 +1,16 @@
+mcmodel=
+Target RejectNegative Joined Enum(nds32_cmodel_type) Var(nds32_cmodel_option) Init(CMODEL_MEDIUM)
+Specify the address generation strategy for code model.
+
+Enum
+Name(nds32_cmodel_type) Type(enum nds32_cmodel_type)
+Known cmodel types (for use with the -mcmodel= option):
+
+EnumValue
+Enum(nds32_cmodel_type) String(small) Value(CMODEL_SMALL)
+
+EnumValue
+Enum(nds32_cmodel_type) String(medium) Value(CMODEL_MEDIUM)
+
+EnumValue
+Enum(nds32_cmodel_type) String(large) Value(CMODEL_LARGE)
diff --git a/gcc/config/nds32/nds32-fp-as-gp.c b/gcc/config/nds32/nds32-fp-as-gp.c
index f8b2738..6525915 100644
--- a/gcc/config/nds32/nds32-fp-as-gp.c
+++ b/gcc/config/nds32/nds32-fp-as-gp.c
@@ -1,4 +1,4 @@
-/* The fp-as-gp pass of Andes NDS32 cpu for GNU compiler
+/* fp-as-gp pass of Andes NDS32 cpu for GNU compiler
    Copyright (C) 2012-2016 Free Software Foundation, Inc.
    Contributed by Andes Technology Corporation.

@@ -24,19 +24,280 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "ira.h"
+#include "ira-int.h"
+#include "tree-pass.h"

 /* ------------------------------------------------------------------------ */

+/* Tell whether the current function is expected to need a prologue.  */
+static bool
+nds32_have_prologue_p (void)
+{
+  int regno;
+  bool need_p = flag_pic != 0;
+
+  /* A callee-saved FP_REGNUM or LP_REGNUM, or any of regs 0..27, counts.  */
+  need_p = need_p || NDS32_REQUIRED_CALLEE_SAVED_P (FP_REGNUM);
+  need_p = need_p || NDS32_REQUIRED_CALLEE_SAVED_P (LP_REGNUM);
+  for (regno = 0; regno < 28 && !need_p; regno++)
+    need_p = NDS32_REQUIRED_CALLEE_SAVED_P (regno);
+  return need_p;
+}
+
+static int
+nds32_get_symbol_count (void)
+{
+  basic_block cur_bb;
+  rtx_insn *cur_insn;
+  int n_symbol_mems = 0;
+
+  FOR_EACH_BB_FN (cur_bb, cfun)
+    FOR_BB_INSNS (cur_bb, cur_insn)
+      {
+	rtx set_pat, mem_rtx;
+
+	/* Only single-set insns whose addressing mode is a symbol matter.  */
+	if (!single_set (cur_insn)
+	    || !nds32_symbol_load_store_p (cur_insn))
+	  continue;
+
+	set_pat = PATTERN (cur_insn);
+	gcc_assert (GET_CODE (set_pat) == SET);
+
+	/* With a register source the candidate memory operand is the
+	   destination; otherwise it is the source.  */
+	mem_rtx = (REG_P (SET_SRC (set_pat))
+		   ? SET_DEST (set_pat)
+		   : SET_SRC (set_pat));
+
+	/* Only a bare MEM is counted (lwi37/swi37 candidates); QImode and
+	   HImode MEMs are wrapped by ZERO_EXTEND or SIGN_EXTEND.  */
+	if (MEM_P (mem_rtx))
+	  n_symbol_mems++;
+      }
+
+  return n_symbol_mems;
+}
+
 /* Function to determine whether it is worth to do fp_as_gp optimization.
-   Return 0: It is NOT worth to do fp_as_gp optimization.
-   Return 1: It is APPROXIMATELY worth to do fp_as_gp optimization.
+   Return false: It is NOT worth to do fp_as_gp optimization.
+   Return true: It is APPROXIMATELY worth to do fp_as_gp optimization.
    Note that if it is worth to do fp_as_gp optimization,
    we MUST set FP_REGNUM ever live in this function.  */
-int
+static bool
 nds32_fp_as_gp_check_available (void)
 {
-  /* By default we return 0.  */
-  return 0;
+  basic_block bb;
+  basic_block exit_bb;
+  edge_iterator ei;
+  edge e;
+  bool first_exit_blocks_p;
+
+  /* If ANY of the following conditions holds,
+     we DO NOT perform fp_as_gp optimization:
+       1. TARGET_FORBID_FP_AS_GP is set
+	  regardless of the TARGET_FORCE_FP_AS_GP.
+       2. User explicitly uses 'naked'/'no_prologue' attribute.
+	  We use nds32_naked_function_p() to help such checking.
+       3. We are not optimizing for size.
+       4. Need frame pointer.
+       5. If $fp is already required to be saved,
+	  it means $fp is already chosen by register allocator.
+	  Thus we had better not use it for fp_as_gp optimization.
+       6. This function is a vararg function.
+	  DO NOT apply fp_as_gp optimization on this function
+	  because it may change and break stack frame.
+       7. The epilogue is empty.
+	  This happens when the function uses exit()
+	  or its attribute is no_return.
+	  In that case, compiler will not expand epilogue
+	  so that we have no chance to output .omit_fp_end directive.  */
+  if (TARGET_FORBID_FP_AS_GP
+      || nds32_naked_function_p (current_function_decl)
+      || !optimize_size
+      || frame_pointer_needed
+      || NDS32_REQUIRED_CALLEE_SAVED_P (FP_REGNUM)
+      || (cfun->stdarg == 1)
+      || (find_fallthru_edge (EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) == NULL))
+    return false;
+
+  /* Disable fp_as_gp if there is any infinite loop, since $fp may be
+     reused inside an infinite loop by register renaming.
+     To detect infinite loops, require that exit_bb post-dominates
+     every other basic block; that holds when there is no infinite loop.  */
+  first_exit_blocks_p = true;
+  exit_bb = NULL;
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    {
+      /* With more than one exit block, do not perform fp_as_gp either.  */
+      if (!first_exit_blocks_p)
+	return false;
+
+      exit_bb = e->src;
+      first_exit_blocks_p = false;
+    }
+
+  /* No exit_bb found?  Then give up on fp_as_gp.  */
+  if (!exit_bb)
+    return false;
+
+  /* Every bb must be post-dominated by exit_bb if there is no infinite loop.  */
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      if (!dominated_by_p (CDI_POST_DOMINATORS,
+			   bb,
+			   exit_bb))
+	return false;
+    }
+
+  /* Now we can check the possibility of using fp_as_gp optimization.  */
+  if (TARGET_FORCE_FP_AS_GP)
+    {
+      /* User explicitly issues -mforce-fp-as-gp option.  */
+      return true;
+    }
+  else
+    {
+      /* In the following we are going to evaluate whether
+	 it is worth to do fp_as_gp optimization.  */
+      bool good_gain = false;
+      int symbol_count;
+
+      int threshold;
+
+      /* Check whether a prologue is already required.
+	 Note that $gp will be saved in prologue for PIC code generation.
+	 After that, we can set threshold by the existence of prologue.
+	 Each fp-implied instruction will gain 2-byte code size
+	 from gp-aware instruction, so we have following heuristics.  */
+      if (flag_pic
+	  || nds32_have_prologue_p ())
+	{
+	  /* Have-prologue:
+	       Compiler already intends to generate prologue content,
+	       so the fp_as_gp optimization will only insert
+	       'la $fp,_FP_BASE_' instruction, which will be
+	       converted into 4-byte instruction at link time.
+	       The threshold is "3" symbol accesses, 2 + 2 + 2 > 4.  */
+	  threshold = 3;
+	}
+      else
+	{
+	  /* No-prologue:
+	       Compiler originally does not generate prologue content,
+	       so the fp_as_gp optimization will NOT ONLY insert
+	       'la $fp,_FP_BASE_' instruction, but also causes
+	       push/pop instructions.
+	       If we are using v3push (push25/pop25),
+	       the threshold is "5" symbol accesses, 5*2 > 4 + 2 + 2;
+	       If we are using normal push (smw/lmw),
+	       the threshold is "5+2" symbol accesses 7*2 > 4 + 4 + 4.  */
+	  threshold = 5 + (TARGET_V3PUSH ? 0 : 2);
+	}
+
+      symbol_count = nds32_get_symbol_count ();
+
+      if (symbol_count >= threshold)
+	good_gain = true;
+
+      /* Enable fp_as_gp optimization when potential gain is good enough.  */
+      return good_gain;
+    }
+}
+
+static unsigned int
+nds32_fp_as_gp (void)
+{
+  /* Decide, for the current function, whether fp-as-gp is to be used.  */
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+  const bool enable_p = nds32_fp_as_gp_check_available ();
+
+  /* Here is a hack to IRA for enable/disable a hard register per function.
+     We *MUST* review this way after migrate gcc 4.9! */
+  if (!enable_p)
+    CLEAR_HARD_REG_BIT (this_target_ira_int->x_no_unit_alloc_regs, FP_REGNUM);
+  else
+    {
+      SET_HARD_REG_BIT (this_target_ira_int->x_no_unit_alloc_regs, FP_REGNUM);
+      df_set_regs_ever_live (FP_REGNUM, 1);
+    }
+
+  cfun->machine->fp_as_gp_p = enable_p;
+  free_dominance_info (CDI_POST_DOMINATORS);
+  return 1;
+}
+
+const pass_data pass_data_nds32_fp_as_gp =
+{
+  RTL_PASS,				/* type */
+  "fp_as_gp",				/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  0					/* todo_flags_finish */
+};
+
+class pass_nds32_fp_as_gp : public rtl_opt_pass
+{
+public:
+  pass_nds32_fp_as_gp (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_fp_as_gp, ctxt)
+  {}
+
+  /* opt_pass methods: gated on !TARGET_LINUX_ABI, TARGET_16_BIT, optimize_size.  */
+  bool gate (function *)
+  {
+    return !TARGET_LINUX_ABI
+	   && TARGET_16_BIT
+	   && optimize_size;
+  }
+  unsigned int execute (function *) { return nds32_fp_as_gp (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_fp_as_gp (gcc::context *ctxt)
+{
+  return new pass_nds32_fp_as_gp (ctxt);
 }

 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-fpu.md b/gcc/config/nds32/nds32-fpu.md
new file mode 100644
index 0000000..11eabd5
--- /dev/null
+++ b/gcc/config/nds32/nds32-fpu.md
@@ -0,0 +1,503 @@
+;; Machine description of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; SFmode moves
+
+(define_expand "movsf"
+  [(set (match_operand:SF 0 "general_operand" "")
+	(match_operand:SF 1 "general_operand" ""))]
+  ""
+{
+  /* Need to force register if mem <- !reg.  */
+  if (MEM_P (operands[0]) && !REG_P (operands[1]))
+    operands[1] = force_reg (SFmode, operands[1]);
+  if (CONST_DOUBLE_P (operands[1])
+      && !satisfies_constraint_Cs20 (operands[1]))
+    {
+      const REAL_VALUE_TYPE *r;
+      unsigned long l;
+
+      r = CONST_DOUBLE_REAL_VALUE (operands[1]);
+      REAL_VALUE_TO_TARGET_SINGLE (*r, l);
+
+      emit_move_insn (operands[0], gen_rtx_HIGH (SFmode, operands[1]));
+      /* Emit the movsf_lo step only when the low 12 bits are nonzero.  */
+      if ((l & 0xFFF) != 0)
+	emit_insn (gen_movsf_lo (operands[0], operands[0], operands[1]));
+      DONE;
+    }
+})
+
+(define_insn "movsf_lo"
+  [(set (match_operand:SF 0 "register_operand" "=r")
+	(lo_sum:SF (match_operand:SF 1 "register_operand" "r")
+		   (match_operand:SF 2 "immediate_operand" "i")))]
+  ""
+  "ori\t%0, %1, lo12(%2)"
+  [(set_attr "type"   "alu")
+   (set_attr "length"   "4")]
+)
+
+(define_insn "*movsf"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=r, r, U45, U33, U37, U45, m,   l,   l,   l,   d, r, f, *f, *r, f, Q,   r,   r,    r")
+	(match_operand:SF 1 "general_operand"      " r, r,   l,   l,   l,   d, r, U45, U33, U37, U45, m, f, *r, *f, Q, f,Cs05,Cs20, Chig"))]
+  "(register_operand(operands[0], SFmode)
+    || register_operand(operands[1], SFmode))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "mov55\t%0, %1";
+    case 1:
+      return "ori\t%0, %1, 0";
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+      return nds32_output_16bit_store (operands, 4);
+    case 6:
+      return nds32_output_32bit_store (operands, 4);
+    case 7:
+    case 8:
+    case 9:
+    case 10:
+      return nds32_output_16bit_load (operands, 4);
+    case 11:
+      return nds32_output_32bit_load (operands, 4);
+    case 12:	/* Both operands use the "f" constraint.  */
+      if (TARGET_FPU_SINGLE)
+	return "fcpyss\t%0, %1, %1";
+      else
+	return "#";	/* Presumably handled by the FPR-to-FPR define_split.  */
+    case 13:
+      return "fmtsr\t%1, %0";
+    case 14:
+      return "fmfsr\t%0, %1";
+    case 15:
+      return nds32_output_float_load (operands);
+    case 16:
+      return nds32_output_float_store (operands);
+    case 17:
+      return "movi55\t%0, %1";
+    case 18:
+      return "movi\t%0, %1";
+    case 19:
+      return "sethi\t%0, %1";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "alu,alu,store,store,store,store,store,load,load,load,load,load,fcpy,fmtsr,fmfsr,fload,fstore,alu,alu,alu")
+   (set_attr "length"  "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   4,   4,    4,    4,    4,     4,  2,  4,  4")
+   (set_attr "feature" " v1, v1,   v1,   v1,   v1,   v1,   v1,  v1,  v1,  v1,  v1,  v1, fpu,  fpu,  fpu,  fpu,   fpu, v1, v1, v1")])
+
+;; Conditional Move Instructions
+
+(define_expand "mov<mode>cc"
+  [(set (match_operand:ANYF 0 "register_operand" "")
+	(if_then_else:ANYF (match_operand 1 "nds32_float_comparison_operator" "")
+			   (match_operand:ANYF 2 "register_operand" "")
+			   (match_operand:ANYF 3 "register_operand" "")))]
+  ""
+{
+  if (nds32_cond_move_p (operands[1]))
+    {
+      /* The condition code in operands[1] is UNORDERED or ORDERED and
+	 the compared sub-operands are not in SFmode or (presumably) DFmode,
+	 so FAIL and let gcc handle it: we do not use the slt compare
+	 instruction to generate UNORDERED and ORDERED conditions.  */
+      FAIL;
+    }
+  else
+    nds32_expand_float_movcc (operands);
+})
+
+(define_insn "fcmov<mode>_eq"
+  [(set (match_operand:ANYF 0 "register_operand" "=f, f")
+	(if_then_else:ANYF (eq (match_operand:SI 1 "register_operand" "f, f")
+			       (const_int 0))
+			   (match_operand:ANYF 2 "register_operand" "f, 0")
+			   (match_operand:ANYF 3 "register_operand" "0, f")))]
+  ""
+  "@
+   fcmovz<size>\t%0,%2,%1
+   fcmovn<size>\t%0,%3,%1"
+  [(set_attr "type"  "fcmov")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fcmov<mode>_ne"
+  [(set (match_operand:ANYF 0 "register_operand" "=f, f")
+	(if_then_else:ANYF (ne (match_operand:SI 1 "register_operand" "f, f")
+			       (const_int 0))
+			   (match_operand:ANYF 2 "register_operand" "f, 0")
+			   (match_operand:ANYF 3 "register_operand" "0, f")))]
+  ""
+  "@
+   fcmovn<size>\t%0,%2,%1
+   fcmovz<size>\t%0,%3,%1"
+  [(set_attr "type"  "fcmov")
+   (set_attr "length" "4")]
+)
+
+;; Arithmetic instructions.
+
+(define_insn "add<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(plus:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		   (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fadd<size>\t %0, %1, %2"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "sub<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(minus:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		    (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fsub<size>\t %0, %1, %2"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+;; Multiplication insns.
+
+(define_insn "mul<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(mult:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		   (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fmul<size>\t %0, %1, %2"
+  [(set_attr "type"   "fmul<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fma<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (match_operand:ANYF 3 "register_operand" "0")))]
+  "TARGET_EXT_FPU_FMA"
+  "fmadd<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fnma<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (neg:ANYF (match_operand:ANYF 1 "register_operand" "f"))
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (match_operand:ANYF 3 "register_operand" "0")))]
+  "TARGET_EXT_FPU_FMA"
+  "fmsub<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fms<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (neg:ANYF (match_operand:ANYF 3 "register_operand" "0"))))]
+  "TARGET_EXT_FPU_FMA"
+  "fnmsub<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fnms<mode>4"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(fma:ANYF (neg:ANYF (match_operand:ANYF 1 "register_operand" "f"))
+		  (match_operand:ANYF 2 "register_operand" "f")
+		  (neg:ANYF (match_operand:ANYF 3 "register_operand" "0"))))]
+  "TARGET_EXT_FPU_FMA"
+  "fnmadd<size>\t%0, %1, %2"
+  [(set_attr "type"   "fmac<size>")
+   (set_attr "length" "4")]
+)
+
+;; Div Instructions.
+
+(define_insn "div<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(div:ANYF (match_operand:ANYF 1 "register_operand" "f")
+		  (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  "fdiv<size>\t %0, %1, %2"
+  [(set_attr "type"   "fdiv<size>")
+   (set_attr "length" "4")]
+)
+
+(define_insn "sqrt<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))]
+  ""
+  "fsqrt<size>\t %0, %1"
+  [(set_attr "type"   "fsqrt<size>")
+   (set_attr "length" "4")]
+)
+
+;; Conditional Branch patterns
+
+(define_expand "cstore<mode>4"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(match_operator:SI 1 "nds32_float_comparison_operator"
+	 [(match_operand:ANYF 2 "register_operand" "")
+	  (match_operand:ANYF 3 "register_operand" "")]))]
+  ""
+{
+  nds32_expand_float_cstore (operands);
+  DONE;
+})
+
+(define_expand "cbranch<mode>4"
+  [(set (pc)
+	(if_then_else (match_operator 0 "nds32_float_comparison_operator"
+		       [(match_operand:ANYF 1 "register_operand" "")
+			(match_operand:ANYF 2 "register_operand" "")])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))]
+  ""
+{
+  nds32_expand_float_cbranch (operands);
+  DONE;
+})
+
+;; Copysign Instructions.
+
+(define_insn "copysignsf3"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(unspec:SF [(match_operand:SF 1 "register_operand" "f")
+		    (match_operand:SF 2 "register_operand" "f")]
+		     UNSPEC_COPYSIGN))]
+  "TARGET_FPU_SINGLE"
+  "fcpyss\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+(define_insn "copysigndf3"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(unspec:DF [(match_operand:DF 1 "register_operand" "f")
+		    (match_operand:DF 2 "register_operand" "f")]
+		     UNSPEC_COPYSIGN))]
+  "TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE"	; NOTE(review): absdf2 and negdf2 need TARGET_FPU_DOUBLE only; confirm.
+  "fcpysd\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+(define_insn "*ncopysign<mode>3"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(neg:ANYF (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")
+				(match_operand:ANYF 2 "register_operand" "f")]
+				UNSPEC_COPYSIGN)))]
+  ""
+  "fcpyns<size>\t%0,%1,%2"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+;; Absolute Instructions
+
+(define_insn "abssf2"
+  [(set (match_operand:SF 0 "register_operand" "=f, r")
+	(abs:SF (match_operand:SF 1 "register_operand" "f, r")))]
+  "TARGET_FPU_SINGLE || TARGET_EXT_PERF"
+  "@
+   fabss\t%0, %1
+   bclr\t%0, %1, 31"
+  [(set_attr "type"    "fabs,alu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "fpu,pe1")]
+)
+
+(define_insn "absdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(abs:DF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_DOUBLE"
+  "fabsd\t%0, %1"
+  [(set_attr "type"   "fabs")
+   (set_attr "length" "4")]
+)
+
+;; Negation Instructions
+
+(define_insn "*negsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f, r")
+	(neg:SF (match_operand:SF 1 "register_operand" "f, r")))]
+  "TARGET_FPU_SINGLE || TARGET_EXT_PERF"
+  "@
+   fcpynss\t%0, %1, %1
+   btgl\t%0, %1, 31"
+  [(set_attr "type"    "fcpy,alu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "fpu,pe1")]
+)
+
+(define_insn "*negdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(neg:DF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_DOUBLE"
+  "fcpynsd\t%0, %1, %1"
+  [(set_attr "type"   "fcpy")
+   (set_attr "length" "4")]
+)
+
+;; Data Format Conversion Instructions
+
+(define_insn "floatunssi<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(unsigned_float:ANYF (match_operand:SI 1 "register_operand" "f")))]
+  ""
+  "fui2<size>\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "floatsi<mode>2"
+  [(set (match_operand:ANYF 0 "register_operand" "=f")
+	(float:ANYF (match_operand:SI 1 "register_operand" "f")))]
+  ""
+  "fsi2<size>\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fixuns_trunc<mode>si2"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(unsigned_fix:SI (fix:ANYF (match_operand:ANYF 1 "register_operand" "f"))))]
+  ""
+  "f<size>2ui.z\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "fix_trunc<mode>si2"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(fix:SI (fix:ANYF (match_operand:ANYF 1 "register_operand" "f"))))]
+  ""
+  "f<size>2si.z\t %0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "extendsfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(float_extend:DF (match_operand:SF 1 "register_operand" "f")))]
+  "TARGET_FPU_SINGLE && TARGET_FPU_DOUBLE"
+  "fs2d\t%0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "truncdfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(float_truncate:SF (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_FPU_SINGLE && TARGET_FPU_DOUBLE"
+  "fd2s\t%0, %1"
+  [(set_attr "type"   "falu")
+   (set_attr "length" "4")]
+)
+
+;; Compare Instructions
+
+(define_insn "cmp<mode>_eq"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(eq:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+  {
+    if (NDS32_EXT_FPU_DOT_E)
+      return "fcmpeq<size>.e %0, %1, %2";
+    else
+      return "fcmpeq<size>\t%0, %1, %2";
+  }
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_lt"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(lt:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmplt<size>.e %0, %1, %2";
+  else
+    return "fcmplt<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_le"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(le:SI (match_operand:ANYF 1 "register_operand" "f")
+	       (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmple<size>.e %0, %1, %2";
+  else
+    return "fcmple<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_insn "cmp<mode>_un"
+  [(set (match_operand:SI 0 "register_operand" "=f")
+	(unordered:SI (match_operand:ANYF 1 "register_operand" "f")
+		      (match_operand:ANYF 2 "register_operand" "f")))]
+  ""
+{
+  if (NDS32_EXT_FPU_DOT_E)
+    return "fcmpun<size>.e %0, %1, %2";
+  else
+    return "fcmpun<size>\t%0, %1, %2";
+}
+  [(set_attr "type"   "fcmp")
+   (set_attr "length" "4")]
+)
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(match_operand:SF 1 "register_operand" ""))]
+  "!TARGET_FPU_SINGLE
+   && NDS32_IS_FPR_REGNUM (REGNO (operands[0]))
+   && NDS32_IS_FPR_REGNUM (REGNO (operands[1]))"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))]
+{
+  operands[2] = gen_rtx_REG (SFmode, TA_REGNUM);
+})
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+	(match_operand:SF 1 "const_double_operand" ""))]
+  "!satisfies_constraint_Cs20 (operands[1])
+   && !satisfies_constraint_Chig (operands[1])"
+  [(set (match_dup 0) (high:SF (match_dup 1)))
+   (set (match_dup 0) (lo_sum:SF (match_dup 0) (match_dup 1)))])
+;; ----------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-gcse.c b/gcc/config/nds32/nds32-gcse.c
new file mode 100644
index 0000000..301981d
--- /dev/null
+++ b/gcc/config/nds32/nds32-gcse.c
@@ -0,0 +1,670 @@
+/* Global CSE pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ------------------------------------------------------------------------ */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "dbgcnt.h"
+#include "df.h"
+#include "reload.h"
+
+/* ------------------------------------------------------------------------ */
+
+struct expr
+{
+  /* The expression.  */
+  rtx expr;
+
+  /* The cached hash value of this entry.  */
+  hashval_t hash;
+
+  struct occr *antic_occr;	/* Head of the occurrence list.  */
+  /* Incremented each time this expression is inserted into the table.  */
+  unsigned int count;
+};
+
+struct occr
+{
+  /* Next occurrence of this expression.  */
+  struct occr *next;
+  /* The last insn of its block that computes the expression.  */
+  rtx_insn *insn;
+  /* Nonzero if this [anticipatable] occurrence has been deleted.  */
+  char deleted_p;
+};
+
+struct reg_avail_info
+{
+  basic_block last_bb;	/* Block for which the two LUIDs below are valid.  */
+  int first_set;	/* LUID recorded for the first set in LAST_BB.  */
+  int first_use;	/* LUID recorded for the first use in LAST_BB.  */
+};
+
+/* Hashtable helpers.  */
+
+struct expr_hasher : nofree_ptr_hash <expr>
+{
+  static inline hashval_t hash (const expr *);
+  static inline bool equal (const expr *, const expr *);
+};
+
+/* Hash callback for the expression table.
+   Nothing is computed here: the hash value was stored in the entry
+   when it was created, so simply hand that cached value back.  */
+
+inline hashval_t
+expr_hasher::hash (const expr *exp)
+{
+  return exp->hash;
+}
+
+/* Equality callback for the expression table.
+   Return true if EXP1 and EXP2 hold equivalent expressions.  */
+
+inline bool
+expr_hasher::equal (const expr *exp1, const expr *exp2)
+{
+  bool same_p = exp_equiv_p (exp1->expr, exp2->expr, 0, true) != 0;
+
+  gcc_assert (!same_p || exp1->hash == exp2->hash);
+  return same_p;
+}
+
+static hashval_t
+hash_expr (rtx x, int *do_not_record_p)
+{
+  /* Clear the flag first so the caller sees only what hash_rtx reports.  */
+  *do_not_record_p = 0;
+  return hash_rtx (x, GET_MODE (x), do_not_record_p, NULL, false);
+}
+
+
+/* Helpers for memory allocation/freeing.  */
+static void alloc_mem (void);
+static void free_mem (void);
+static void compute_hash_table (void);
+/* Scan the pattern of INSN and add an entry to the hash TABLE.
+   After reload we are interested in loads/stores only.  */
+static void hash_scan_set (rtx_insn *);
+static void insert_expr_in_table (rtx, rtx_insn *);
+static void dump_hash_table (FILE *);
+
+static struct obstack expr_obstack;
+/* The table itself.  */
+static hash_table <expr_hasher> *expr_table;
+static struct reg_avail_info *reg_avail_info;
+static sbitmap *hoist_vbein;
+static sbitmap *hoist_vbeout;
+
+/* Allocate the expression hash table and initialize the obstack on which
+   its entries and their occurrence lists are allocated.  */
+
+static void
+alloc_mem (void)
+{
+  /* Allocate the available expressions hash table.  We don't want to
+     make the hash table too small, but unnecessarily making it too large
+     also doesn't help.  The i/4 is a gcse.c relic, and seems like a
+     reasonable choice.  */
+  expr_table = new hash_table<expr_hasher> (MAX (get_max_insn_count () / 4,
+					     13));
+
+  /* We allocate everything on obstacks because we often can roll back
+     the whole obstack to some point.  Freeing obstacks is very fast.  */
+  gcc_obstack_init (&expr_obstack);
+}
+
+/* Free the hash table and the obstack set up by alloc_mem.  */
+
+static void
+free_mem (void)
+{
+  delete expr_table;
+  expr_table = NULL;
+
+  obstack_free (&expr_obstack, NULL);
+}
+
+
+/* Dump all expressions and occurrences that are currently in the
+   expression hash table to FILE.  */
+
+/* Traverse callback for dump_hash_table: print the expression in *SLOT,
+   its cached hash code and the insn of every recorded occurrence to FILE.  */
+int
+nds32_dump_expr_hash_table_entry (expr **slot, FILE *file)
+{
+  struct expr *entry = *slot;
+
+  fprintf (file, "expr: ");
+  print_rtl (file, entry->expr);
+  fprintf (file,"\nhashcode: %u\n", entry->hash);
+  fprintf (file,"list of occurrences:\n");
+
+  for (struct occr *o = entry->antic_occr; o != NULL; o = o->next)
+    {
+      print_rtl_single (file, o->insn);
+      fprintf (file, "\n");
+    }
+
+  fprintf (file, "\n");
+  /* Nonzero result; presumably keeps the table traversal going.  */
+  return 1;
+}
+
+static void
+dump_hash_table (FILE *file)
+{
+  const size_t count = expr_table->elements ();
+
+  fprintf (file, "\n\nexpression hash table\n");
+  fprintf (file, "size %ld, %ld elements, %f collision/search ratio\n",
+	   (long) expr_table->size (), (long) count, expr_table->collisions ());
+  if (count > 0)
+    {
+      fprintf (file, "\n\ntable entries:\n");
+      expr_table->traverse <FILE *, nds32_dump_expr_hash_table_entry> (file);
+    }
+  fprintf (file, "\n");
+}
+
+/* Insert expression X in INSN in the hash TABLE.
+   If it is already present, record INSN as the last occurrence in its
+   basic block.  Each recorded insertion also bumps the entry's COUNT.  */
+
+static void
+insert_expr_in_table (rtx x, rtx_insn *insn)
+{
+  int do_not_record_p;
+  hashval_t hash;
+  struct expr *cur_expr, **slot;
+  struct occr *antic_occr, *last_occr = NULL;
+
+  hash = hash_expr (x, &do_not_record_p);
+
+  /* Do not insert expression in the table if it contains volatile operands,
+     or if hash_expr determines the expression is something we don't want
+     to or can't handle.  */
+  if (do_not_record_p)
+    return;
+
+  /* We anticipate that redundant expressions are rare, so allocate a new
+     hash table element up front and initialize every field; obstack memory
+     is not zeroed, so COUNT must be cleared before it is incremented below.
+     If the expression turns out to be known, the element is rolled back.  */
+  cur_expr = (struct expr *) obstack_alloc (&expr_obstack,
+					    sizeof (struct expr));
+  cur_expr->expr = x;
+  cur_expr->hash = hash;
+  cur_expr->antic_occr = NULL;
+  cur_expr->count = 0;
+
+  slot = expr_table->find_slot_with_hash (cur_expr, hash, INSERT);
+
+  if (! (*slot))
+    /* The expression isn't found, so insert it.  */
+    *slot = cur_expr;
+  else
+    {
+      /* The expression is already in the table, so roll back the
+	 obstack and use the existing table entry.  */
+      obstack_free (&expr_obstack, cur_expr);
+      cur_expr = *slot;
+    }
+
+  /* Search for another occurrence in the same basic block.  */
+  antic_occr = cur_expr->antic_occr;
+  cur_expr->count++;
+  while (antic_occr
+	 && BLOCK_FOR_INSN (antic_occr->insn) != BLOCK_FOR_INSN (insn))
+    {
+      /* If an occurrence isn't found, save a pointer to the end of
+	 the list.  */
+      last_occr = antic_occr;
+      antic_occr = antic_occr->next;
+    }
+
+  if (antic_occr)
+    /* Found another instance of the expression in the same basic block.
+       Prefer this occurrence to the currently recorded one.  We want
+       the last one in the block and the block is scanned from start
+       to end.  */
+    antic_occr->insn = insn;
+  else
+    {
+      /* First occurrence of this expression in this basic block.  */
+      antic_occr = (struct occr *) obstack_alloc (&expr_obstack,
+						  sizeof (struct occr));
+
+      /* First occurrence of this expression in any block?  */
+      if (cur_expr->antic_occr == NULL)
+	cur_expr->antic_occr = antic_occr;
+      else
+	last_occr->next = antic_occr;
+
+      antic_occr->insn = insn;
+      antic_occr->next = NULL;
+      antic_occr->deleted_p = 0;
+    }
+}
+
+/* Record INSN, a single SET, in the hash table if it has a supported
+   format: a constant loaded into a register that INSN is the first in
+   its block to set and that is not used earlier in the block.  */
+static void
+hash_scan_set (rtx_insn *insn)
+{
+  rtx pat = PATTERN (insn);
+  rtx src = SET_SRC (pat);
+  rtx dest = SET_DEST (pat);
+  const struct reg_avail_info *info;
+
+  /* Don't mess with jumps and nops.  */
+  if (JUMP_P (insn) || set_noop_p (pat))
+    return;
+
+  /* TODO: support more formats.  */
+
+  /* Only hard registers tracked in reg_avail_info are considered.  */
+  if (!REG_P (dest) || REGNO (dest) > SP_REGNUM)
+    return;
+
+  /* Only consider locally anticipatable instructions currently: INSN
+     must hold the first set of the register in its block, and no use
+     of the register may come before that set.  */
+  info = &reg_avail_info[REGNO (dest)];
+  if (BLOCK_FOR_INSN (insn) != info->last_bb
+      || info->first_set != DF_INSN_LUID (insn)
+      || info->first_use < info->first_set)
+    return;
+
+  /* Only immediate input is supported currently (the bugzilla case).  */
+  if (CONST_INT_P (src) || CONST_DOUBLE_P (src))
+    insert_expr_in_table (PATTERN (insn), insn);
+}
+
+/* Record register first use information for REGNO in INSN.
+
+   first_use records the first place in the block where the register
+   is used and is used to compute "anticipatability".  last_bb records
+   the block for which first_use is valid, as a quick test to invalidate
+   them.  */
+
+static void
+record_first_reg_use_info (rtx_insn *insn, int regno)
+{
+  struct reg_avail_info *entry = &reg_avail_info[regno];
+  int luid = DF_INSN_LUID (insn);
+
+  /* Only the first reference to REGNO in a block is recorded.  */
+  if (entry->last_bb == BLOCK_FOR_INSN (insn))
+    return;
+
+  entry->last_bb = BLOCK_FOR_INSN (insn);
+  entry->first_use = luid;
+  /* A use came first: make first_set compare greater than first_use.  */
+  entry->first_set = luid + 1;
+}
+
+/* Called from compute_hash_table via note_uses to handle one used
+   expression of an insn, walking all of its sub-expressions.  DATA is
+   really the instruction in which the use is taking place.  */
+
+static void
+record_first_use_info (rtx *dest, void *data)
+{
+  rtx_insn *use_insn = static_cast<rtx_insn *> (data);
+  rtx x = *dest;
+
+  if (x == NULL_RTX)
+    return;
+
+  const enum rtx_code code = GET_CODE (x);
+  const char *const fmt = GET_RTX_FORMAT (code);
+
+  if (REG_P (x) && REGNO (x) <= SP_REGNUM)
+    {
+      record_first_reg_use_info (use_insn, REGNO (x));
+      /* DF and DI mode may use two registers.  */
+      if (GET_MODE_SIZE (GET_MODE (x)) == 8)
+	record_first_reg_use_info (use_insn, REGNO (x) + 1);
+    }
+
+  /* Visit the operands from last to first, as the format string tells.  */
+  for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'e')
+	record_first_use_info (&XEXP (x, i), data);
+      else if (fmt[i] == 'E')
+	for (int j = 0; j < XVECLEN (x, i); j++)
+	  record_first_use_info (&XVECEXP (x, i, j), data);
+    }
+}
+
+/* Record register first/block set information for REGNO in INSN.
+
+   first_set records the first place in the block where the register
+   is set and is used to compute "anticipatability".  last_bb records
+   the block for which first_set is valid, as a quick test to invalidate
+   them.  */
+
+static void
+record_first_reg_set_info (rtx_insn *insn, int regno)
+{
+  struct reg_avail_info *entry = &reg_avail_info[regno];
+  int luid = DF_INSN_LUID (insn);
+
+  /* Only the first reference to REGNO in a block is recorded.  */
+  if (entry->last_bb == BLOCK_FOR_INSN (insn))
+    return;
+
+  entry->last_bb = BLOCK_FOR_INSN (insn);
+  entry->first_set = luid;
+  /* The set came first: make first_use compare greater than first_set.  */
+  entry->first_use = luid + 1;
+}
+
+/* Called from compute_hash_table via note_stores to handle one
+   SET or CLOBBER in an insn.  DATA is really the instruction in which
+   the SET is taking place.  */
+
+static void
+record_first_set_info (rtx dest, const_rtx setter ATTRIBUTE_UNUSED, void *data)
+{
+  rtx_insn *set_insn = static_cast<rtx_insn *> (data);
+  /* Look through a SUBREG to the register being set.  */
+  rtx reg = GET_CODE (dest) == SUBREG ? SUBREG_REG (dest) : dest;
+
+  if (!REG_P (reg) || REGNO (reg) > SP_REGNUM)
+    return;
+
+  record_first_reg_set_info (set_insn, REGNO (reg));
+  /* DF and DI mode may use two registers.  */
+  if (GET_MODE_SIZE (GET_MODE (reg)) == 8)
+    record_first_reg_set_info (set_insn, REGNO (reg) + 1);
+}
+
+/* Build hash table for supported format instructions.
+   Only consider if the instruction is anticipatable in the basic block here.
+   We postpone the def-use check until hoisting.  */
+
+static void
+compute_hash_table (void)
+{
+  basic_block bb;
+  int i;
+
+  /* We only take care of hard registers up to SP_REGNUM.  */
+  reg_avail_info = XNEWVEC (struct reg_avail_info, SP_REGNUM + 1);
+
+  for (i = 0; i <= SP_REGNUM; i++)
+    reg_avail_info[i].last_bb = NULL;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+
+      /* Do not hoist from a block which has more than one predecessor.  */
+      if (EDGE_COUNT (bb->preds) > 1)
+	continue;
+
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+
+	  /* Caller-save barrier: we cannot hoist an instruction over a
+	     function call, which sets the caller-save registers.  */
+	  if (CALL_P (insn))
+	    {
+	      for (i = 0; i <= SP_REGNUM; i++)
+		if (call_used_regs[i])
+		  record_first_reg_use_info (insn, i);
+	    }
+
+	  note_uses (&PATTERN (insn), record_first_use_info, insn);
+	  note_stores (PATTERN (insn), record_first_set_info, insn);
+	}
+
+      /* Build the hash table.  */
+      FOR_BB_INSNS (bb, insn)
+	if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == SET)
+	  hash_scan_set (insn);
+    }
+
+  /* Nothing reads the table after the scan, so do not leak it.  */
+  free (reg_avail_info);
+  reg_avail_info = NULL;
+}
+
+/* Traverse callback: hoist the occurrences of *SLOT if possible.  */
+int
+nds32_find_gcse_expr_table (expr **slot, void *data ATTRIBUTE_UNUSED)
+{
+  struct expr *exprs = *slot;
+  struct occr *occr;
+  rtx_insn *insn = NULL;
+  rtx_insn *last_insn;
+  basic_block bb;
+  edge e;
+  unsigned ix;
+  unsigned emit_done;
+  unsigned cover, regno;
+  df_ref use;
+  enum machine_mode mode;
+
+  if (exprs->count < 2)
+    return 1;
+
+  bitmap_vector_clear (hoist_vbeout, last_basic_block_for_fn (cfun));
+  bitmap_vector_clear (hoist_vbein, last_basic_block_for_fn (cfun));
+
+  /* Mark each block that holds a live occurrence of this expression.  */
+  occr = exprs->antic_occr;
+  while (occr)
+    {
+      insn = occr->insn;
+      bb = BLOCK_FOR_INSN (insn);
+      if (!occr->deleted_p)
+	bitmap_set_bit (hoist_vbein[bb->index], 0);
+      occr = occr->next;
+    }
+
+  /* Try to hoist into each block whose successors all compute it.  */
+  FOR_EACH_BB_REVERSE_FN (bb, cfun)
+    {
+      if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (cfun))
+	bitmap_intersection_of_succs (hoist_vbeout[bb->index], hoist_vbein, bb);
+
+      if (bitmap_bit_p (hoist_vbeout[bb->index], 0)
+	  && EDGE_COUNT (bb->succs) > 1)
+	{
+	  emit_done = 0;
+	  cover = FALSE;
+	  for (e = NULL, ix = 0; ix < EDGE_COUNT (bb->succs); ix++)
+	    {
+	      e = EDGE_SUCC (bb, ix);
+	      if (e->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
+		continue;
+	      occr = exprs->antic_occr;
+	      while (occr)
+		{
+		  insn = occr->insn;
+		  if (!occr->deleted_p && e->dest == BLOCK_FOR_INSN (insn))
+		    break;
+		  occr = occr->next;
+		}
+
+	      gcc_assert (insn != NULL);
+
+	      if (!emit_done)
+		{
+		  last_insn = BB_END (bb);
+		  /* Check the defined register is not used (or clobbered)
+		     by the last instruction of the hoisting block.  */
+		  regno = REGNO (SET_DEST (PATTERN (insn)));
+		  mode = GET_MODE (SET_DEST (PATTERN (insn)));
+		  FOR_EACH_INSN_USE (use, last_insn)
+		    {
+		      if (DF_REF_REGNO (use) == regno
+			  || regno_clobbered_p (regno, last_insn, mode, 2))
+			{
+			  cover = TRUE;
+			  break;
+			}
+		    }
+
+		  /* TODO: support more formats.  */
+		  if (cover)
+		    break;
+		  else if (JUMP_P (last_insn))
+		    {
+		      emit_insn_before_noloc (PATTERN (insn), last_insn, bb);
+		      emit_done = TRUE;
+		    }
+		  else
+		    break;
+		}
+
+	      if (emit_done)
+		{
+		  delete_insn (insn);
+		  occr->deleted_p = TRUE;
+		}
+	    }
+	}
+    }
+  return 1;
+}
+
+/* Try to hoist every expression in the hash table.  Always returns 0.  */
+static int
+hoist_code (void)
+{
+  const int n_blocks = last_basic_block_for_fn (cfun);
+
+  hoist_vbein = sbitmap_vector_alloc (n_blocks, 1);
+  hoist_vbeout = sbitmap_vector_alloc (n_blocks, 1);
+  expr_table->traverse <void *, nds32_find_gcse_expr_table> (NULL);
+  sbitmap_vector_free (hoist_vbein);
+  sbitmap_vector_free (hoist_vbeout);
+  return 0;
+}
+
+
+static unsigned int
+nds32_gcse_opt (void)
+{
+  /* With at most one non-fixed basic block there is nothing to hoist.  */
+  if (n_basic_blocks_for_fn (cfun) <= NUM_FIXED_BLOCKS + 1)
+    return 0;
+  /* Allocate memory for this pass: the expression hash table and the
+     obstack that backs its entries.  */
+  alloc_mem ();
+
+  df_chain_add_problem (DF_DU_CHAIN);
+  df_insn_rescan_all ();
+  df_analyze ();
+
+  compute_hash_table ();
+
+  if (dump_file)
+    dump_hash_table (dump_file);
+
+  hoist_code ();
+
+  df_insn_rescan_all ();
+  free_mem ();
+  return 0;
+}
+
+const pass_data pass_data_nds32_gcse_opt =
+{
+  RTL_PASS,				/* type */
+  "gcse_opt",				/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  0,					/* todo_flags_finish */
+};
+
+class pass_nds32_gcse_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_gcse_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_gcse_opt, ctxt)
+  {}
+
+  /* opt_pass methods: gated on TARGET_GCSE_OPT, runs nds32_gcse_opt.  */
+  bool gate (function *) { return TARGET_GCSE_OPT; }
+  unsigned int execute (function *) { return nds32_gcse_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_gcse_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_gcse_opt (ctxt);
+}
+
+/* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-graywolf.md b/gcc/config/nds32/nds32-graywolf.md
new file mode 100644
index 0000000..f9ddbd8
--- /dev/null
+++ b/gcc/config/nds32/nds32-graywolf.md
@@ -0,0 +1,471 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; ------------------------------------------------------------------------
+;; Define Graywolf pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_graywolf_machine")
+
+(define_cpu_unit "gw_ii_0" "nds32_graywolf_machine")
+(define_cpu_unit "gw_ii_1" "nds32_graywolf_machine")
+(define_cpu_unit "gw_ex_p0" "nds32_graywolf_machine")
+(define_cpu_unit "gw_mm_p0" "nds32_graywolf_machine")
+(define_cpu_unit "gw_wb_p0" "nds32_graywolf_machine")
+(define_cpu_unit "gw_ex_p1" "nds32_graywolf_machine")
+(define_cpu_unit "gw_mm_p1" "nds32_graywolf_machine")
+(define_cpu_unit "gw_wb_p1" "nds32_graywolf_machine")
+(define_cpu_unit "gw_iq_p2" "nds32_graywolf_machine")
+(define_cpu_unit "gw_rf_p2" "nds32_graywolf_machine")
+(define_cpu_unit "gw_e1_p2" "nds32_graywolf_machine")
+(define_cpu_unit "gw_e2_p2" "nds32_graywolf_machine")
+(define_cpu_unit "gw_e3_p2" "nds32_graywolf_machine")
+(define_cpu_unit "gw_e4_p2" "nds32_graywolf_machine")
+
+(define_reservation "gw_ii" "gw_ii_0 | gw_ii_1")
+(define_reservation "gw_ex" "gw_ex_p0 | gw_ex_p1")
+(define_reservation "gw_mm" "gw_mm_p0 | gw_mm_p1")
+(define_reservation "gw_wb" "gw_wb_p0 | gw_wb_p1")
+
+(define_reservation "gw_ii_all" "gw_ii_0 + gw_ii_1")
+
+(define_insn_reservation "nds_gw_unknown" 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_misc" 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_mmu" 1
+  (and (eq_attr "type" "mmu")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_alu" 1
+  (and (and (eq_attr "type" "alu")
+            (match_test "!nds32::movd44_insn_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_movd44" 1
+  (and (and (eq_attr "type" "alu")
+            (match_test "nds32::movd44_insn_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_alu_shift" 1
+  (and (eq_attr "type" "alu_shift")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex*2, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_pbsad" 1
+  (and (eq_attr "type" "pbsad")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex*3, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_pbsada" 1
+  (and (eq_attr "type" "pbsada")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex*3, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_load" 1
+  (and (and (eq_attr "type" "load")
+            (match_test "!nds32::post_update_insn_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_2w" 1
+  (and (and (eq_attr "type" "load")
+            (match_test "nds32::post_update_insn_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store" 1
+  (and (and (eq_attr "type" "store")
+            (match_test "!nds32::store_offset_reg_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_3r" 1
+  (and (and (eq_attr "type" "store")
+            (match_test "nds32::store_offset_reg_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_1" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_2" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "2"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*2, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_3" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*3, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_4" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_5" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_6" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_7" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_8" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_load_multiple_12" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_1" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_2" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "2"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*2, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_3" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*3, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_4" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_5" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_6" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_7" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_8" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_store_multiple_12" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_1, gw_ex_p1*4, gw_mm_p1, gw_wb_p1")
+
+(define_insn_reservation "nds_gw_mul_fast1" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_FAST_1")
+       (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_mul_fast2" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_FAST_2")
+       (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_0, gw_ex_p0*2, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_mul_slow" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_0, gw_ex_p0*4, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_mac_fast1" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_FAST_1")
+       (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_all, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_mac_fast2" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_FAST_2")
+       (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_all, gw_ex_p0*2, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_mac_slow" 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "graywolf")))
+  "gw_ii_all, gw_ex_p0*4, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_div" 1
+  (and (and (eq_attr "type" "div")
+            (match_test "!nds32::divmod_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0*4, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_div_2w" 1
+  (and (and (eq_attr "type" "div")
+            (match_test "nds32::divmod_p (insn)"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p0*4, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_branch" 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_alu" 1
+  (and (eq_attr "type" "dalu")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ex, gw_mm, gw_wb")
+
+(define_insn_reservation "nds_gw_dsp_alu64" 1
+  (and (eq_attr "type" "dalu64")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_alu_round" 1
+  (and (eq_attr "type" "daluround")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_cmp" 1
+  (and (eq_attr "type" "dcmp")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_clip" 1
+  (and (eq_attr "type" "dclip")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_mul" 1
+  (and (eq_attr "type" "dmul")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_mac" 1
+  (and (eq_attr "type" "dmac")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_insb" 1
+  (and (eq_attr "type" "dinsb")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_pack" 1
+  (and (eq_attr "type" "dpack")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_bpick" 1
+  (and (eq_attr "type" "dbpick")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_0, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_dsp_wext" 1
+  (and (eq_attr "type" "dwext")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii_all, gw_ex_p0, gw_mm_p0, gw_wb_p0")
+
+(define_insn_reservation "nds_gw_fpu_alu" 4
+  (and (eq_attr "type" "falu")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_muls" 4
+  (and (eq_attr "type" "fmuls")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_muld" 4
+  (and (eq_attr "type" "fmuld")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2*2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_macs" 4
+  (and (eq_attr "type" "fmacs")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2*3, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_macd" 4
+  (and (eq_attr "type" "fmacd")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2*4, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_divs" 4
+  (and (ior (eq_attr "type" "fdivs")
+	    (eq_attr "type" "fsqrts"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2*14, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_divd" 4
+  (and (ior (eq_attr "type" "fdivd")
+	    (eq_attr "type" "fsqrtd"))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2*28, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_fast_alu" 2
+  (and (ior (eq_attr "type" "fcmp")
+	    (ior (eq_attr "type" "fabs")
+		 (ior (eq_attr "type" "fcpy")
+		      (eq_attr "type" "fcmov"))))
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_fmtsr" 1
+  (and (eq_attr "type" "fmtsr")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_fmtdr" 1
+  (and (eq_attr "type" "fmtdr")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ii+gw_iq_p2, gw_iq_p2+gw_rf_p2, gw_rf_p2+gw_e1_p2, gw_e1_p2+gw_e2_p2, gw_e2_p2+gw_e3_p2, gw_e3_p2+gw_e4_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_fmfsr" 1
+  (and (eq_attr "type" "fmfsr")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_fmfdr" 1
+  (and (eq_attr "type" "fmfdr")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_ii+gw_iq_p2, gw_iq_p2+gw_rf_p2, gw_rf_p2+gw_e1_p2, gw_e1_p2+gw_e2_p2, gw_e2_p2+gw_e3_p2, gw_e3_p2+gw_e4_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_load" 3
+  (and (eq_attr "type" "fload")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+(define_insn_reservation "nds_gw_fpu_store" 1
+  (and (eq_attr "type" "fstore")
+       (eq_attr "pipeline_model" "graywolf"))
+  "gw_ii, gw_iq_p2, gw_rf_p2, gw_e1_p2, gw_e2_p2, gw_e3_p2, gw_e4_p2")
+
+;; FPU_ADDR_OUT -> FPU_ADDR_IN
+;; Main pipeline rules don't need this because their default latency is 1.
+(define_bypass 1
+  "nds_gw_fpu_load, nds_gw_fpu_store"
+  "nds_gw_fpu_load, nds_gw_fpu_store"
+  "nds32_gw_ex_to_ex_p"
+)
+
+;; LD, MUL, MAC, DIV, DALU64, DMUL, DMAC, DALUROUND, DBPICK, DWEXT
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU,
+;;      DALU, DALUROUND, DMUL, DMAC_RaRb, DPACK, DINSB, DCMP, DCLIP, WEXT_O, BPICK_RaRb
+(define_bypass 2
+  "nds_gw_load, nds_gw_load_2w,\
+   nds_gw_mul_fast1, nds_gw_mul_fast2, nds_gw_mul_slow,\
+   nds_gw_mac_fast1, nds_gw_mac_fast2, nds_gw_mac_slow,\
+   nds_gw_div, nds_gw_div_2w,\
+   nds_gw_dsp_alu64, nds_gw_dsp_mul, nds_gw_dsp_mac,\
+   nds_gw_dsp_alu_round, nds_gw_dsp_bpick, nds_gw_dsp_wext"
+  "nds_gw_alu, nds_gw_movd44, nds_gw_alu_shift,\
+   nds_gw_pbsad, nds_gw_pbsada,\
+   nds_gw_mul_fast1, nds_gw_mul_fast2, nds_gw_mul_slow,\
+   nds_gw_mac_fast1, nds_gw_mac_fast2, nds_gw_mac_slow,\
+   nds_gw_branch,\
+   nds_gw_div, nds_gw_div_2w,\
+   nds_gw_load, nds_gw_load_2w, nds_gw_store, nds_gw_store_3r,\
+   nds_gw_load_multiple_1,nds_gw_load_multiple_2, nds_gw_load_multiple_3,\
+   nds_gw_load_multiple_4,nds_gw_load_multiple_5, nds_gw_load_multiple_6,\
+   nds_gw_load_multiple_7,nds_gw_load_multiple_8, nds_gw_load_multiple_12,\
+   nds_gw_store_multiple_1,nds_gw_store_multiple_2, nds_gw_store_multiple_3,\
+   nds_gw_store_multiple_4,nds_gw_store_multiple_5, nds_gw_store_multiple_6,\
+   nds_gw_store_multiple_7,nds_gw_store_multiple_8, nds_gw_store_multiple_12,\
+   nds_gw_mmu,\
+   nds_gw_dsp_alu, nds_gw_dsp_alu_round,\
+   nds_gw_dsp_mul, nds_gw_dsp_mac, nds_gw_dsp_pack,\
+   nds_gw_dsp_insb, nds_gw_dsp_cmp, nds_gw_dsp_clip,\
+   nds_gw_dsp_wext, nds_gw_dsp_bpick"
+  "nds32_gw_mm_to_ex_p"
+)
+
+;; LMW(N, N)
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU
+;;      DALU, DALUROUND, DMUL, DMAC_RaRb, DPACK, DINSB, DCMP, DCLIP, WEXT_O, BPICK_RaRb
+(define_bypass 2
+  "nds_gw_load_multiple_1,nds_gw_load_multiple_2, nds_gw_load_multiple_3,\
+   nds_gw_load_multiple_4,nds_gw_load_multiple_5, nds_gw_load_multiple_6,\
+   nds_gw_load_multiple_7,nds_gw_load_multiple_8, nds_gw_load_multiple_12"
+  "nds_gw_alu, nds_gw_movd44, nds_gw_alu_shift,\
+   nds_gw_pbsad, nds_gw_pbsada,\
+   nds_gw_mul_fast1, nds_gw_mul_fast2, nds_gw_mul_slow,\
+   nds_gw_mac_fast1, nds_gw_mac_fast2, nds_gw_mac_slow,\
+   nds_gw_branch,\
+   nds_gw_div, nds_gw_div_2w,\
+   nds_gw_load, nds_gw_load_2w, nds_gw_store, nds_gw_store_3r,\
+   nds_gw_load_multiple_1,nds_gw_load_multiple_2, nds_gw_load_multiple_3,\
+   nds_gw_load_multiple_4,nds_gw_load_multiple_5, nds_gw_load_multiple_6,\
+   nds_gw_load_multiple_7,nds_gw_load_multiple_8, nds_gw_load_multiple_12,\
+   nds_gw_store_multiple_1,nds_gw_store_multiple_2, nds_gw_store_multiple_3,\
+   nds_gw_store_multiple_4,nds_gw_store_multiple_5, nds_gw_store_multiple_6,\
+   nds_gw_store_multiple_7,nds_gw_store_multiple_8, nds_gw_store_multiple_12,\
+   nds_gw_mmu,\
+   nds_gw_dsp_alu, nds_gw_dsp_alu_round,\
+   nds_gw_dsp_mul, nds_gw_dsp_mac, nds_gw_dsp_pack,\
+   nds_gw_dsp_insb, nds_gw_dsp_cmp, nds_gw_dsp_clip,\
+   nds_gw_dsp_wext, nds_gw_dsp_bpick"
+  "nds32_gw_last_load_to_ex_p"
+)
diff --git a/gcc/config/nds32/nds32-intrinsic.c b/gcc/config/nds32/nds32-intrinsic.c
index fabf262..7547fb1 100644
--- a/gcc/config/nds32/nds32-intrinsic.c
+++ b/gcc/config/nds32/nds32-intrinsic.c
@@ -24,210 +24,1867 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
-#include "optabs.h"		/* For GEN_FCN.  */
-#include "diagnostic-core.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
 #include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
 #include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
 #include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"

 /* ------------------------------------------------------------------------ */

-/* Function to expand builtin function for
-   '[(unspec_volatile [(reg)])]'.  */
+/* Read the requested argument from the EXP given by INDEX.
+   Return the value as an rtx.  */
+static rtx
+nds32_read_argument (tree exp, unsigned int index)
+{
+  return expand_normal (CALL_EXPR_ARG (exp, index));
+}
+
+/* Return a legitimate rtx for instruction ICODE's return value.  Use TARGET
+   if it's not null, has the right mode, and satisfies operand 0's
+   predicate.  */
+static rtx
+nds32_legitimize_target (enum insn_code icode, rtx target)
+{
+  enum machine_mode mode = insn_data[icode].operand[0].mode;
+
+  if (! target
+      || GET_MODE (target) != mode
+      || ! (*insn_data[icode].operand[0].predicate) (target, mode))
+    return gen_reg_rtx (mode);
+  else
+    return target;
+}
+
+/* Given that ARG is being passed as operand OPNUM to instruction ICODE,
+   check whether ARG satisfies the operand's constraints.  If it doesn't,
+   copy ARG to a temporary register and return that.  Otherwise return ARG
+   itself.  */
 static rtx
-nds32_expand_builtin_null_ftype_reg (enum insn_code icode,
-				     tree exp, rtx target)
+nds32_legitimize_argument (enum insn_code icode, int opnum, rtx arg)
+{
+  enum machine_mode mode = insn_data[icode].operand[opnum].mode;
+
+  if ((*insn_data[icode].operand[opnum].predicate) (arg, mode))
+    return arg;
+  else if (VECTOR_MODE_P (mode) && CONST_INT_P (arg))
+    {
+      /* Convert a CONST_INT into the equivalent CONST_VECTOR.  */
+      int nunits = GET_MODE_NUNITS (mode);
+      int i, shift = 0;
+      rtvec v = rtvec_alloc (nunits);
+      int val = INTVAL (arg);
+      enum machine_mode val_mode = (mode == V4QImode) ? QImode : HImode;
+      int shift_acc = (val_mode == QImode) ? 8 : 16;
+      int mask = (val_mode == QImode) ? 0xff : 0xffff;
+      int tmp_val = val;
+
+      if (TARGET_BIG_ENDIAN)
+	for (i = 0; i < nunits; i++)
+	  {
+	    tmp_val = (val >> shift) & mask;
+	    RTVEC_ELT (v, nunits - i - 1) = gen_int_mode (tmp_val, val_mode);
+	    shift += shift_acc;
+	  }
+      else
+	for (i = 0; i < nunits; i++)
+	  {
+	    tmp_val = (val >> shift) & mask;
+	    RTVEC_ELT (v, i) = gen_int_mode (tmp_val, val_mode);
+	    shift += shift_acc;
+	  }
+
+      return copy_to_mode_reg (mode, gen_rtx_CONST_VECTOR (mode, v));
+    }
+  else
+    {
+      rtx tmp_rtx = gen_reg_rtx (mode);
+      convert_move (tmp_rtx, arg, false);
+      return tmp_rtx;
+    }
+}
+
+/* Return true if OPVAL can be used for operand OPNUM of instruction ICODE.
+   The instruction should require a constant operand of some sort.  The
+   function prints an error if OPVAL is not valid.  */
+static int
+nds32_check_constant_argument (enum insn_code icode, int opnum, rtx opval,
+			       const char *name)
 {
-  /* Mapping:
-       ops[0] <--> value0 <--> arg0 */
-  struct expand_operand ops[1];
-  tree arg0;
-  rtx value0;
+  if (GET_CODE (opval) != CONST_INT)
+    {
+      error ("invalid argument to built-in function %s", name);
+      return false;
+    }
+  if (! (*insn_data[icode].operand[opnum].predicate) (opval, VOIDmode))
+    {
+      error ("constant argument out of range for %s", name);
+
+      return false;
+    }
+  return true;
+}

-  /* Grab the incoming arguments and extract its rtx.  */
-  arg0 = CALL_EXPR_ARG (exp, 0);
-  value0 = expand_normal (arg0);
+/* Expand builtins that return target.  */
+static rtx
+nds32_expand_noarg_builtin (enum insn_code icode, rtx target)
+{
+  rtx pat;

-  /* Create operands.  */
-  create_input_operand (&ops[0], value0, TYPE_MODE (TREE_TYPE (arg0)));
+  target = nds32_legitimize_target (icode, target);

-  /* Emit new instruction.  */
-  if (!maybe_expand_insn (icode, 1, ops))
-    error ("invalid argument to built-in function");
+  /* Emit and return the new instruction. */
+  pat = GEN_FCN (icode) (target);
+  if (! pat)
+    return NULL_RTX;

+  emit_insn (pat);
   return target;
 }

-/* Function to expand builtin function for
-   '[(set (reg) (unspec_volatile [(imm)]))]'.  */
+/* Expand builtins that take one operand.  */
 static rtx
-nds32_expand_builtin_reg_ftype_imm (enum insn_code icode,
-				    tree exp, rtx target)
+nds32_expand_unop_builtin (enum insn_code icode, tree exp, rtx target,
+			   bool return_p)
 {
-  /* Mapping:
-       ops[0] <--> target <--> exp
-       ops[1] <--> value0 <--> arg0 */
-  struct expand_operand ops[2];
-  tree arg0;
-  rtx value0;
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  int op0_num = return_p ? 1 : 0;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);

-  /* Grab the incoming arguments and extract its rtx.  */
-  arg0 = CALL_EXPR_ARG (exp, 0);
-  value0 = expand_normal (arg0);
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);

-  /* Create operands.  */
-  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (exp)));
-  create_input_operand (&ops[1], value0, TYPE_MODE (TREE_TYPE (arg0)));
+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0);
+  else
+    pat = GEN_FCN (icode) (op0);

-  /* Emit new instruction.  */
-  if (!maybe_expand_insn (icode, 2, ops))
-    error ("invalid argument to built-in function");
+  if (! pat)
+    return NULL_RTX;

+  emit_insn (pat);
   return target;
 }

-/* Function to expand builtin function for
-   '[(unspec_volatile [(reg) (imm)])]' pattern.  */
+/* Expand builtins that take one operand, which must be an immediate.  */
 static rtx
-nds32_expand_builtin_null_ftype_reg_imm (enum insn_code icode,
-					 tree exp, rtx target)
-{
-  /* Mapping:
-       ops[0] <--> value0 <--> arg0
-       ops[1] <--> value1 <--> arg1 */
-  struct expand_operand ops[2];
-  tree arg0, arg1;
-  rtx value0, value1;
-
-  /* Grab the incoming arguments and extract its rtx.  */
-  arg0 = CALL_EXPR_ARG (exp, 0);
-  arg1 = CALL_EXPR_ARG (exp, 1);
-  value0 = expand_normal (arg0);
-  value1 = expand_normal (arg1);
-
-  /* Create operands.  */
-  create_input_operand (&ops[0], value0, TYPE_MODE (TREE_TYPE (arg0)));
-  create_input_operand (&ops[1], value1, TYPE_MODE (TREE_TYPE (arg1)));
-
-  /* Emit new instruction.  */
-  if (!maybe_expand_insn (icode, 2, ops))
-    error ("invalid argument to built-in function");
+nds32_expand_unopimm_builtin (enum insn_code icode, tree exp, rtx target,
+			      bool return_p, const char *name)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  int op0_num = return_p ? 1 : 0;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  if (!nds32_check_constant_argument (icode, op0_num, op0, name))
+    return NULL_RTX;
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);

+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0);
+  else
+    pat = GEN_FCN (icode) (op0);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
   return target;
 }

-/* ------------------------------------------------------------------------ */
+/* Expand builtins that take two operands.  */
+static rtx
+nds32_expand_binop_builtin (enum insn_code icode, tree exp, rtx target,
+			    bool return_p)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;

-void
-nds32_init_builtins_impl (void)
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);
+
+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1);
+  else
+    pat = GEN_FCN (icode) (op0, op1);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand builtins that take two operands and the second is immediate.  */
+static rtx
+nds32_expand_binopimm_builtin (enum insn_code icode, tree exp, rtx target,
+			       bool return_p, const char *name)
 {
-  tree pointer_type_node  = build_pointer_type (integer_type_node);
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;

-  tree void_ftype_void    = build_function_type (void_type_node,
-						 void_list_node);
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);

-  tree void_ftype_pint    = build_function_type_list (void_type_node,
-						      pointer_type_node,
-						      NULL_TREE);
+  if (!nds32_check_constant_argument (icode, op1_num, op1, name))
+    return NULL_RTX;

-  tree int_ftype_int      = build_function_type_list (integer_type_node,
-						      integer_type_node,
-						      NULL_TREE);
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);

-  tree void_ftype_int_int = build_function_type_list (void_type_node,
-						      integer_type_node,
-						      integer_type_node,
-						      NULL_TREE);
+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1);
+  else
+    pat = GEN_FCN (icode) (op0, op1);

-  /* Cache.  */
-  add_builtin_function ("__builtin_nds32_isync",  void_ftype_pint,
-			NDS32_BUILTIN_ISYNC,
-			BUILT_IN_MD, NULL, NULL_TREE);
-  add_builtin_function ("__builtin_nds32_isb",  void_ftype_void,
-			NDS32_BUILTIN_ISB,
-			BUILT_IN_MD, NULL, NULL_TREE);
+  if (! pat)
+    return NULL_RTX;

-  /* Register Transfer.  */
-  add_builtin_function ("__builtin_nds32_mfsr",  int_ftype_int,
-			NDS32_BUILTIN_MFSR,
-			BUILT_IN_MD, NULL, NULL_TREE);
-  add_builtin_function ("__builtin_nds32_mfusr", int_ftype_int,
-			NDS32_BUILTIN_MFUSR,
-			BUILT_IN_MD, NULL, NULL_TREE);
-  add_builtin_function ("__builtin_nds32_mtsr",  void_ftype_int_int,
-			NDS32_BUILTIN_MTSR,
-			BUILT_IN_MD, NULL, NULL_TREE);
-  add_builtin_function ("__builtin_nds32_mtusr", void_ftype_int_int,
-			NDS32_BUILTIN_MTUSR,
-			BUILT_IN_MD, NULL, NULL_TREE);
+  emit_insn (pat);
+  return target;
+}

-  /* Interrupt.  */
-  add_builtin_function ("__builtin_nds32_setgie_en",  void_ftype_void,
-			NDS32_BUILTIN_SETGIE_EN,
-			BUILT_IN_MD, NULL, NULL_TREE);
-  add_builtin_function ("__builtin_nds32_setgie_dis", void_ftype_void,
-			NDS32_BUILTIN_SETGIE_DIS,
-			BUILT_IN_MD, NULL, NULL_TREE);
+/* Expand builtins that take three operands.  */
+static rtx
+nds32_expand_triop_builtin (enum insn_code icode, tree exp, rtx target,
+			    bool return_p)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  rtx op2 = nds32_read_argument (exp, 2);
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;
+  int op2_num = return_p ? 3 : 2;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);
+  op2 = nds32_legitimize_argument (icode, op2_num, op2);
+
+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1, op2);
+  else
+    pat = GEN_FCN (icode) (op0, op1, op2);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand builtins that take three operands and the third is immediate.  */
+static rtx
+nds32_expand_triopimm_builtin (enum insn_code icode, tree exp, rtx target,
+			       bool return_p, const char *name)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  rtx op2 = nds32_read_argument (exp, 2);
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;
+  int op2_num = return_p ? 3 : 2;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  if (!nds32_check_constant_argument (icode, op2_num, op2, name))
+    return NULL_RTX;
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);
+  op2 = nds32_legitimize_argument (icode, op2_num, op2);
+
+  /* Emit and return the new instruction. */
+  if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1, op2);
+  else
+    pat = GEN_FCN (icode) (op0, op1, op2);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand builtins for load.  */
+static rtx
+nds32_expand_builtin_load (enum insn_code icode, tree exp, rtx target)
+{
+  /* Load address format is [$ra + $rb],
+     but the builtin call supplies no $rb,
+     so we need another temp register, set to 0, as $rb.
+     Generating assembly code:
+       movi $temp, 0
+       llw  $rt, [$ra + $temp] */
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx addr_helper = gen_reg_rtx (insn_data[icode].operand[1].mode);
+
+  target = nds32_legitimize_target (icode, target);
+  op0 = nds32_legitimize_argument (icode, 1, op0);
+
+  /* Emit and return the new instruction. */
+  pat = GEN_FCN (icode) (target, op0, addr_helper);
+  if (!pat)
+    return NULL_RTX;
+
+  emit_move_insn (addr_helper, GEN_INT (0));
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand builtins for store.  */
+static rtx
+nds32_expand_builtin_store (enum insn_code icode, tree exp, rtx target)
+{
+  /* Store address format is [$ra + $rb],
+     but the builtin call supplies no $rb,
+     so we need another temp register, set to 0, as $rb.
+     Generating assembly code:
+       movi $temp, 0
+       store  $rt, [$ra + $temp] */
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  rtx addr_helper = gen_reg_rtx (insn_data[icode].operand[1].mode);
+
+  op0 = nds32_legitimize_argument (icode, 0, op0);
+  op1 = nds32_legitimize_argument (icode, 2, op1);
+
+  /* Emit and return the new instruction. */
+  pat = GEN_FCN (icode) (op0, addr_helper, op1);
+  if (! pat)
+    return NULL_RTX;
+
+  emit_move_insn (addr_helper, GEN_INT (0));
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand cctl builtins.  */
+static rtx
+nds32_expand_cctl_builtin (enum insn_code icode, tree exp, rtx target,
+			   bool return_p, const char *name)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  int op0_num = return_p ? 1 : 0;
+  int op1_num = return_p ? 2 : 1;
+
+  if (return_p)
+    target = nds32_legitimize_target (icode, target);
+
+  if (!nds32_check_constant_argument (icode, op0_num, op0, name))
+    return NULL_RTX;
+
+  op0 = nds32_legitimize_argument (icode, op0_num, op0);
+  op1 = nds32_legitimize_argument (icode, op1_num, op1);
+
+  /* Emit and return the new instruction. */
+  if (icode == CODE_FOR_cctl_idx_write)
+    {
+      /* cctl_idx_write takes three arguments,
+	 so create operand 2 for the cctl_idx_write pattern.  */
+      rtx op2 = nds32_read_argument (exp, 2);
+      op2 = nds32_legitimize_argument (icode, 2, op2);
+      pat = GEN_FCN (icode) (op0, op1, op2);
+    }
+  else if (return_p)
+    pat = GEN_FCN (icode) (target, op0, op1);
+  else
+    pat = GEN_FCN (icode) (op0, op1);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Expand scw builtins.  */
+static rtx
+nds32_expand_scw_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  /* SCW address format is [$ra + $rb], but the builtin call supplies no $rb,
+     so we need another temp register, set to 0, as $rb.
+     Generating assembly code:
+	movi $temp, 0
+	scw  $rt, [$ra + $temp] */
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+  rtx addr_helper = gen_reg_rtx (insn_data[icode].operand[1].mode);
+
+  target = nds32_legitimize_target (icode, target);
+  op0 = nds32_legitimize_argument (icode, 1, op0);
+  op1 = nds32_legitimize_argument (icode, 2, op1);
+
+  /* Emit and return the new instruction. */
+  pat = GEN_FCN (icode) (target, op0, addr_helper, target);
+
+  if (!pat)
+    return NULL_RTX;
+
+  emit_move_insn (addr_helper, GEN_INT (0));
+  emit_move_insn (target, op1);
+  emit_insn (pat);
+  return target;
 }

+/* Expand set int priority builtins. */
+static rtx
+nds32_expand_priority_builtin (enum insn_code icode, tree exp, rtx target,
+			       const char *name)
+{
+  rtx pat;
+  rtx op0 = nds32_read_argument (exp, 0);
+  rtx op1 = nds32_read_argument (exp, 1);
+
+  /* Both arguments of the set_int_priority intrinsic must be immediates,
+     so check that each argument is a constant.  */
+
+  if (!nds32_check_constant_argument (icode, 0, op0, name))
+    return NULL_RTX;
+
+  if (!nds32_check_constant_argument (icode, 1, op1, name))
+    return NULL_RTX;
+
+  op0 = nds32_legitimize_argument (icode, 0, op0);
+  op1 = nds32_legitimize_argument (icode, 1, op1);
+
+  /* Emit and return the new instruction. */
+  pat = GEN_FCN (icode) (op0, op1);
+
+  if (! pat)
+    return NULL_RTX;
+
+  emit_insn (pat);
+  return target;
+}
+
+struct builtin_description
+{
+  const enum insn_code icode;
+  const char *name;
+  enum nds32_builtins code;
+  bool return_p;
+};
+
+#define NDS32_BUILTIN(code, string, builtin) \
+  { CODE_FOR_##code, "__nds32__" string, \
+    NDS32_BUILTIN_##builtin, true },
+
+#define NDS32_NO_TARGET_BUILTIN(code, string, builtin) \
+  { CODE_FOR_##code, "__nds32__" string, \
+    NDS32_BUILTIN_##builtin, false },
+
+/* Intrinsics that take no argument; most of them return a value.  */
+static struct builtin_description bdesc_noarg[] =
+{
+  NDS32_BUILTIN(unspec_fmfcfg, "fmfcfg", FMFCFG)
+  NDS32_BUILTIN(unspec_fmfcsr, "fmfcsr", FMFCSR)
+  NDS32_BUILTIN(unspec_volatile_rdov, "rdov", RDOV)
+  NDS32_BUILTIN(unspec_get_current_sp, "get_current_sp", GET_CURRENT_SP)
+  NDS32_BUILTIN(unspec_return_address, "return_address", RETURN_ADDRESS)
+  NDS32_BUILTIN(unspec_get_all_pending_int, "get_all_pending_int",
+		GET_ALL_PENDING_INT)
+  NDS32_BUILTIN(unspec_unaligned_feature, "unaligned_feature",
+		UNALIGNED_FEATURE)
+  NDS32_NO_TARGET_BUILTIN(unspec_enable_unaligned, "enable_unaligned",
+			  ENABLE_UNALIGNED)
+  NDS32_NO_TARGET_BUILTIN(unspec_disable_unaligned, "disable_unaligned",
+			  DISABLE_UNALIGNED)
+};
+
+/* Intrinsics that take just one argument.  */
+static struct builtin_description bdesc_1arg[] =
+{
+  NDS32_BUILTIN(unspec_ssabssi2, "abs", ABS)
+  NDS32_BUILTIN(clzsi2, "clz", CLZ)
+  NDS32_BUILTIN(unspec_clo, "clo", CLO)
+  NDS32_BUILTIN(unspec_wsbh, "wsbh", WSBH)
+  NDS32_BUILTIN(unspec_tlbop_pb, "tlbop_pb",TLBOP_PB)
+  NDS32_BUILTIN(unaligned_load_hw, "unaligned_load_hw", UALOAD_HW)
+  NDS32_BUILTIN(unaligned_loadsi, "unaligned_load_w", UALOAD_W)
+  NDS32_BUILTIN(unaligned_loaddi, "unaligned_load_dw", UALOAD_DW)
+  NDS32_NO_TARGET_BUILTIN(unspec_volatile_isync, "isync", ISYNC)
+  NDS32_NO_TARGET_BUILTIN(unspec_fmtcsr, "fmtcsr", FMTCSR)
+  NDS32_NO_TARGET_BUILTIN(unspec_jr_itoff, "jr_itoff", JR_ITOFF)
+  NDS32_NO_TARGET_BUILTIN(unspec_jr_toff, "jr_toff", JR_TOFF)
+  NDS32_NO_TARGET_BUILTIN(unspec_jral_ton, "jral_ton", JRAL_TON)
+  NDS32_NO_TARGET_BUILTIN(unspec_ret_toff, "ret_toff", RET_TOFF)
+  NDS32_NO_TARGET_BUILTIN(unspec_jral_iton, "jral_iton",JRAL_ITON)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_trd, "tlbop_trd", TLBOP_TRD)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_twr, "tlbop_twr", TLBOP_TWR)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_rwr, "tlbop_rwr", TLBOP_RWR)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_rwlk, "tlbop_rwlk", TLBOP_RWLK)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_unlk, "tlbop_unlk", TLBOP_UNLK)
+  NDS32_NO_TARGET_BUILTIN(unspec_tlbop_inv, "tlbop_inv", TLBOP_INV)
+  NDS32_NO_TARGET_BUILTIN(unspec_ret_itoff, "ret_itoff", RET_ITOFF)
+  NDS32_NO_TARGET_BUILTIN(unspec_set_current_sp,
+			  "set_current_sp", SET_CURRENT_SP)
+  NDS32_BUILTIN(kabsv2hi2, "kabs16", KABS16)
+  NDS32_BUILTIN(kabsv2hi2, "v_kabs16", V_KABS16)
+  NDS32_BUILTIN(kabsv4qi2, "kabs8", KABS8)
+  NDS32_BUILTIN(kabsv4qi2, "v_kabs8", V_KABS8)
+  NDS32_BUILTIN(sunpkd810, "sunpkd810", SUNPKD810)
+  NDS32_BUILTIN(sunpkd810, "v_sunpkd810", V_SUNPKD810)
+  NDS32_BUILTIN(sunpkd820, "sunpkd820", SUNPKD820)
+  NDS32_BUILTIN(sunpkd820, "v_sunpkd820", V_SUNPKD820)
+  NDS32_BUILTIN(sunpkd830, "sunpkd830", SUNPKD830)
+  NDS32_BUILTIN(sunpkd830, "v_sunpkd830", V_SUNPKD830)
+  NDS32_BUILTIN(sunpkd831, "sunpkd831", SUNPKD831)
+  NDS32_BUILTIN(sunpkd831, "v_sunpkd831", V_SUNPKD831)
+  NDS32_BUILTIN(zunpkd810, "zunpkd810", ZUNPKD810)
+  NDS32_BUILTIN(zunpkd810, "v_zunpkd810", V_ZUNPKD810)
+  NDS32_BUILTIN(zunpkd820, "zunpkd820", ZUNPKD820)
+  NDS32_BUILTIN(zunpkd820, "v_zunpkd820", V_ZUNPKD820)
+  NDS32_BUILTIN(zunpkd830, "zunpkd830", ZUNPKD830)
+  NDS32_BUILTIN(zunpkd830, "v_zunpkd830", V_ZUNPKD830)
+  NDS32_BUILTIN(zunpkd831, "zunpkd831", ZUNPKD831)
+  NDS32_BUILTIN(zunpkd831, "v_zunpkd831", V_ZUNPKD831)
+  NDS32_BUILTIN(unspec_kabs, "kabs", KABS)
+  NDS32_BUILTIN(unaligned_loadv2hi, "get_unaligned_u16x2", UALOAD_U16)
+  NDS32_BUILTIN(unaligned_loadv2hi, "get_unaligned_s16x2", UALOAD_S16)
+  NDS32_BUILTIN(unaligned_loadv4qi, "get_unaligned_u8x4", UALOAD_U8)
+  NDS32_BUILTIN(unaligned_loadv4qi, "get_unaligned_s8x4", UALOAD_S8)
+};
+
+/* Intrinsics that take exactly one argument, which is an immediate.  */
+static struct builtin_description bdesc_1argimm[] =
+{
+  NDS32_BUILTIN(unspec_volatile_mfsr, "mfsr", MFSR)
+  NDS32_BUILTIN(unspec_volatile_mfusr, "mfusr", MFUSR)
+  NDS32_BUILTIN(unspec_get_pending_int, "get_pending_int", GET_PENDING_INT)
+  NDS32_BUILTIN(unspec_get_int_priority, "get_int_priority", GET_INT_PRIORITY)
+  NDS32_NO_TARGET_BUILTIN(unspec_trap, "trap", TRAP)
+  NDS32_NO_TARGET_BUILTIN(unspec_break, "break", BREAK)
+  NDS32_NO_TARGET_BUILTIN(unspec_syscall, "syscall", SYSCALL)
+  NDS32_NO_TARGET_BUILTIN(unspec_enable_int, "enable_int", ENABLE_INT)
+  NDS32_NO_TARGET_BUILTIN(unspec_disable_int, "disable_int", DISABLE_INT)
+  NDS32_NO_TARGET_BUILTIN(unspec_clr_pending_hwint, "clr_pending_hwint",
+			  CLR_PENDING_HWINT)
+  NDS32_NO_TARGET_BUILTIN(unspec_set_trig_level, "set_trig_level",
+			  SET_TRIG_LEVEL)
+  NDS32_NO_TARGET_BUILTIN(unspec_set_trig_edge, "set_trig_edge",
+			  SET_TRIG_EDGE)
+  NDS32_BUILTIN(unspec_get_trig_type, "get_trig_type", GET_TRIG_TYPE)
+};
+
+/* Intrinsics that take two arguments.  */
+static struct builtin_description bdesc_2arg[] =
+{
+  NDS32_BUILTIN(unspec_fcpynss, "fcpynss", FCPYNSS)
+  NDS32_BUILTIN(unspec_fcpyss, "fcpyss", FCPYSS)
+  NDS32_BUILTIN(unspec_fcpynsd, "fcpynsd", FCPYNSD)
+  NDS32_BUILTIN(unspec_fcpysd, "fcpysd", FCPYSD)
+  NDS32_BUILTIN(unspec_ave, "ave", AVE)
+  NDS32_BUILTIN(unspec_pbsad, "pbsad", PBSAD)
+  NDS32_BUILTIN(unspec_ffb, "ffb", FFB)
+  NDS32_BUILTIN(unspec_ffmism, "ffmsim", FFMISM)
+  NDS32_BUILTIN(unspec_flmism, "flmism", FLMISM)
+  NDS32_BUILTIN(unspec_kaddw, "kaddw", KADDW)
+  NDS32_BUILTIN(unspec_kaddh, "kaddh", KADDH)
+  NDS32_BUILTIN(unspec_ksubw, "ksubw", KSUBW)
+  NDS32_BUILTIN(unspec_ksubh, "ksubh", KSUBH)
+  NDS32_BUILTIN(unspec_kdmbb, "kdmbb", KDMBB)
+  NDS32_BUILTIN(unspec_kdmbb, "v_kdmbb", V_KDMBB)
+  NDS32_BUILTIN(unspec_kdmbt, "kdmbt", KDMBT)
+  NDS32_BUILTIN(unspec_kdmbt, "v_kdmbt", V_KDMBT)
+  NDS32_BUILTIN(unspec_kdmtb, "kdmtb", KDMTB)
+  NDS32_BUILTIN(unspec_kdmtb, "v_kdmtb", V_KDMTB)
+  NDS32_BUILTIN(unspec_kdmtt, "kdmtt", KDMTT)
+  NDS32_BUILTIN(unspec_kdmtt, "v_kdmtt", V_KDMTT)
+  NDS32_BUILTIN(unspec_khmbb, "khmbb", KHMBB)
+  NDS32_BUILTIN(unspec_khmbb, "v_khmbb", V_KHMBB)
+  NDS32_BUILTIN(unspec_khmbt, "khmbt", KHMBT)
+  NDS32_BUILTIN(unspec_khmbt, "v_khmbt", V_KHMBT)
+  NDS32_BUILTIN(unspec_khmtb, "khmtb", KHMTB)
+  NDS32_BUILTIN(unspec_khmtb, "v_khmtb", V_KHMTB)
+  NDS32_BUILTIN(unspec_khmtt, "khmtt", KHMTT)
+  NDS32_BUILTIN(unspec_khmtt, "v_khmtt", V_KHMTT)
+  NDS32_BUILTIN(unspec_kslraw, "kslraw", KSLRAW)
+  NDS32_BUILTIN(unspec_kslrawu, "kslraw_u", KSLRAW_U)
+  NDS32_BUILTIN(rotrsi3, "rotr", ROTR)
+  NDS32_BUILTIN(unspec_sva, "sva", SVA)
+  NDS32_BUILTIN(unspec_svs, "svs", SVS)
+  NDS32_NO_TARGET_BUILTIN(mtsr_isb, "mtsr_isb", MTSR_ISB)
+  NDS32_NO_TARGET_BUILTIN(mtsr_dsb, "mtsr_dsb", MTSR_DSB)
+  NDS32_NO_TARGET_BUILTIN(unspec_volatile_mtsr, "mtsr", MTSR)
+  NDS32_NO_TARGET_BUILTIN(unspec_volatile_mtusr, "mtusr", MTUSR)
+  NDS32_NO_TARGET_BUILTIN(unaligned_store_hw, "unaligned_store_hw", UASTORE_HW)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storesi, "unaligned_store_hw", UASTORE_W)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storedi, "unaligned_store_hw", UASTORE_DW)
+  NDS32_BUILTIN(addv2hi3, "add16", ADD16)
+  NDS32_BUILTIN(addv2hi3, "v_uadd16", V_UADD16)
+  NDS32_BUILTIN(addv2hi3, "v_sadd16", V_SADD16)
+  NDS32_BUILTIN(raddv2hi3, "radd16", RADD16)
+  NDS32_BUILTIN(raddv2hi3, "v_radd16", V_RADD16)
+  NDS32_BUILTIN(uraddv2hi3, "uradd16", URADD16)
+  NDS32_BUILTIN(uraddv2hi3, "v_uradd16", V_URADD16)
+  NDS32_BUILTIN(kaddv2hi3, "kadd16", KADD16)
+  NDS32_BUILTIN(kaddv2hi3, "v_kadd16", V_KADD16)
+  NDS32_BUILTIN(ukaddv2hi3, "ukadd16", UKADD16)
+  NDS32_BUILTIN(ukaddv2hi3, "v_ukadd16", V_UKADD16)
+  NDS32_BUILTIN(subv2hi3, "sub16", SUB16)
+  NDS32_BUILTIN(subv2hi3, "v_usub16", V_USUB16)
+  NDS32_BUILTIN(subv2hi3, "v_ssub16", V_SSUB16)
+  NDS32_BUILTIN(rsubv2hi3, "rsub16", RSUB16)
+  NDS32_BUILTIN(rsubv2hi3, "v_rsub16", V_RSUB16)
+  NDS32_BUILTIN(ursubv2hi3, "ursub16", URSUB16)
+  NDS32_BUILTIN(ursubv2hi3, "v_ursub16", V_URSUB16)
+  NDS32_BUILTIN(ksubv2hi3, "ksub16", KSUB16)
+  NDS32_BUILTIN(ksubv2hi3, "v_ksub16", V_KSUB16)
+  NDS32_BUILTIN(uksubv2hi3, "uksub16", UKSUB16)
+  NDS32_BUILTIN(uksubv2hi3, "v_uksub16", V_UKSUB16)
+  NDS32_BUILTIN(cras16_1, "cras16", CRAS16)
+  NDS32_BUILTIN(cras16_1, "v_ucras16", V_UCRAS16)
+  NDS32_BUILTIN(cras16_1, "v_scras16", V_SCRAS16)
+  NDS32_BUILTIN(rcras16_1, "rcras16", RCRAS16)
+  NDS32_BUILTIN(rcras16_1, "v_rcras16", V_RCRAS16)
+  NDS32_BUILTIN(urcras16_1, "urcras16", URCRAS16)
+  NDS32_BUILTIN(urcras16_1, "v_urcras16", V_URCRAS16)
+  NDS32_BUILTIN(kcras16_1, "kcras16", KCRAS16)
+  NDS32_BUILTIN(kcras16_1, "v_kcras16", V_KCRAS16)
+  NDS32_BUILTIN(ukcras16_1, "ukcras16", UKCRAS16)
+  NDS32_BUILTIN(ukcras16_1, "v_ukcras16", V_UKCRAS16)
+  NDS32_BUILTIN(crsa16_1, "crsa16", CRSA16)
+  NDS32_BUILTIN(crsa16_1, "v_ucrsa16", V_UCRSA16)
+  NDS32_BUILTIN(crsa16_1, "v_scrsa16", V_SCRSA16)
+  NDS32_BUILTIN(rcrsa16_1, "rcrsa16", RCRSA16)
+  NDS32_BUILTIN(rcrsa16_1, "v_rcrsa16", V_RCRSA16)
+  NDS32_BUILTIN(urcrsa16_1, "urcrsa16", URCRSA16)
+  NDS32_BUILTIN(urcrsa16_1, "v_urcrsa16", V_URCRSA16)
+  NDS32_BUILTIN(kcrsa16_1, "kcrsa16", KCRSA16)
+  NDS32_BUILTIN(kcrsa16_1, "v_kcrsa16", V_KCRSA16)
+  NDS32_BUILTIN(ukcrsa16_1, "ukcrsa16", UKCRSA16)
+  NDS32_BUILTIN(ukcrsa16_1, "v_ukcrsa16", V_UKCRSA16)
+  NDS32_BUILTIN(addv4qi3, "add8", ADD8)
+  NDS32_BUILTIN(addv4qi3, "v_uadd8", V_UADD8)
+  NDS32_BUILTIN(addv4qi3, "v_sadd8", V_SADD8)
+  NDS32_BUILTIN(raddv4qi3, "radd8", RADD8)
+  NDS32_BUILTIN(raddv4qi3, "v_radd8", V_RADD8)
+  NDS32_BUILTIN(uraddv4qi3, "uradd8", URADD8)
+  NDS32_BUILTIN(uraddv4qi3, "v_uradd8", V_URADD8)
+  NDS32_BUILTIN(kaddv4qi3, "kadd8", KADD8)
+  NDS32_BUILTIN(kaddv4qi3, "v_kadd8", V_KADD8)
+  NDS32_BUILTIN(ukaddv4qi3, "ukadd8", UKADD8)
+  NDS32_BUILTIN(ukaddv4qi3, "v_ukadd8", V_UKADD8)
+  NDS32_BUILTIN(subv4qi3, "sub8", SUB8)
+  NDS32_BUILTIN(subv4qi3, "v_usub8", V_USUB8)
+  NDS32_BUILTIN(subv4qi3, "v_ssub8", V_SSUB8)
+  NDS32_BUILTIN(rsubv4qi3, "rsub8", RSUB8)
+  NDS32_BUILTIN(rsubv4qi3, "v_rsub8", V_RSUB8)
+  NDS32_BUILTIN(ursubv4qi3, "ursub8", URSUB8)
+  NDS32_BUILTIN(ursubv4qi3, "v_ursub8", V_URSUB8)
+  NDS32_BUILTIN(ksubv4qi3, "ksub8", KSUB8)
+  NDS32_BUILTIN(ksubv4qi3, "v_ksub8", V_KSUB8)
+  NDS32_BUILTIN(uksubv4qi3, "uksub8", UKSUB8)
+  NDS32_BUILTIN(uksubv4qi3, "v_uksub8", V_UKSUB8)
+  NDS32_BUILTIN(ashrv2hi3, "sra16", SRA16)
+  NDS32_BUILTIN(ashrv2hi3, "v_sra16", V_SRA16)
+  NDS32_BUILTIN(sra16_round, "sra16_u", SRA16_U)
+  NDS32_BUILTIN(sra16_round, "v_sra16_u", V_SRA16_U)
+  NDS32_BUILTIN(lshrv2hi3, "srl16", SRL16)
+  NDS32_BUILTIN(lshrv2hi3, "v_srl16", V_SRL16)
+  NDS32_BUILTIN(srl16_round, "srl16_u", SRL16_U)
+  NDS32_BUILTIN(srl16_round, "v_srl16_u", V_SRL16_U)
+  NDS32_BUILTIN(ashlv2hi3, "sll16", SLL16)
+  NDS32_BUILTIN(ashlv2hi3, "v_sll16", V_SLL16)
+  NDS32_BUILTIN(kslli16, "ksll16", KSLL16)
+  NDS32_BUILTIN(kslli16, "v_ksll16", V_KSLL16)
+  NDS32_BUILTIN(kslra16, "kslra16", KSLRA16)
+  NDS32_BUILTIN(kslra16, "v_kslra16", V_KSLRA16)
+  NDS32_BUILTIN(kslra16_round, "kslra16_u", KSLRA16_U)
+  NDS32_BUILTIN(kslra16_round, "v_kslra16_u", V_KSLRA16_U)
+  NDS32_BUILTIN(cmpeq16, "cmpeq16", CMPEQ16)
+  NDS32_BUILTIN(cmpeq16, "v_scmpeq16", V_SCMPEQ16)
+  NDS32_BUILTIN(cmpeq16, "v_ucmpeq16", V_UCMPEQ16)
+  NDS32_BUILTIN(scmplt16, "scmplt16", SCMPLT16)
+  NDS32_BUILTIN(scmplt16, "v_scmplt16", V_SCMPLT16)
+  NDS32_BUILTIN(scmple16, "scmple16", SCMPLE16)
+  NDS32_BUILTIN(scmple16, "v_scmple16", V_SCMPLE16)
+  NDS32_BUILTIN(ucmplt16, "ucmplt16", UCMPLT16)
+  NDS32_BUILTIN(ucmplt16, "v_ucmplt16", V_UCMPLT16)
+  NDS32_BUILTIN(ucmplt16, "ucmple16", UCMPLE16)
+  NDS32_BUILTIN(ucmplt16, "v_ucmple16", V_UCMPLE16)
+  NDS32_BUILTIN(cmpeq8, "cmpeq8", CMPEQ8)
+  NDS32_BUILTIN(cmpeq8, "v_scmpeq8", V_SCMPEQ8)
+  NDS32_BUILTIN(cmpeq8, "v_ucmpeq8", V_UCMPEQ8)
+  NDS32_BUILTIN(scmplt8, "scmplt8", SCMPLT8)
+  NDS32_BUILTIN(scmplt8, "v_scmplt8", V_SCMPLT8)
+  NDS32_BUILTIN(scmple8, "scmple8", SCMPLE8)
+  NDS32_BUILTIN(scmple8, "v_scmple8", V_SCMPLE8)
+  NDS32_BUILTIN(ucmplt8, "ucmplt8", UCMPLT8)
+  NDS32_BUILTIN(ucmplt8, "v_ucmplt8", V_UCMPLT8)
+  NDS32_BUILTIN(ucmplt8, "ucmple8", UCMPLE8)
+  NDS32_BUILTIN(ucmplt8, "v_ucmple8", V_UCMPLE8)
+  NDS32_BUILTIN(sminv2hi3, "smin16", SMIN16)
+  NDS32_BUILTIN(sminv2hi3, "v_smin16", V_SMIN16)
+  NDS32_BUILTIN(uminv2hi3, "umin16", UMIN16)
+  NDS32_BUILTIN(uminv2hi3, "v_umin16", V_UMIN16)
+  NDS32_BUILTIN(smaxv2hi3, "smax16", SMAX16)
+  NDS32_BUILTIN(smaxv2hi3, "v_smax16", V_SMAX16)
+  NDS32_BUILTIN(umaxv2hi3, "umax16", UMAX16)
+  NDS32_BUILTIN(umaxv2hi3, "v_umax16", V_UMAX16)
+  NDS32_BUILTIN(khm16, "khm16", KHM16)
+  NDS32_BUILTIN(khm16, "v_khm16", V_KHM16)
+  NDS32_BUILTIN(khmx16, "khmx16", KHMX16)
+  NDS32_BUILTIN(khmx16, "v_khmx16", V_KHMX16)
+  NDS32_BUILTIN(sminv4qi3, "smin8", SMIN8)
+  NDS32_BUILTIN(sminv4qi3, "v_smin8", V_SMIN8)
+  NDS32_BUILTIN(uminv4qi3, "umin8", UMIN8)
+  NDS32_BUILTIN(uminv4qi3, "v_umin8", V_UMIN8)
+  NDS32_BUILTIN(smaxv4qi3, "smax8", SMAX8)
+  NDS32_BUILTIN(smaxv4qi3, "v_smax8", V_SMAX8)
+  NDS32_BUILTIN(umaxv4qi3, "umax8", UMAX8)
+  NDS32_BUILTIN(umaxv4qi3, "v_umax8", V_UMAX8)
+  NDS32_BUILTIN(raddsi3, "raddw", RADDW)
+  NDS32_BUILTIN(uraddsi3, "uraddw", URADDW)
+  NDS32_BUILTIN(rsubsi3, "rsubw", RSUBW)
+  NDS32_BUILTIN(ursubsi3, "ursubw", URSUBW)
+  NDS32_BUILTIN(sraiu, "sra_u", SRA_U)
+  NDS32_BUILTIN(kssl, "ksll", KSLL)
+  NDS32_BUILTIN(pkbb, "pkbb16", PKBB16)
+  NDS32_BUILTIN(pkbb, "v_pkbb16", V_PKBB16)
+  NDS32_BUILTIN(pkbt, "pkbt16", PKBT16)
+  NDS32_BUILTIN(pkbt, "v_pkbt16", V_PKBT16)
+  NDS32_BUILTIN(pktb, "pktb16", PKTB16)
+  NDS32_BUILTIN(pktb, "v_pktb16", V_PKTB16)
+  NDS32_BUILTIN(pktt, "pktt16", PKTT16)
+  NDS32_BUILTIN(pktt, "v_pktt16", V_PKTT16)
+  NDS32_BUILTIN(smulsi3_highpart, "smmul", SMMUL)
+  NDS32_BUILTIN(smmul_round, "smmul_u", SMMUL_U)
+  NDS32_BUILTIN(smmwb, "smmwb", SMMWB)
+  NDS32_BUILTIN(smmwb, "v_smmwb", V_SMMWB)
+  NDS32_BUILTIN(smmwb_round, "smmwb_u", SMMWB_U)
+  NDS32_BUILTIN(smmwb_round, "v_smmwb_u", V_SMMWB_U)
+  NDS32_BUILTIN(smmwt, "smmwt", SMMWT)
+  NDS32_BUILTIN(smmwt, "v_smmwt", V_SMMWT)
+  NDS32_BUILTIN(smmwt_round, "smmwt_u", SMMWT_U)
+  NDS32_BUILTIN(smmwt_round, "v_smmwt_u", V_SMMWT_U)
+  NDS32_BUILTIN(smbb, "smbb", SMBB)
+  NDS32_BUILTIN(smbb, "v_smbb", V_SMBB)
+  NDS32_BUILTIN(smbt, "smbt", SMBT)
+  NDS32_BUILTIN(smbt, "v_smbt", V_SMBT)
+  NDS32_BUILTIN(smtt, "smtt", SMTT)
+  NDS32_BUILTIN(smtt, "v_smtt", V_SMTT)
+  NDS32_BUILTIN(kmda, "kmda", KMDA)
+  NDS32_BUILTIN(kmda, "v_kmda", V_KMDA)
+  NDS32_BUILTIN(kmxda, "kmxda", KMXDA)
+  NDS32_BUILTIN(kmxda, "v_kmxda", V_KMXDA)
+  NDS32_BUILTIN(smds, "smds", SMDS)
+  NDS32_BUILTIN(smds, "v_smds", V_SMDS)
+  NDS32_BUILTIN(smdrs, "smdrs", SMDRS)
+  NDS32_BUILTIN(smdrs, "v_smdrs", V_SMDRS)
+  NDS32_BUILTIN(smxdsv, "smxds", SMXDS)
+  NDS32_BUILTIN(smxdsv, "v_smxds", V_SMXDS)
+  NDS32_BUILTIN(smal1, "smal", SMAL)
+  NDS32_BUILTIN(smal1, "v_smal", V_SMAL)
+  NDS32_BUILTIN(bitrev, "bitrev", BITREV)
+  NDS32_BUILTIN(wext, "wext", WEXT)
+  NDS32_BUILTIN(adddi3, "sadd64", SADD64)
+  NDS32_BUILTIN(adddi3, "uadd64", UADD64)
+  NDS32_BUILTIN(radddi3, "radd64", RADD64)
+  NDS32_BUILTIN(uradddi3, "uradd64", URADD64)
+  NDS32_BUILTIN(kadddi3, "kadd64", KADD64)
+  NDS32_BUILTIN(ukadddi3, "ukadd64", UKADD64)
+  NDS32_BUILTIN(subdi3, "ssub64", SSUB64)
+  NDS32_BUILTIN(subdi3, "usub64", USUB64)
+  NDS32_BUILTIN(rsubdi3, "rsub64", RSUB64)
+  NDS32_BUILTIN(ursubdi3, "ursub64", URSUB64)
+  NDS32_BUILTIN(ksubdi3, "ksub64", KSUB64)
+  NDS32_BUILTIN(uksubdi3, "uksub64", UKSUB64)
+  NDS32_BUILTIN(smul16, "smul16", SMUL16)
+  NDS32_BUILTIN(smul16, "v_smul16", V_SMUL16)
+  NDS32_BUILTIN(smulx16, "smulx16", SMULX16)
+  NDS32_BUILTIN(smulx16, "v_smulx16", V_SMULX16)
+  NDS32_BUILTIN(umul16, "umul16", UMUL16)
+  NDS32_BUILTIN(umul16, "v_umul16", V_UMUL16)
+  NDS32_BUILTIN(umulx16, "umulx16", UMULX16)
+  NDS32_BUILTIN(umulx16, "v_umulx16", V_UMULX16)
+  NDS32_BUILTIN(kwmmul, "kwmmul", KWMMUL)
+  NDS32_BUILTIN(kwmmul_round, "kwmmul_u", KWMMUL_U)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storev2hi,
+			  "put_unaligned_u16x2", UASTORE_U16)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storev2hi,
+			  "put_unaligned_s16x2", UASTORE_S16)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storev4qi, "put_unaligned_u8x4", UASTORE_U8)
+  NDS32_NO_TARGET_BUILTIN(unaligned_storev4qi, "put_unaligned_s8x4", UASTORE_S8)
+};
+
+/* Binary intrinsics with an immediate 2nd operand (binopimm expander).  */
+static struct builtin_description bdesc_2argimm[] =
+{
+  NDS32_BUILTIN(unspec_bclr, "bclr", BCLR)
+  NDS32_BUILTIN(unspec_bset, "bset", BSET)
+  NDS32_BUILTIN(unspec_btgl, "btgl", BTGL)
+  NDS32_BUILTIN(unspec_btst, "btst", BTST)
+  NDS32_BUILTIN(unspec_clip, "clip", CLIP)
+  NDS32_BUILTIN(unspec_clips, "clips", CLIPS)
+  NDS32_NO_TARGET_BUILTIN(unspec_teqz, "teqz", TEQZ)
+  NDS32_NO_TARGET_BUILTIN(unspec_tnez, "tnez", TNEZ)
+  NDS32_BUILTIN(ashrv2hi3, "srl16", SRL16) /* NOTE(review): SRL16 uses lshrv2hi3 in the table above; confirm.  */
+  NDS32_BUILTIN(ashrv2hi3, "v_srl16", V_SRL16)
+  NDS32_BUILTIN(srl16_round, "srl16_u", SRL16_U)
+  NDS32_BUILTIN(srl16_round, "v_srl16_u", V_SRL16_U)
+  NDS32_BUILTIN(kslli16, "ksll16", KSLL16)
+  NDS32_BUILTIN(kslli16, "v_ksll16", V_KSLL16)
+  NDS32_BUILTIN(sclip16, "sclip16", SCLIP16)
+  NDS32_BUILTIN(sclip16, "v_sclip16", V_SCLIP16)
+  NDS32_BUILTIN(uclip16, "uclip16", UCLIP16)
+  NDS32_BUILTIN(uclip16, "v_uclip16", V_UCLIP16)
+  NDS32_BUILTIN(sraiu, "sra_u", SRA_U)
+  NDS32_BUILTIN(kssl, "ksll", KSLL)
+  NDS32_BUILTIN(bitrev, "bitrev", BITREV)
+  NDS32_BUILTIN(wext, "wext", WEXT)
+  NDS32_BUILTIN(uclip32, "uclip32", UCLIP32)
+  NDS32_BUILTIN(sclip32, "sclip32", SCLIP32)
+};
+
+/* Three-argument intrinsics (nds32_expand_triop_builtin).  */
+static struct builtin_description bdesc_3arg[] =
+{
+  NDS32_BUILTIN(unspec_pbsada, "pbsada", PBSADA)
+  NDS32_NO_TARGET_BUILTIN(bse, "bse", BSE)
+  NDS32_NO_TARGET_BUILTIN(bsp, "bsp", BSP)
+  NDS32_BUILTIN(kmabb, "kmabb", KMABB)
+  NDS32_BUILTIN(kmabb, "v_kmabb", V_KMABB)
+  NDS32_BUILTIN(kmabt, "kmabt", KMABT)
+  NDS32_BUILTIN(kmabt, "v_kmabt", V_KMABT)
+  NDS32_BUILTIN(kmatt, "kmatt", KMATT)
+  NDS32_BUILTIN(kmatt, "v_kmatt", V_KMATT)
+  NDS32_BUILTIN(kmada, "kmada", KMADA)
+  NDS32_BUILTIN(kmada, "v_kmada", V_KMADA)
+  NDS32_BUILTIN(kmaxda, "kmaxda", KMAXDA)
+  NDS32_BUILTIN(kmaxda, "v_kmaxda", V_KMAXDA)
+  NDS32_BUILTIN(kmads, "kmads", KMADS)
+  NDS32_BUILTIN(kmads, "v_kmads", V_KMADS)
+  NDS32_BUILTIN(kmadrs, "kmadrs", KMADRS)
+  NDS32_BUILTIN(kmadrs, "v_kmadrs", V_KMADRS)
+  NDS32_BUILTIN(kmaxds, "kmaxds", KMAXDS)
+  NDS32_BUILTIN(kmaxds, "v_kmaxds", V_KMAXDS)
+  NDS32_BUILTIN(kmsda, "kmsda", KMSDA)
+  NDS32_BUILTIN(kmsda, "v_kmsda", V_KMSDA)
+  NDS32_BUILTIN(kmsxda, "kmsxda", KMSXDA)
+  NDS32_BUILTIN(kmsxda, "v_kmsxda", V_KMSXDA)
+  NDS32_BUILTIN(bpick1, "bpick", BPICK)
+  NDS32_BUILTIN(smar64_1, "smar64", SMAR64)
+  NDS32_BUILTIN(smsr64, "smsr64", SMSR64)
+  NDS32_BUILTIN(umar64_1, "umar64", UMAR64)
+  NDS32_BUILTIN(umsr64, "umsr64", UMSR64)
+  NDS32_BUILTIN(kmar64_1, "kmar64", KMAR64)
+  NDS32_BUILTIN(kmsr64, "kmsr64", KMSR64)
+  NDS32_BUILTIN(ukmar64_1, "ukmar64", UKMAR64)
+  NDS32_BUILTIN(ukmsr64, "ukmsr64", UKMSR64)
+  NDS32_BUILTIN(smalbb, "smalbb", SMALBB)
+  NDS32_BUILTIN(smalbb, "v_smalbb", V_SMALBB)
+  NDS32_BUILTIN(smalbt, "smalbt", SMALBT)
+  NDS32_BUILTIN(smalbt, "v_smalbt", V_SMALBT)
+  NDS32_BUILTIN(smaltt, "smaltt", SMALTT)
+  NDS32_BUILTIN(smaltt, "v_smaltt", V_SMALTT)
+  NDS32_BUILTIN(smalda1, "smalda", SMALDA)
+  NDS32_BUILTIN(smalda1, "v_smalda", V_SMALDA)
+  NDS32_BUILTIN(smalxda1, "smalxda", SMALXDA)
+  NDS32_BUILTIN(smalxda1, "v_smalxda", V_SMALXDA)
+  NDS32_BUILTIN(smalds1, "smalds", SMALDS)
+  NDS32_BUILTIN(smalds1, "v_smalds", V_SMALDS)
+  NDS32_BUILTIN(smaldrs3, "smaldrs", SMALDRS)
+  NDS32_BUILTIN(smaldrs3, "v_smaldrs", V_SMALDRS)
+  NDS32_BUILTIN(smalxds1, "smalxds", SMALXDS)
+  NDS32_BUILTIN(smalxds1, "v_smalxds", V_SMALXDS)
+  NDS32_BUILTIN(smslda1, "smslda", SMSLDA)
+  NDS32_BUILTIN(smslda1, "v_smslda", V_SMSLDA)
+  NDS32_BUILTIN(smslxda1, "smslxda", SMSLXDA)
+  NDS32_BUILTIN(smslxda1, "v_smslxda", V_SMSLXDA)
+  NDS32_BUILTIN(kmmawb, "kmmawb", KMMAWB)
+  NDS32_BUILTIN(kmmawb, "v_kmmawb", V_KMMAWB)
+  NDS32_BUILTIN(kmmawb_round, "kmmawb_u", KMMAWB_U)
+  NDS32_BUILTIN(kmmawb_round, "v_kmmawb_u", V_KMMAWB_U)
+  NDS32_BUILTIN(kmmawt, "kmmawt", KMMAWT)
+  NDS32_BUILTIN(kmmawt, "v_kmmawt", V_KMMAWT)
+  NDS32_BUILTIN(kmmawt_round, "kmmawt_u", KMMAWT_U)
+  NDS32_BUILTIN(kmmawt_round, "v_kmmawt_u", V_KMMAWT_U)
+  NDS32_BUILTIN(kmmac, "kmmac", KMMAC)
+  NDS32_BUILTIN(kmmac_round, "kmmac_u", KMMAC_U)
+  NDS32_BUILTIN(kmmsb, "kmmsb", KMMSB)
+  NDS32_BUILTIN(kmmsb_round, "kmmsb_u", KMMSB_U)
+};
+
+/* Ternary intrinsics with an immediate 3rd operand (triopimm expander).  */
+static struct builtin_description bdesc_3argimm[] =
+{
+  NDS32_NO_TARGET_BUILTIN(prefetch_qw, "prefetch_qw", DPREF_QW)
+  NDS32_NO_TARGET_BUILTIN(prefetch_hw, "prefetch_hw", DPREF_HW)
+  NDS32_NO_TARGET_BUILTIN(prefetch_w, "prefetch_w", DPREF_W)
+  NDS32_NO_TARGET_BUILTIN(prefetch_dw, "prefetch_dw", DPREF_DW)
+  NDS32_BUILTIN(insb, "insb", INSB)
+};
+
+/* Intrinsics that load a value (nds32_expand_builtin_load).  */
+static struct builtin_description bdesc_load[] =
+{
+  NDS32_BUILTIN(unspec_volatile_llw, "llw", LLW)
+  NDS32_BUILTIN(unspec_lwup, "lwup", LWUP)
+  NDS32_BUILTIN(unspec_lbup, "lbup", LBUP)
+};
+
+/* Intrinsics that store a value (nds32_expand_builtin_store).  */
+static struct builtin_description bdesc_store[] =
+{
+  NDS32_BUILTIN(unspec_swup, "swup", SWUP)
+  NDS32_BUILTIN(unspec_sbup, "sbup", SBUP)
+};
+
+static struct builtin_description bdesc_cctl[] =
+{ /* CCTL intrinsics (nds32_expand_cctl_builtin).  */
+  NDS32_BUILTIN(cctl_idx_read, "cctl_idx_read", CCTL_IDX_READ)
+  NDS32_NO_TARGET_BUILTIN(cctl_idx_write, "cctl_idx_write", CCTL_IDX_WRITE)
+  NDS32_NO_TARGET_BUILTIN(cctl_va_lck, "cctl_va_lck", CCTL_VA_LCK)
+  NDS32_NO_TARGET_BUILTIN(cctl_idx_wbinval,
+			  "cctl_idx_wbinval", CCTL_IDX_WBINVAL)
+  NDS32_NO_TARGET_BUILTIN(cctl_va_wbinval_l1,
+			  "cctl_va_wbinval_l1", CCTL_VA_WBINVAL_L1)
+  NDS32_NO_TARGET_BUILTIN(cctl_va_wbinval_la,
+			  "cctl_va_wbinval_la", CCTL_VA_WBINVAL_LA)
+};

 rtx
 nds32_expand_builtin_impl (tree exp,
 			   rtx target,
 			   rtx subtarget ATTRIBUTE_UNUSED,
-			   machine_mode mode ATTRIBUTE_UNUSED,
+			   enum machine_mode mode ATTRIBUTE_UNUSED,
 			   int ignore ATTRIBUTE_UNUSED)
 {
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+  unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+  unsigned i;
+  struct builtin_description *d;
+
+  if (!NDS32_EXT_DSP_P () /* NOTE(review): this check reports but does not return.  */
+      && fcode > NDS32_BUILTIN_DSP_BEGIN
+      && fcode < NDS32_BUILTIN_DSP_END)
+    error ("don't support DSP extension instructions");
+
+  switch (fcode)
+    {
+    /* FPU Register Transfer (needs TARGET_FPU_SINGLE).  */
+    case NDS32_BUILTIN_FMFCFG:
+    case NDS32_BUILTIN_FMFCSR:
+    case NDS32_BUILTIN_FMTCSR:
+    case NDS32_BUILTIN_FCPYNSS:
+    case NDS32_BUILTIN_FCPYSS:
+      /* Both v3s and v3f toolchains define TARGET_FPU_SINGLE.  */
+      if (!TARGET_FPU_SINGLE)
+	{
+	  error ("this builtin function is only available "
+		 "on the v3s or v3f toolchain");
+	  return NULL_RTX;
+	}
+      break;
+
+    /* FPU Register Transfer (needs TARGET_FPU_DOUBLE).  */
+    case NDS32_BUILTIN_FCPYNSD:
+    case NDS32_BUILTIN_FCPYSD:
+      /* Only v3f toolchain defines TARGET_FPU_DOUBLE.  */
+      if (!TARGET_FPU_DOUBLE)
+	{
+	  error ("this builtin function is only available "
+		 "on the v3f toolchain");
+	  return NULL_RTX;
+	}
+      break;
+
+    /* Load and Store  */
+    case NDS32_BUILTIN_LLW:
+    case NDS32_BUILTIN_LWUP:
+    case NDS32_BUILTIN_LBUP:
+    case NDS32_BUILTIN_SCW:
+    case NDS32_BUILTIN_SWUP:
+    case NDS32_BUILTIN_SBUP:
+      if (TARGET_ISA_V3M)
+	{
+	  error ("this builtin function not support "
+		 "on the v3m toolchain");
+	  return NULL_RTX;
+	}
+      break;
+
+    /* Performance Extension  */
+    case NDS32_BUILTIN_ABS:
+    case NDS32_BUILTIN_AVE:
+    case NDS32_BUILTIN_BCLR:
+    case NDS32_BUILTIN_BSET:
+    case NDS32_BUILTIN_BTGL:
+    case NDS32_BUILTIN_BTST:
+    case NDS32_BUILTIN_CLIP:
+    case NDS32_BUILTIN_CLIPS:
+    case NDS32_BUILTIN_CLZ:
+    case NDS32_BUILTIN_CLO:
+      if (!TARGET_EXT_PERF)
+	{
+	  error ("don't support performance extension instructions");
+	  return NULL_RTX;
+	}
+      break;
+
+    /* Performance Extension 2  */
+    case NDS32_BUILTIN_PBSAD:
+    case NDS32_BUILTIN_PBSADA:
+    case NDS32_BUILTIN_BSE:
+    case NDS32_BUILTIN_BSP:
+      if (!TARGET_EXT_PERF2)
+	{
+	  error ("don't support performance extension "
+		 "version 2 instructions");
+	  return NULL_RTX;
+	}
+      break;
 
-  int fcode = DECL_FUNCTION_CODE (fndecl);
+    /* String Extension  */
+    case NDS32_BUILTIN_FFB:
+    case NDS32_BUILTIN_FFMISM:
+    case NDS32_BUILTIN_FLMISM:
+      if (!TARGET_EXT_STRING)
+	{
+	  error ("don't support string extension instructions");
+	  return NULL_RTX;
+	}
+      break;
 
+    default:
+      break;
+    }
+
+  /* Builtins handled individually; most have no result or operands.  */
   switch (fcode)
     {
-    /* Cache.  */
-    case NDS32_BUILTIN_ISYNC:
-      return nds32_expand_builtin_null_ftype_reg
-	     (CODE_FOR_unspec_volatile_isync, exp, target);
     case NDS32_BUILTIN_ISB:
-      /* Since there are no result and operands for isb instruciton,
-         we can simply emit this rtx.  */
       emit_insn (gen_unspec_volatile_isb ());
       return target;
-
-    /* Register Transfer.  */
-    case NDS32_BUILTIN_MFSR:
-      return nds32_expand_builtin_reg_ftype_imm
-	     (CODE_FOR_unspec_volatile_mfsr, exp, target);
-    case NDS32_BUILTIN_MFUSR:
-      return nds32_expand_builtin_reg_ftype_imm
-	     (CODE_FOR_unspec_volatile_mfusr, exp, target);
-    case NDS32_BUILTIN_MTSR:
-      return nds32_expand_builtin_null_ftype_reg_imm
-	     (CODE_FOR_unspec_volatile_mtsr, exp, target);
-    case NDS32_BUILTIN_MTUSR:
-      return nds32_expand_builtin_null_ftype_reg_imm
-	     (CODE_FOR_unspec_volatile_mtusr, exp, target);
-
-    /* Interrupt.  */
+    case NDS32_BUILTIN_DSB:
+      emit_insn (gen_unspec_dsb ());
+      return target;
+    case NDS32_BUILTIN_MSYNC_ALL:
+      emit_insn (gen_unspec_msync_all ());
+      return target;
+    case NDS32_BUILTIN_MSYNC_STORE:
+      emit_insn (gen_unspec_msync_store ());
+      return target;
     case NDS32_BUILTIN_SETGIE_EN:
-      /* Since there are no result and operands for setgie.e instruciton,
-         we can simply emit this rtx.  */
       emit_insn (gen_unspec_volatile_setgie_en ());
+      emit_insn (gen_unspec_dsb ());
       return target;
     case NDS32_BUILTIN_SETGIE_DIS:
-      /* Since there are no result and operands for setgie.d instruciton,
-         we can simply emit this rtx.  */
       emit_insn (gen_unspec_volatile_setgie_dis ());
+      emit_insn (gen_unspec_dsb ());
+      return target;
+    case NDS32_BUILTIN_GIE_DIS:
+      emit_insn (gen_unspec_volatile_setgie_dis ());
+      emit_insn (gen_unspec_dsb ());
+      return target;
+    case NDS32_BUILTIN_GIE_EN:
+      emit_insn (gen_unspec_volatile_setgie_en ());
+      emit_insn (gen_unspec_dsb ());
+      return target;
+    case NDS32_BUILTIN_SET_PENDING_SWINT:
+      emit_insn (gen_unspec_set_pending_swint ());
+      return target;
+    case NDS32_BUILTIN_CLR_PENDING_SWINT:
+      emit_insn (gen_unspec_clr_pending_swint ());
+      return target;
+    case NDS32_BUILTIN_CCTL_L1D_INVALALL:
+      emit_insn (gen_cctl_l1d_invalall());
+      return target;
+    case NDS32_BUILTIN_CCTL_L1D_WBALL_ALVL:
+      emit_insn (gen_cctl_l1d_wball_alvl());
+      return target;
+    case NDS32_BUILTIN_CCTL_L1D_WBALL_ONE_LVL:
+      emit_insn (gen_cctl_l1d_wball_one_lvl());
+      return target;
+    case NDS32_BUILTIN_CLROV:
+      emit_insn (gen_unspec_volatile_clrov ());
+      return target;
+    case NDS32_BUILTIN_STANDBY_NO_WAKE_GRANT:
+      emit_insn (gen_unspec_standby_no_wake_grant ());
+      return target;
+    case NDS32_BUILTIN_STANDBY_WAKE_GRANT:
+      emit_insn (gen_unspec_standby_wake_grant ());
+      return target;
+    case NDS32_BUILTIN_STANDBY_WAKE_DONE:
+      emit_insn (gen_unspec_standby_wait_done ());
+      return target;
+    case NDS32_BUILTIN_SETEND_BIG:
+      emit_insn (gen_unspec_setend_big ());
+      return target;
+    case NDS32_BUILTIN_SETEND_LITTLE:
+      emit_insn (gen_unspec_setend_little ());
+      return target;
+    case NDS32_BUILTIN_NOP:
+      emit_insn (gen_unspec_nop ());
+      return target;
+    case NDS32_BUILTIN_SCHE_BARRIER:
+      emit_insn (gen_blockage ());
+      return target;
+    case NDS32_BUILTIN_TLBOP_FLUA:
+      emit_insn (gen_unspec_tlbop_flua ());
+      return target;
+    /* Instruction sequence protection  */
+    case NDS32_BUILTIN_SIGNATURE_BEGIN:
+      emit_insn (gen_unspec_signature_begin ());
+      return target;
+    case NDS32_BUILTIN_SIGNATURE_END:
+      emit_insn (gen_unspec_signature_end ());
+      return target;
+    case NDS32_BUILTIN_SCW:
+      return nds32_expand_scw_builtin (CODE_FOR_unspec_volatile_scw,
+				       exp, target);
+    case NDS32_BUILTIN_SET_INT_PRIORITY:
+      return nds32_expand_priority_builtin (CODE_FOR_unspec_set_int_priority,
+					    exp, target,
+					    "__nds32__set_int_priority");
+    case NDS32_BUILTIN_NO_HWLOOP:
+      emit_insn (gen_no_hwloop ());
       return target;
-
     default:
-      gcc_unreachable ();
+      break;
     }
 
+  /* Table-driven builtins: the first table with a matching code wins.  */
+  for (i = 0, d = bdesc_noarg; i < ARRAY_SIZE (bdesc_noarg); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_noarg_builtin (d->icode, target);
+
+  for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_unop_builtin (d->icode, exp, target, d->return_p);
+
+  for (i = 0, d = bdesc_1argimm; i < ARRAY_SIZE (bdesc_1argimm); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_unopimm_builtin (d->icode, exp, target,
+					   d->return_p, d->name);
+
+  for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_binop_builtin (d->icode, exp, target, d->return_p);
+
+  for (i = 0, d = bdesc_2argimm; i < ARRAY_SIZE (bdesc_2argimm); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_binopimm_builtin (d->icode, exp, target,
+					    d->return_p, d->name);
+
+  for (i = 0, d = bdesc_3arg; i < ARRAY_SIZE (bdesc_3arg); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_triop_builtin (d->icode, exp, target, d->return_p);
+
+  for (i = 0, d = bdesc_3argimm; i < ARRAY_SIZE (bdesc_3argimm); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_triopimm_builtin (d->icode, exp, target,
+					    d->return_p, d->name);
+
+  for (i = 0, d = bdesc_load; i < ARRAY_SIZE (bdesc_load); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_builtin_load (d->icode, exp, target);
+
+  for (i = 0, d = bdesc_store; i < ARRAY_SIZE (bdesc_store); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_builtin_store (d->icode, exp, target);
+
+  for (i = 0, d = bdesc_cctl; i < ARRAY_SIZE (bdesc_cctl); i++, d++)
+    if (d->code == fcode)
+      return nds32_expand_cctl_builtin (d->icode, exp, target,
+					d->return_p, d->name);
+
   return NULL_RTX;
 }

+static GTY(()) tree nds32_builtin_decls[NDS32_BUILTIN_COUNT];
+
+/* Map builtin CODE to its decl; unknown codes give error_mark_node.  */
+tree
+nds32_builtin_decl_impl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
+{
+  /* Guard the table lookup against codes past the last builtin.  */
+  return (code < NDS32_BUILTIN_COUNT
+	  ? nds32_builtin_decls[code]
+	  : error_mark_node);
+}
+
+void
+nds32_init_builtins_impl (void)
+{
+#define ADD_NDS32_BUILTIN0(NAME, RET_TYPE, CODE)		\
+  nds32_builtin_decls[NDS32_BUILTIN_ ## CODE] =			\
+  add_builtin_function ("__builtin_nds32_" NAME,		\
+			build_function_type_list (RET_TYPE##_type_node, \
+						  NULL_TREE),		\
+			NDS32_BUILTIN_ ## CODE, BUILT_IN_MD, NULL, NULL_TREE)
+
+#define ADD_NDS32_BUILTIN1(NAME, RET_TYPE, ARG_TYPE, CODE)	\
+  nds32_builtin_decls[NDS32_BUILTIN_ ## CODE] =			\
+  add_builtin_function ("__builtin_nds32_" NAME,		\
+			build_function_type_list (RET_TYPE##_type_node, \
+						  ARG_TYPE##_type_node, \
+						  NULL_TREE),		\
+			NDS32_BUILTIN_ ## CODE, BUILT_IN_MD, NULL, NULL_TREE)
+
+#define ADD_NDS32_BUILTIN2(NAME, RET_TYPE, ARG_TYPE1, ARG_TYPE2, CODE)	\
+  nds32_builtin_decls[NDS32_BUILTIN_ ## CODE] =				\
+  add_builtin_function ("__builtin_nds32_" NAME,			\
+			build_function_type_list (RET_TYPE##_type_node, \
+						  ARG_TYPE1##_type_node,\
+						  ARG_TYPE2##_type_node,\
+						  NULL_TREE),		\
+			NDS32_BUILTIN_ ## CODE, BUILT_IN_MD, NULL, NULL_TREE)
+
+#define ADD_NDS32_BUILTIN3(NAME, RET_TYPE, ARG_TYPE1, ARG_TYPE2, ARG_TYPE3, CODE) \
+  nds32_builtin_decls[NDS32_BUILTIN_ ## CODE] =				\
+  add_builtin_function ("__builtin_nds32_" NAME,			\
+			build_function_type_list (RET_TYPE##_type_node,	\
+						  ARG_TYPE1##_type_node,\
+						  ARG_TYPE2##_type_node,\
+						  ARG_TYPE3##_type_node,\
+						  NULL_TREE),		\
+			NDS32_BUILTIN_ ## CODE, BUILT_IN_MD, NULL, NULL_TREE)
+
+  /* Looking for return type and argument can be found in tree.h file.  */
+  tree ptr_char_type_node = build_pointer_type (char_type_node);
+  tree ptr_uchar_type_node = build_pointer_type (unsigned_char_type_node);
+  tree ptr_ushort_type_node = build_pointer_type (short_unsigned_type_node);
+  tree ptr_short_type_node = build_pointer_type (short_integer_type_node);
+  tree ptr_uint_type_node = build_pointer_type (unsigned_type_node);
+  tree ptr_ulong_type_node = build_pointer_type (long_long_unsigned_type_node);
+  tree v4qi_type_node = build_vector_type (intQI_type_node, 4);
+  tree u_v4qi_type_node = build_vector_type (unsigned_intQI_type_node, 4);
+  tree v2hi_type_node = build_vector_type (intHI_type_node, 2);
+  tree u_v2hi_type_node = build_vector_type (unsigned_intHI_type_node, 2);
+  tree v2si_type_node = build_vector_type (intSI_type_node, 2);
+  tree u_v2si_type_node = build_vector_type (unsigned_intSI_type_node, 2);
+
+  /* Cache.  */
+  ADD_NDS32_BUILTIN1 ("isync", void, ptr_uint, ISYNC);
+  ADD_NDS32_BUILTIN0 ("isb", void, ISB);
+  ADD_NDS32_BUILTIN0 ("dsb", void, DSB);
+  ADD_NDS32_BUILTIN0 ("msync_all", void, MSYNC_ALL);
+  ADD_NDS32_BUILTIN0 ("msync_store", void, MSYNC_STORE);
+
+  /* Register Transfer.  */
+  ADD_NDS32_BUILTIN1 ("mfsr", unsigned, integer, MFSR);
+  ADD_NDS32_BUILTIN1 ("mfusr", unsigned, integer, MFUSR);
+  ADD_NDS32_BUILTIN2 ("mtsr", void, unsigned, integer, MTSR);
+  ADD_NDS32_BUILTIN2 ("mtsr_isb", void, unsigned, integer, MTSR_ISB);
+  ADD_NDS32_BUILTIN2 ("mtsr_dsb", void, unsigned, integer, MTSR_DSB);
+  ADD_NDS32_BUILTIN2 ("mtusr", void, unsigned, integer, MTUSR);
+
+  /* FPU Register Transfer.  */
+  ADD_NDS32_BUILTIN0 ("fmfcsr", unsigned, FMFCSR);
+  ADD_NDS32_BUILTIN1 ("fmtcsr", void, unsigned, FMTCSR);
+  ADD_NDS32_BUILTIN0 ("fmfcfg", unsigned, FMFCFG);
+  ADD_NDS32_BUILTIN2 ("fcpyss", float, float, float, FCPYSS);
+  ADD_NDS32_BUILTIN2 ("fcpynss", float, float, float, FCPYNSS);
+  ADD_NDS32_BUILTIN2 ("fcpysd", double, double, double, FCPYSD);
+  ADD_NDS32_BUILTIN2 ("fcpynsd", double, double, double, FCPYNSD);
+
+  /* Interrupt.  */
+  ADD_NDS32_BUILTIN0 ("setgie_en", void, SETGIE_EN);
+  ADD_NDS32_BUILTIN0 ("setgie_dis", void, SETGIE_DIS);
+  ADD_NDS32_BUILTIN0 ("gie_en", void, GIE_EN);
+  ADD_NDS32_BUILTIN0 ("gie_dis", void, GIE_DIS);
+  ADD_NDS32_BUILTIN1 ("enable_int", void, integer, ENABLE_INT);
+  ADD_NDS32_BUILTIN1 ("disable_int", void, integer, DISABLE_INT);
+  ADD_NDS32_BUILTIN0 ("set_pending_swint", void, SET_PENDING_SWINT);
+  ADD_NDS32_BUILTIN0 ("clr_pending_swint", void, CLR_PENDING_SWINT);
+  ADD_NDS32_BUILTIN0 ("get_all_pending_int", unsigned, GET_ALL_PENDING_INT);
+  ADD_NDS32_BUILTIN1 ("get_pending_int", unsigned, integer, GET_PENDING_INT);
+  ADD_NDS32_BUILTIN1 ("get_int_priority", unsigned, integer, GET_INT_PRIORITY);
+  ADD_NDS32_BUILTIN2 ("set_int_priority", void, integer, integer,
+		      SET_INT_PRIORITY);
+  ADD_NDS32_BUILTIN1 ("clr_pending_hwint", void, integer, CLR_PENDING_HWINT);
+  ADD_NDS32_BUILTIN1 ("set_trig_level", void, integer, SET_TRIG_LEVEL);
+  ADD_NDS32_BUILTIN1 ("set_trig_edge", void, integer, SET_TRIG_EDGE);
+  ADD_NDS32_BUILTIN1 ("get_trig_type", unsigned, integer, GET_TRIG_TYPE);
+
+  /* Load and Store  */
+  ADD_NDS32_BUILTIN1 ("llw", unsigned, ptr_uint, LLW);
+  ADD_NDS32_BUILTIN1 ("lwup", unsigned, ptr_uint, LWUP);
+  ADD_NDS32_BUILTIN1 ("lbup", char, ptr_uchar, LBUP);
+  ADD_NDS32_BUILTIN2 ("scw", unsigned, ptr_uint, unsigned, SCW);
+  ADD_NDS32_BUILTIN2 ("swup", void, ptr_uint, unsigned, SWUP);
+  ADD_NDS32_BUILTIN2 ("sbup", void, ptr_uchar, char, SBUP);
+
+  /* CCTL  */
+  ADD_NDS32_BUILTIN0 ("cctl_l1d_invalall", void, CCTL_L1D_INVALALL);
+  ADD_NDS32_BUILTIN0 ("cctl_l1d_wball_alvl", void, CCTL_L1D_WBALL_ALVL);
+  ADD_NDS32_BUILTIN0 ("cctl_l1d_wball_one_lvl", void, CCTL_L1D_WBALL_ONE_LVL);
+  ADD_NDS32_BUILTIN2 ("cctl_va_lck", void, integer, ptr_uint, CCTL_VA_LCK);
+  ADD_NDS32_BUILTIN2 ("cctl_idx_wbinval", void, integer, unsigned,
+		      CCTL_IDX_WBINVAL);
+  ADD_NDS32_BUILTIN2 ("cctl_va_wbinval_l1", void, integer, ptr_uint,
+		      CCTL_VA_WBINVAL_L1);
+  ADD_NDS32_BUILTIN2 ("cctl_va_wbinval_la", void, integer, ptr_uint,
+		      CCTL_VA_WBINVAL_LA);
+  ADD_NDS32_BUILTIN2 ("cctl_idx_read", unsigned, integer, unsigned,
+		      CCTL_IDX_READ);
+  ADD_NDS32_BUILTIN3 ("cctl_idx_write", void, integer, unsigned, unsigned,
+		      CCTL_IDX_WRITE);
+
+  /* PREFETCH  */
+  ADD_NDS32_BUILTIN3 ("dpref_qw", void, ptr_uchar, unsigned, integer, DPREF_QW);
+  ADD_NDS32_BUILTIN3 ("dpref_hw", void, ptr_ushort, unsigned, integer,
+		      DPREF_HW);
+  ADD_NDS32_BUILTIN3 ("dpref_w", void, ptr_uint, unsigned, integer, DPREF_W);
+  ADD_NDS32_BUILTIN3 ("dpref_dw", void, ptr_ulong, unsigned, integer, DPREF_DW);
+
+  /* Performance Extension  */
+  ADD_NDS32_BUILTIN1 ("pe_abs", integer, integer, ABS);
+  ADD_NDS32_BUILTIN2 ("pe_ave", integer, integer, integer, AVE);
+  ADD_NDS32_BUILTIN2 ("pe_bclr", unsigned, unsigned, unsigned, BCLR);
+  ADD_NDS32_BUILTIN2 ("pe_bset", unsigned, unsigned, unsigned, BSET);
+  ADD_NDS32_BUILTIN2 ("pe_btgl", unsigned, unsigned, unsigned, BTGL);
+  ADD_NDS32_BUILTIN2 ("pe_btst", unsigned, unsigned, unsigned, BTST);
+  ADD_NDS32_BUILTIN2 ("pe_clip", unsigned, integer, unsigned, CLIP);
+  ADD_NDS32_BUILTIN2 ("pe_clips", integer, integer, unsigned, CLIPS);
+  ADD_NDS32_BUILTIN1 ("pe_clz", unsigned, unsigned, CLZ);
+  ADD_NDS32_BUILTIN1 ("pe_clo", unsigned, unsigned, CLO);
+
+  /* Performance Extension 2  */
+  ADD_NDS32_BUILTIN3 ("pe2_bse", void, ptr_uint, unsigned, ptr_uint, BSE);
+  ADD_NDS32_BUILTIN3 ("pe2_bsp", void, ptr_uint, unsigned, ptr_uint, BSP);
+  ADD_NDS32_BUILTIN2 ("pe2_pbsad", unsigned, unsigned, unsigned, PBSAD);
+  ADD_NDS32_BUILTIN3 ("pe2_pbsada", unsigned, unsigned, unsigned, unsigned,
+		      PBSADA);
+
+  /* String Extension  */
+  ADD_NDS32_BUILTIN2 ("se_ffb", integer, unsigned, unsigned, FFB);
+  ADD_NDS32_BUILTIN2 ("se_ffmism", integer, unsigned, unsigned, FFMISM);
+  ADD_NDS32_BUILTIN2 ("se_flmism", integer, unsigned, unsigned, FLMISM);
+
+  /* SATURATION  */
+  ADD_NDS32_BUILTIN2 ("kaddw", integer, integer, integer, KADDW);
+  ADD_NDS32_BUILTIN2 ("ksubw", integer, integer, integer, KSUBW);
+  ADD_NDS32_BUILTIN2 ("kaddh", integer, integer, integer, KADDH);
+  ADD_NDS32_BUILTIN2 ("ksubh", integer, integer, integer, KSUBH);
+  ADD_NDS32_BUILTIN2 ("kdmbb", integer, unsigned, unsigned, KDMBB);
+  ADD_NDS32_BUILTIN2 ("v_kdmbb", integer, v2hi, v2hi, V_KDMBB);
+  ADD_NDS32_BUILTIN2 ("kdmbt", integer, unsigned, unsigned, KDMBT);
+  ADD_NDS32_BUILTIN2 ("v_kdmbt", integer, v2hi, v2hi, V_KDMBT);
+  ADD_NDS32_BUILTIN2 ("kdmtb", integer, unsigned, unsigned, KDMTB);
+  ADD_NDS32_BUILTIN2 ("v_kdmtb", integer, v2hi, v2hi, V_KDMTB);
+  ADD_NDS32_BUILTIN2 ("kdmtt", integer, unsigned, unsigned, KDMTT);
+  ADD_NDS32_BUILTIN2 ("v_kdmtt", integer, v2hi, v2hi, V_KDMTT);
+  ADD_NDS32_BUILTIN2 ("khmbb", integer, unsigned, unsigned, KHMBB);
+  ADD_NDS32_BUILTIN2 ("v_khmbb", integer, v2hi, v2hi, V_KHMBB);
+  ADD_NDS32_BUILTIN2 ("khmbt", integer, unsigned, unsigned, KHMBT);
+  ADD_NDS32_BUILTIN2 ("v_khmbt", integer, v2hi, v2hi, V_KHMBT);
+  ADD_NDS32_BUILTIN2 ("khmtb", integer, unsigned, unsigned, KHMTB);
+  ADD_NDS32_BUILTIN2 ("v_khmtb", integer, v2hi, v2hi, V_KHMTB);
+  ADD_NDS32_BUILTIN2 ("khmtt", integer, unsigned, unsigned, KHMTT);
+  ADD_NDS32_BUILTIN2 ("v_khmtt", integer, v2hi, v2hi, V_KHMTT);
+  ADD_NDS32_BUILTIN2 ("kslraw", integer, integer, integer, KSLRAW);
+  ADD_NDS32_BUILTIN2 ("kslraw_u", integer, integer, integer, KSLRAW_U);
+  ADD_NDS32_BUILTIN0 ("rdov", unsigned, RDOV);
+  ADD_NDS32_BUILTIN0 ("clrov", void, CLROV);
+
+  /* ROTR  */
+  ADD_NDS32_BUILTIN2 ("rotr", unsigned, unsigned, unsigned, ROTR);
+
+  /* Swap  */
+  ADD_NDS32_BUILTIN1 ("wsbh", unsigned, unsigned, WSBH);
+
+  /* System  */
+  ADD_NDS32_BUILTIN2 ("svs", unsigned, integer, integer, SVS);
+  ADD_NDS32_BUILTIN2 ("sva", unsigned, integer, integer, SVA);
+  ADD_NDS32_BUILTIN1 ("jr_itoff", void, unsigned, JR_ITOFF);
+  ADD_NDS32_BUILTIN1 ("jr_toff", void, unsigned, JR_TOFF);
+  ADD_NDS32_BUILTIN1 ("jral_iton", void, unsigned, JRAL_ITON);
+  ADD_NDS32_BUILTIN1 ("jral_ton", void, unsigned, JRAL_TON);
+  ADD_NDS32_BUILTIN1 ("ret_itoff", void, unsigned, RET_ITOFF);
+  ADD_NDS32_BUILTIN1 ("ret_toff", void, unsigned, RET_TOFF);
+  ADD_NDS32_BUILTIN0 ("standby_no_wake_grant", void, STANDBY_NO_WAKE_GRANT);
+  ADD_NDS32_BUILTIN0 ("standby_wake_grant", void, STANDBY_WAKE_GRANT);
+  ADD_NDS32_BUILTIN0 ("standby_wait_done", void, STANDBY_WAKE_DONE);
+  ADD_NDS32_BUILTIN1 ("break", void, unsigned, BREAK);
+  ADD_NDS32_BUILTIN1 ("syscall", void, unsigned, SYSCALL);
+  ADD_NDS32_BUILTIN0 ("nop", void, NOP);
+  ADD_NDS32_BUILTIN0 ("get_current_sp", unsigned, GET_CURRENT_SP);
+  ADD_NDS32_BUILTIN1 ("set_current_sp", void, unsigned, SET_CURRENT_SP);
+  ADD_NDS32_BUILTIN2 ("teqz", void, unsigned, unsigned, TEQZ);
+  ADD_NDS32_BUILTIN2 ("tnez", void, unsigned, unsigned, TNEZ);
+  ADD_NDS32_BUILTIN1 ("trap", void, unsigned, TRAP);
+  ADD_NDS32_BUILTIN0 ("return_address", unsigned, RETURN_ADDRESS);
+  ADD_NDS32_BUILTIN0 ("setend_big", void, SETEND_BIG);
+  ADD_NDS32_BUILTIN0 ("setend_little", void, SETEND_LITTLE);
+
+  /* Schedule Barrier */
+  ADD_NDS32_BUILTIN0 ("schedule_barrier", void, SCHE_BARRIER);
+
+  /* TLBOP  */
+  ADD_NDS32_BUILTIN1 ("tlbop_trd", void, unsigned, TLBOP_TRD);
+  ADD_NDS32_BUILTIN1 ("tlbop_twr", void, unsigned, TLBOP_TWR);
+  ADD_NDS32_BUILTIN1 ("tlbop_rwr", void, unsigned, TLBOP_RWR);
+  ADD_NDS32_BUILTIN1 ("tlbop_rwlk", void, unsigned, TLBOP_RWLK);
+  ADD_NDS32_BUILTIN1 ("tlbop_unlk", void, unsigned, TLBOP_UNLK);
+  ADD_NDS32_BUILTIN1 ("tlbop_pb", unsigned, unsigned, TLBOP_PB);
+  ADD_NDS32_BUILTIN1 ("tlbop_inv", void, unsigned, TLBOP_INV);
+  ADD_NDS32_BUILTIN0 ("tlbop_flua", void, TLBOP_FLUA);
+
+  /* Unaligned Load/Store  */
+  ADD_NDS32_BUILTIN1 ("unaligned_load_hw", short_unsigned, ptr_ushort,
+		      UALOAD_HW);
+  ADD_NDS32_BUILTIN1 ("unaligned_load_w", unsigned, ptr_uint, UALOAD_W);
+  ADD_NDS32_BUILTIN1 ("unaligned_load_dw", long_long_unsigned, ptr_ulong,
+		      UALOAD_DW);
+  ADD_NDS32_BUILTIN2 ("unaligned_store_hw", void, ptr_ushort, short_unsigned,
+		      UASTORE_HW);
+  ADD_NDS32_BUILTIN2 ("unaligned_store_w", void, ptr_uint, unsigned, UASTORE_W);
+  ADD_NDS32_BUILTIN2 ("unaligned_store_dw", void, ptr_ulong, long_long_unsigned,
+		      UASTORE_DW);
+  ADD_NDS32_BUILTIN0 ("unaligned_feature", unsigned, UNALIGNED_FEATURE);
+  ADD_NDS32_BUILTIN0 ("enable_unaligned", void, ENABLE_UNALIGNED);
+  ADD_NDS32_BUILTIN0 ("disable_unaligned", void, DISABLE_UNALIGNED);
+
+  /* Instruction sequence protection  */
+  ADD_NDS32_BUILTIN0 ("signature_begin", void, SIGNATURE_BEGIN);
+  ADD_NDS32_BUILTIN0 ("signature_end", void, SIGNATURE_END);
+
+  /* DSP Extension: SIMD 16bit Add and Subtract.  */
+  ADD_NDS32_BUILTIN2 ("add16", unsigned, unsigned, unsigned, ADD16);
+  ADD_NDS32_BUILTIN2 ("v_uadd16", u_v2hi, u_v2hi, u_v2hi, V_UADD16);
+  ADD_NDS32_BUILTIN2 ("v_sadd16", v2hi, v2hi, v2hi, V_SADD16);
+  ADD_NDS32_BUILTIN2 ("radd16", unsigned, unsigned, unsigned, RADD16);
+  ADD_NDS32_BUILTIN2 ("v_radd16", v2hi, v2hi, v2hi, V_RADD16);
+  ADD_NDS32_BUILTIN2 ("uradd16", unsigned, unsigned, unsigned, URADD16);
+  ADD_NDS32_BUILTIN2 ("v_uradd16", u_v2hi, u_v2hi, u_v2hi, V_URADD16);
+  ADD_NDS32_BUILTIN2 ("kadd16", unsigned, unsigned, unsigned, KADD16);
+  ADD_NDS32_BUILTIN2 ("v_kadd16", v2hi, v2hi, v2hi, V_KADD16);
+  ADD_NDS32_BUILTIN2 ("ukadd16", unsigned, unsigned, unsigned, UKADD16);
+  ADD_NDS32_BUILTIN2 ("v_ukadd16", u_v2hi, u_v2hi, u_v2hi, V_UKADD16);
+  ADD_NDS32_BUILTIN2 ("sub16", unsigned, unsigned, unsigned, SUB16);
+  ADD_NDS32_BUILTIN2 ("v_usub16", u_v2hi, u_v2hi, u_v2hi, V_USUB16);
+  ADD_NDS32_BUILTIN2 ("v_ssub16", v2hi, v2hi, v2hi, V_SSUB16);
+  ADD_NDS32_BUILTIN2 ("rsub16", unsigned, unsigned, unsigned, RSUB16);
+  ADD_NDS32_BUILTIN2 ("v_rsub16", v2hi, v2hi, v2hi, V_RSUB16);
+  ADD_NDS32_BUILTIN2 ("ursub16", unsigned, unsigned, unsigned, URSUB16);
+  ADD_NDS32_BUILTIN2 ("v_ursub16", u_v2hi, u_v2hi, u_v2hi, V_URSUB16);
+  ADD_NDS32_BUILTIN2 ("ksub16", unsigned, unsigned, unsigned, KSUB16);
+  ADD_NDS32_BUILTIN2 ("v_ksub16", v2hi, v2hi, v2hi, V_KSUB16);
+  ADD_NDS32_BUILTIN2 ("uksub16", unsigned, unsigned, unsigned, UKSUB16);
+  ADD_NDS32_BUILTIN2 ("v_uksub16", u_v2hi, u_v2hi, u_v2hi, V_UKSUB16);
+  ADD_NDS32_BUILTIN2 ("cras16", unsigned, unsigned, unsigned, CRAS16);
+  ADD_NDS32_BUILTIN2 ("v_ucras16", u_v2hi, u_v2hi, u_v2hi, V_UCRAS16);
+  ADD_NDS32_BUILTIN2 ("v_scras16", v2hi, v2hi, v2hi, V_SCRAS16);
+  ADD_NDS32_BUILTIN2 ("rcras16", unsigned, unsigned, unsigned, RCRAS16);
+  ADD_NDS32_BUILTIN2 ("v_rcras16", v2hi, v2hi, v2hi, V_RCRAS16);
+  ADD_NDS32_BUILTIN2 ("urcras16", unsigned, unsigned, unsigned, URCRAS16);
+  ADD_NDS32_BUILTIN2 ("v_urcras16", u_v2hi, u_v2hi, u_v2hi, V_URCRAS16);
+  ADD_NDS32_BUILTIN2 ("kcras16", unsigned, unsigned, unsigned, KCRAS16);
+  ADD_NDS32_BUILTIN2 ("v_kcras16", v2hi, v2hi, v2hi, V_KCRAS16);
+  ADD_NDS32_BUILTIN2 ("ukcras16", unsigned, unsigned, unsigned, UKCRAS16);
+  ADD_NDS32_BUILTIN2 ("v_ukcras16", u_v2hi, u_v2hi, u_v2hi, V_UKCRAS16);
+  ADD_NDS32_BUILTIN2 ("crsa16", unsigned, unsigned, unsigned, CRSA16);
+  ADD_NDS32_BUILTIN2 ("v_ucrsa16", u_v2hi, u_v2hi, u_v2hi, V_UCRSA16);
+  ADD_NDS32_BUILTIN2 ("v_scrsa16", v2hi, v2hi, v2hi, V_SCRSA16);
+  ADD_NDS32_BUILTIN2 ("rcrsa16", unsigned, unsigned, unsigned, RCRSA16);
+  ADD_NDS32_BUILTIN2 ("v_rcrsa16", v2hi, v2hi, v2hi, V_RCRSA16);
+  ADD_NDS32_BUILTIN2 ("urcrsa16", unsigned, unsigned, unsigned, URCRSA16);
+  ADD_NDS32_BUILTIN2 ("v_urcrsa16", u_v2hi, u_v2hi, u_v2hi, V_URCRSA16);
+  ADD_NDS32_BUILTIN2 ("kcrsa16", unsigned, unsigned, unsigned, KCRSA16);
+  ADD_NDS32_BUILTIN2 ("v_kcrsa16", v2hi, v2hi, v2hi, V_KCRSA16);
+  ADD_NDS32_BUILTIN2 ("ukcrsa16", unsigned, unsigned, unsigned, UKCRSA16);
+  ADD_NDS32_BUILTIN2 ("v_ukcrsa16", u_v2hi, u_v2hi, u_v2hi, V_UKCRSA16);
+
+  /* DSP Extension: SIMD 8bit Add and Subtract.  */
+  ADD_NDS32_BUILTIN2 ("add8", integer, integer, integer, ADD8);
+  ADD_NDS32_BUILTIN2 ("v_uadd8", u_v4qi, u_v4qi, u_v4qi, V_UADD8);
+  ADD_NDS32_BUILTIN2 ("v_sadd8", v4qi, v4qi, v4qi, V_SADD8);
+  ADD_NDS32_BUILTIN2 ("radd8", unsigned, unsigned, unsigned, RADD8);
+  ADD_NDS32_BUILTIN2 ("v_radd8", v4qi, v4qi, v4qi, V_RADD8);
+  ADD_NDS32_BUILTIN2 ("uradd8", unsigned, unsigned, unsigned, URADD8);
+  ADD_NDS32_BUILTIN2 ("v_uradd8", u_v4qi, u_v4qi, u_v4qi, V_URADD8);
+  ADD_NDS32_BUILTIN2 ("kadd8", unsigned, unsigned, unsigned, KADD8);
+  ADD_NDS32_BUILTIN2 ("v_kadd8", v4qi, v4qi, v4qi, V_KADD8);
+  ADD_NDS32_BUILTIN2 ("ukadd8", unsigned, unsigned, unsigned, UKADD8);
+  ADD_NDS32_BUILTIN2 ("v_ukadd8", u_v4qi, u_v4qi, u_v4qi, V_UKADD8);
+  ADD_NDS32_BUILTIN2 ("sub8", integer, integer, integer, SUB8);
+  ADD_NDS32_BUILTIN2 ("v_usub8", u_v4qi, u_v4qi, u_v4qi, V_USUB8);
+  ADD_NDS32_BUILTIN2 ("v_ssub8", v4qi, v4qi, v4qi, V_SSUB8);
+  ADD_NDS32_BUILTIN2 ("rsub8", unsigned, unsigned, unsigned, RSUB8);
+  ADD_NDS32_BUILTIN2 ("v_rsub8", v4qi, v4qi, v4qi, V_RSUB8);
+  ADD_NDS32_BUILTIN2 ("ursub8", unsigned, unsigned, unsigned, URSUB8);
+  ADD_NDS32_BUILTIN2 ("v_ursub8", u_v4qi, u_v4qi, u_v4qi, V_URSUB8);
+  ADD_NDS32_BUILTIN2 ("ksub8", unsigned, unsigned, unsigned, KSUB8);
+  ADD_NDS32_BUILTIN2 ("v_ksub8", v4qi, v4qi, v4qi, V_KSUB8);
+  ADD_NDS32_BUILTIN2 ("uksub8", unsigned, unsigned, unsigned, UKSUB8);
+  ADD_NDS32_BUILTIN2 ("v_uksub8", u_v4qi, u_v4qi, u_v4qi, V_UKSUB8);
+
+  /* DSP Extension: SIMD 16bit Shift.  */
+  ADD_NDS32_BUILTIN2 ("sra16", unsigned, unsigned, unsigned, SRA16);
+  ADD_NDS32_BUILTIN2 ("v_sra16", v2hi, v2hi, unsigned, V_SRA16);
+  ADD_NDS32_BUILTIN2 ("sra16_u", unsigned, unsigned, unsigned, SRA16_U);
+  ADD_NDS32_BUILTIN2 ("v_sra16_u", v2hi, v2hi, unsigned, V_SRA16_U);
+  ADD_NDS32_BUILTIN2 ("srl16", unsigned, unsigned, unsigned, SRL16);
+  ADD_NDS32_BUILTIN2 ("v_srl16", u_v2hi, u_v2hi, unsigned, V_SRL16);
+  ADD_NDS32_BUILTIN2 ("srl16_u", unsigned, unsigned, unsigned, SRL16_U);
+  ADD_NDS32_BUILTIN2 ("v_srl16_u", u_v2hi, u_v2hi, unsigned, V_SRL16_U);
+  ADD_NDS32_BUILTIN2 ("sll16", unsigned, unsigned, unsigned, SLL16);
+  ADD_NDS32_BUILTIN2 ("v_sll16", u_v2hi, u_v2hi, unsigned, V_SLL16);
+  ADD_NDS32_BUILTIN2 ("ksll16", unsigned, unsigned, unsigned, KSLL16);
+  ADD_NDS32_BUILTIN2 ("v_ksll16", v2hi, v2hi, unsigned, V_KSLL16);
+  ADD_NDS32_BUILTIN2 ("kslra16", unsigned, unsigned, unsigned, KSLRA16);
+  ADD_NDS32_BUILTIN2 ("v_kslra16", v2hi, v2hi, unsigned, V_KSLRA16);
+  ADD_NDS32_BUILTIN2 ("kslra16_u", unsigned, unsigned, unsigned, KSLRA16_U);
+  ADD_NDS32_BUILTIN2 ("v_kslra16_u", v2hi, v2hi, unsigned, V_KSLRA16_U);
+
+  /* DSP Extension: 16bit Compare.  */
+  ADD_NDS32_BUILTIN2 ("cmpeq16", unsigned, unsigned, unsigned, CMPEQ16);
+  ADD_NDS32_BUILTIN2 ("v_scmpeq16", u_v2hi, v2hi, v2hi, V_SCMPEQ16);
+  ADD_NDS32_BUILTIN2 ("v_ucmpeq16", u_v2hi, u_v2hi, u_v2hi, V_UCMPEQ16);
+  ADD_NDS32_BUILTIN2 ("scmplt16", unsigned, unsigned, unsigned, SCMPLT16);
+  ADD_NDS32_BUILTIN2 ("v_scmplt16", u_v2hi, v2hi, v2hi, V_SCMPLT16);
+  ADD_NDS32_BUILTIN2 ("scmple16", unsigned, unsigned, unsigned, SCMPLE16);
+  ADD_NDS32_BUILTIN2 ("v_scmple16", u_v2hi, v2hi, v2hi, V_SCMPLE16);
+  ADD_NDS32_BUILTIN2 ("ucmplt16", unsigned, unsigned, unsigned, UCMPLT16);
+  ADD_NDS32_BUILTIN2 ("v_ucmplt16", u_v2hi, u_v2hi, u_v2hi, V_UCMPLT16);
+  ADD_NDS32_BUILTIN2 ("ucmple16", unsigned, unsigned, unsigned, UCMPLE16);
+  ADD_NDS32_BUILTIN2 ("v_ucmple16", u_v2hi, u_v2hi, u_v2hi, V_UCMPLE16);
+
+  /* DSP Extension: 8bit Compare.  */
+  ADD_NDS32_BUILTIN2 ("cmpeq8", unsigned, unsigned, unsigned, CMPEQ8);
+  ADD_NDS32_BUILTIN2 ("v_scmpeq8", u_v4qi, v4qi, v4qi, V_SCMPEQ8);
+  ADD_NDS32_BUILTIN2 ("v_ucmpeq8", u_v4qi, u_v4qi, u_v4qi, V_UCMPEQ8);
+  ADD_NDS32_BUILTIN2 ("scmplt8", unsigned, unsigned, unsigned, SCMPLT8);
+  ADD_NDS32_BUILTIN2 ("v_scmplt8", u_v4qi, v4qi, v4qi, V_SCMPLT8);
+  ADD_NDS32_BUILTIN2 ("scmple8", unsigned, unsigned, unsigned, SCMPLE8);
+  ADD_NDS32_BUILTIN2 ("v_scmple8", u_v4qi, v4qi, v4qi, V_SCMPLE8);
+  ADD_NDS32_BUILTIN2 ("ucmplt8", unsigned, unsigned, unsigned, UCMPLT8);
+  ADD_NDS32_BUILTIN2 ("v_ucmplt8", u_v4qi, u_v4qi, u_v4qi, V_UCMPLT8);
+  ADD_NDS32_BUILTIN2 ("ucmple8", unsigned, unsigned, unsigned, UCMPLE8);
+  ADD_NDS32_BUILTIN2 ("v_ucmple8", u_v4qi, u_v4qi, u_v4qi, V_UCMPLE8);
+
+  /* DSP Extension: SIMD 16bit MISC.  */
+  ADD_NDS32_BUILTIN2 ("smin16", unsigned, unsigned, unsigned, SMIN16);
+  ADD_NDS32_BUILTIN2 ("v_smin16", v2hi, v2hi, v2hi, V_SMIN16);
+  ADD_NDS32_BUILTIN2 ("umin16", unsigned, unsigned, unsigned, UMIN16);
+  ADD_NDS32_BUILTIN2 ("v_umin16", u_v2hi, u_v2hi, u_v2hi, V_UMIN16);
+  ADD_NDS32_BUILTIN2 ("smax16", unsigned, unsigned, unsigned, SMAX16);
+  ADD_NDS32_BUILTIN2 ("v_smax16", v2hi, v2hi, v2hi, V_SMAX16);
+  ADD_NDS32_BUILTIN2 ("umax16", unsigned, unsigned, unsigned, UMAX16);
+  ADD_NDS32_BUILTIN2 ("v_umax16", u_v2hi, u_v2hi, u_v2hi, V_UMAX16);
+  ADD_NDS32_BUILTIN2 ("sclip16", unsigned, unsigned, unsigned, SCLIP16);
+  ADD_NDS32_BUILTIN2 ("v_sclip16", v2hi, v2hi, unsigned, V_SCLIP16);
+  ADD_NDS32_BUILTIN2 ("uclip16", unsigned, unsigned, unsigned, UCLIP16);
+  ADD_NDS32_BUILTIN2 ("v_uclip16", v2hi, v2hi, unsigned, V_UCLIP16);
+  ADD_NDS32_BUILTIN2 ("khm16", unsigned, unsigned, unsigned, KHM16);
+  ADD_NDS32_BUILTIN2 ("v_khm16", v2hi, v2hi, v2hi, V_KHM16);
+  ADD_NDS32_BUILTIN2 ("khmx16", unsigned, unsigned, unsigned, KHMX16);
+  ADD_NDS32_BUILTIN2 ("v_khmx16", v2hi, v2hi, v2hi, V_KHMX16);
+  ADD_NDS32_BUILTIN1 ("kabs16", unsigned, unsigned, KABS16);
+  ADD_NDS32_BUILTIN1 ("v_kabs16", v2hi, v2hi, V_KABS16);
+  ADD_NDS32_BUILTIN2 ("smul16", long_long_unsigned, unsigned, unsigned, SMUL16);
+  ADD_NDS32_BUILTIN2 ("v_smul16", v2si, v2hi, v2hi, V_SMUL16);
+  ADD_NDS32_BUILTIN2 ("smulx16",
+		      long_long_unsigned, unsigned, unsigned, SMULX16);
+  ADD_NDS32_BUILTIN2 ("v_smulx16", v2si, v2hi, v2hi, V_SMULX16);
+  ADD_NDS32_BUILTIN2 ("umul16", long_long_unsigned, unsigned, unsigned, UMUL16);
+  ADD_NDS32_BUILTIN2 ("v_umul16", u_v2si, u_v2hi, u_v2hi, V_UMUL16);
+  ADD_NDS32_BUILTIN2 ("umulx16",
+		      long_long_unsigned, unsigned, unsigned, UMULX16);
+  ADD_NDS32_BUILTIN2 ("v_umulx16", u_v2si, u_v2hi, u_v2hi, V_UMULX16);
+
+  /* DSP Extension: SIMD 8bit MISC.  */
+  ADD_NDS32_BUILTIN2 ("smin8", unsigned, unsigned, unsigned, SMIN8);
+  ADD_NDS32_BUILTIN2 ("v_smin8", v4qi, v4qi, v4qi, V_SMIN8);
+  ADD_NDS32_BUILTIN2 ("umin8", unsigned, unsigned, unsigned, UMIN8);
+  ADD_NDS32_BUILTIN2 ("v_umin8", u_v4qi, u_v4qi, u_v4qi, V_UMIN8);
+  ADD_NDS32_BUILTIN2 ("smax8", unsigned, unsigned, unsigned, SMAX8);
+  ADD_NDS32_BUILTIN2 ("v_smax8", v4qi, v4qi, v4qi, V_SMAX8);
+  ADD_NDS32_BUILTIN2 ("umax8", unsigned, unsigned, unsigned, UMAX8);
+  ADD_NDS32_BUILTIN2 ("v_umax8", u_v4qi, u_v4qi, u_v4qi, V_UMAX8);
+  ADD_NDS32_BUILTIN1 ("kabs8", unsigned, unsigned, KABS8);
+  ADD_NDS32_BUILTIN1 ("v_kabs8", v4qi, v4qi, V_KABS8);
+
+  /* DSP Extension: 8bit Unpacking.  */
+  ADD_NDS32_BUILTIN1 ("sunpkd810", unsigned, unsigned, SUNPKD810);
+  ADD_NDS32_BUILTIN1 ("v_sunpkd810", v2hi, v4qi, V_SUNPKD810);
+  ADD_NDS32_BUILTIN1 ("sunpkd820", unsigned, unsigned, SUNPKD820);
+  ADD_NDS32_BUILTIN1 ("v_sunpkd820", v2hi, v4qi, V_SUNPKD820);
+  ADD_NDS32_BUILTIN1 ("sunpkd830", unsigned, unsigned, SUNPKD830);
+  ADD_NDS32_BUILTIN1 ("v_sunpkd830", v2hi, v4qi, V_SUNPKD830);
+  ADD_NDS32_BUILTIN1 ("sunpkd831", unsigned, unsigned, SUNPKD831);
+  ADD_NDS32_BUILTIN1 ("v_sunpkd831", v2hi, v4qi, V_SUNPKD831);
+  ADD_NDS32_BUILTIN1 ("zunpkd810", unsigned, unsigned, ZUNPKD810);
+  ADD_NDS32_BUILTIN1 ("v_zunpkd810", u_v2hi, u_v4qi, V_ZUNPKD810);
+  ADD_NDS32_BUILTIN1 ("zunpkd820", unsigned, unsigned, ZUNPKD820);
+  ADD_NDS32_BUILTIN1 ("v_zunpkd820", u_v2hi, u_v4qi, V_ZUNPKD820);
+  ADD_NDS32_BUILTIN1 ("zunpkd830", unsigned, unsigned, ZUNPKD830);
+  ADD_NDS32_BUILTIN1 ("v_zunpkd830", u_v2hi, u_v4qi, V_ZUNPKD830);
+  ADD_NDS32_BUILTIN1 ("zunpkd831", unsigned, unsigned, ZUNPKD831);
+  ADD_NDS32_BUILTIN1 ("v_zunpkd831", u_v2hi, u_v4qi, V_ZUNPKD831);
+
+  /* DSP Extension: 32bit Add and Subtract.  */
+  ADD_NDS32_BUILTIN2 ("raddw", integer, integer, integer, RADDW);
+  ADD_NDS32_BUILTIN2 ("uraddw", unsigned, unsigned, unsigned, URADDW);
+  ADD_NDS32_BUILTIN2 ("rsubw", integer, integer, integer, RSUBW);
+  ADD_NDS32_BUILTIN2 ("ursubw", unsigned, unsigned, unsigned, URSUBW);
+
+  /* DSP Extension: 32bit Shift.  */
+  ADD_NDS32_BUILTIN2 ("sra_u", integer, integer, unsigned, SRA_U);
+  ADD_NDS32_BUILTIN2 ("ksll", integer, integer, unsigned, KSLL);
+
+  /* DSP Extension: 16bit Packing.  */
+  ADD_NDS32_BUILTIN2 ("pkbb16", unsigned, unsigned, unsigned, PKBB16);
+  ADD_NDS32_BUILTIN2 ("v_pkbb16", u_v2hi, u_v2hi, u_v2hi, V_PKBB16);
+  ADD_NDS32_BUILTIN2 ("pkbt16", unsigned, unsigned, unsigned, PKBT16);
+  ADD_NDS32_BUILTIN2 ("v_pkbt16", u_v2hi, u_v2hi, u_v2hi, V_PKBT16);
+  ADD_NDS32_BUILTIN2 ("pktb16", unsigned, unsigned, unsigned, PKTB16);
+  ADD_NDS32_BUILTIN2 ("v_pktb16", u_v2hi, u_v2hi, u_v2hi, V_PKTB16);
+  ADD_NDS32_BUILTIN2 ("pktt16", unsigned, unsigned, unsigned, PKTT16);
+  ADD_NDS32_BUILTIN2 ("v_pktt16", u_v2hi, u_v2hi, u_v2hi, V_PKTT16);
+
+  /* DSP Extension: Signed MSW 32x32 Multiply and ADD.  */
+  ADD_NDS32_BUILTIN2 ("smmul", integer, integer, integer, SMMUL);
+  ADD_NDS32_BUILTIN2 ("smmul_u", integer, integer, integer, SMMUL_U);
+  ADD_NDS32_BUILTIN3 ("kmmac", integer, integer, integer, integer, KMMAC);
+  ADD_NDS32_BUILTIN3 ("kmmac_u", integer, integer, integer, integer, KMMAC_U);
+  ADD_NDS32_BUILTIN3 ("kmmsb", integer, integer, integer, integer, KMMSB);
+  ADD_NDS32_BUILTIN3 ("kmmsb_u", integer, integer, integer, integer, KMMSB_U);
+  ADD_NDS32_BUILTIN2 ("kwmmul", integer, integer, integer, KWMMUL);
+  ADD_NDS32_BUILTIN2 ("kwmmul_u", integer, integer, integer, KWMMUL_U);
+
+  /* DSP Extension: Most Significant Word 32x16 Multiply and ADD.  */
+  ADD_NDS32_BUILTIN2 ("smmwb", integer, integer, unsigned, SMMWB);
+  ADD_NDS32_BUILTIN2 ("v_smmwb", integer, integer, v2hi, V_SMMWB);
+  ADD_NDS32_BUILTIN2 ("smmwb_u", integer, integer, unsigned, SMMWB_U);
+  ADD_NDS32_BUILTIN2 ("v_smmwb_u", integer, integer, v2hi, V_SMMWB_U);
+  ADD_NDS32_BUILTIN2 ("smmwt", integer, integer, unsigned, SMMWT);
+  ADD_NDS32_BUILTIN2 ("v_smmwt", integer, integer, v2hi, V_SMMWT);
+  ADD_NDS32_BUILTIN2 ("smmwt_u", integer, integer, unsigned, SMMWT_U);
+  ADD_NDS32_BUILTIN2 ("v_smmwt_u", integer, integer, v2hi, V_SMMWT_U);
+  ADD_NDS32_BUILTIN3 ("kmmawb", integer, integer, integer, unsigned, KMMAWB);
+  ADD_NDS32_BUILTIN3 ("v_kmmawb", integer, integer, integer, v2hi, V_KMMAWB);
+  ADD_NDS32_BUILTIN3 ("kmmawb_u",
+		      integer, integer, integer, unsigned, KMMAWB_U);
+  ADD_NDS32_BUILTIN3 ("v_kmmawb_u",
+		      integer, integer, integer, v2hi, V_KMMAWB_U);
+  ADD_NDS32_BUILTIN3 ("kmmawt", integer, integer, integer, unsigned, KMMAWT);
+  ADD_NDS32_BUILTIN3 ("v_kmmawt", integer, integer, integer, v2hi, V_KMMAWT);
+  ADD_NDS32_BUILTIN3 ("kmmawt_u",
+		      integer, integer, integer, unsigned, KMMAWT_U);
+  ADD_NDS32_BUILTIN3 ("v_kmmawt_u",
+		      integer, integer, integer, v2hi, V_KMMAWT_U);
+
+  /* DSP Extension: Signed 16bit Multiply with ADD/Subtract.  */
+  ADD_NDS32_BUILTIN2 ("smbb", integer, unsigned, unsigned, SMBB);
+  ADD_NDS32_BUILTIN2 ("v_smbb", integer, v2hi, v2hi, V_SMBB);
+  ADD_NDS32_BUILTIN2 ("smbt", integer, unsigned, unsigned, SMBT);
+  ADD_NDS32_BUILTIN2 ("v_smbt", integer, v2hi, v2hi, V_SMBT);
+  ADD_NDS32_BUILTIN2 ("smtt", integer, unsigned, unsigned, SMTT);
+  ADD_NDS32_BUILTIN2 ("v_smtt", integer, v2hi, v2hi, V_SMTT);
+  ADD_NDS32_BUILTIN2 ("kmda", integer, unsigned, unsigned, KMDA);
+  ADD_NDS32_BUILTIN2 ("v_kmda", integer, v2hi, v2hi, V_KMDA);
+  ADD_NDS32_BUILTIN2 ("kmxda", integer, unsigned, unsigned, KMXDA);
+  ADD_NDS32_BUILTIN2 ("v_kmxda", integer, v2hi, v2hi, V_KMXDA);
+  ADD_NDS32_BUILTIN2 ("smds", integer, unsigned, unsigned, SMDS);
+  ADD_NDS32_BUILTIN2 ("v_smds", integer, v2hi, v2hi, V_SMDS);
+  ADD_NDS32_BUILTIN2 ("smdrs", integer, unsigned, unsigned, SMDRS);
+  ADD_NDS32_BUILTIN2 ("v_smdrs", integer, v2hi, v2hi, V_SMDRS);
+  ADD_NDS32_BUILTIN2 ("smxds", integer, unsigned, unsigned, SMXDS);
+  ADD_NDS32_BUILTIN2 ("v_smxds", integer, v2hi, v2hi, V_SMXDS);
+  ADD_NDS32_BUILTIN3 ("kmabb", integer, integer, unsigned, unsigned, KMABB);
+  ADD_NDS32_BUILTIN3 ("v_kmabb", integer, integer, v2hi, v2hi, V_KMABB);
+  ADD_NDS32_BUILTIN3 ("kmabt", integer, integer, unsigned, unsigned, KMABT);
+  ADD_NDS32_BUILTIN3 ("v_kmabt", integer, integer, v2hi, v2hi, V_KMABT);
+  ADD_NDS32_BUILTIN3 ("kmatt", integer, integer, unsigned, unsigned, KMATT);
+  ADD_NDS32_BUILTIN3 ("v_kmatt", integer, integer, v2hi, v2hi, V_KMATT);
+  ADD_NDS32_BUILTIN3 ("kmada", integer, integer, unsigned, unsigned, KMADA);
+  ADD_NDS32_BUILTIN3 ("v_kmada", integer, integer, v2hi, v2hi, V_KMADA);
+  ADD_NDS32_BUILTIN3 ("kmaxda", integer, integer, unsigned, unsigned, KMAXDA);
+  ADD_NDS32_BUILTIN3 ("v_kmaxda", integer, integer, v2hi, v2hi, V_KMAXDA);
+  ADD_NDS32_BUILTIN3 ("kmads", integer, integer, unsigned, unsigned, KMADS);
+  ADD_NDS32_BUILTIN3 ("v_kmads", integer, integer, v2hi, v2hi, V_KMADS);
+  ADD_NDS32_BUILTIN3 ("kmadrs", integer, integer, unsigned, unsigned, KMADRS);
+  ADD_NDS32_BUILTIN3 ("v_kmadrs", integer, integer, v2hi, v2hi, V_KMADRS);
+  ADD_NDS32_BUILTIN3 ("kmaxds", integer, integer, unsigned, unsigned, KMAXDS);
+  ADD_NDS32_BUILTIN3 ("v_kmaxds", integer, integer, v2hi, v2hi, V_KMAXDS);
+  ADD_NDS32_BUILTIN3 ("kmsda", integer, integer, unsigned, unsigned, KMSDA);
+  ADD_NDS32_BUILTIN3 ("v_kmsda", integer, integer, v2hi, v2hi, V_KMSDA);
+  ADD_NDS32_BUILTIN3 ("kmsxda", integer, integer, unsigned, unsigned, KMSXDA);
+  ADD_NDS32_BUILTIN3 ("v_kmsxda", integer, integer, v2hi, v2hi, V_KMSXDA);
+
+  /* DSP Extension: Signed 16bit Multiply with 64bit ADD/Subtract.  */
+  ADD_NDS32_BUILTIN2 ("smal", long_long_integer,
+		      long_long_integer, unsigned, SMAL);
+  ADD_NDS32_BUILTIN2 ("v_smal", long_long_integer,
+		      long_long_integer, v2hi, V_SMAL);
+
+  /* DSP Extension: 32bit MISC.  */
+  ADD_NDS32_BUILTIN2 ("bitrev", unsigned, unsigned, unsigned, BITREV);
+  ADD_NDS32_BUILTIN2 ("wext", unsigned, long_long_integer, unsigned, WEXT);
+  ADD_NDS32_BUILTIN3 ("bpick", unsigned, unsigned, unsigned, unsigned, BPICK);
+  ADD_NDS32_BUILTIN3 ("insb", unsigned, unsigned, unsigned, unsigned, INSB);
+
+  /* DSP Extension: 64bit Add and Subtract.  */
+  ADD_NDS32_BUILTIN2 ("sadd64", long_long_integer,
+		      long_long_integer, long_long_integer, SADD64);
+  ADD_NDS32_BUILTIN2 ("uadd64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, UADD64);
+  ADD_NDS32_BUILTIN2 ("radd64", long_long_integer,
+		      long_long_integer, long_long_integer, RADD64);
+  ADD_NDS32_BUILTIN2 ("uradd64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, URADD64);
+  ADD_NDS32_BUILTIN2 ("kadd64", long_long_integer,
+		      long_long_integer, long_long_integer, KADD64);
+  ADD_NDS32_BUILTIN2 ("ukadd64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, UKADD64);
+  ADD_NDS32_BUILTIN2 ("ssub64", long_long_integer,
+		      long_long_integer, long_long_integer, SSUB64);
+  ADD_NDS32_BUILTIN2 ("usub64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, USUB64);
+  ADD_NDS32_BUILTIN2 ("rsub64", long_long_integer,
+		      long_long_integer, long_long_integer, RSUB64);
+  ADD_NDS32_BUILTIN2 ("ursub64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, URSUB64);
+  ADD_NDS32_BUILTIN2 ("ksub64", long_long_integer,
+		      long_long_integer, long_long_integer, KSUB64);
+  ADD_NDS32_BUILTIN2 ("uksub64", long_long_unsigned,
+		      long_long_unsigned, long_long_unsigned, UKSUB64);
+
+  /* DSP Extension: 32bit Multiply with 64bit Add/Subtract.  */
+  ADD_NDS32_BUILTIN3 ("smar64", long_long_integer,
+		      long_long_integer, integer, integer, SMAR64);
+  ADD_NDS32_BUILTIN3 ("smsr64", long_long_integer,
+		      long_long_integer, integer, integer, SMSR64);
+  ADD_NDS32_BUILTIN3 ("umar64", long_long_unsigned,
+		      long_long_unsigned, unsigned, unsigned, UMAR64);
+  ADD_NDS32_BUILTIN3 ("umsr64", long_long_unsigned,
+		      long_long_unsigned, unsigned, unsigned, UMSR64);
+  ADD_NDS32_BUILTIN3 ("kmar64", long_long_integer,
+		      long_long_integer, integer, integer, KMAR64);
+  ADD_NDS32_BUILTIN3 ("kmsr64", long_long_integer,
+		      long_long_integer, integer, integer, KMSR64);
+  ADD_NDS32_BUILTIN3 ("ukmar64", long_long_unsigned,
+		      long_long_unsigned, unsigned, unsigned, UKMAR64);
+  ADD_NDS32_BUILTIN3 ("ukmsr64", long_long_unsigned,
+		      long_long_unsigned, unsigned, unsigned, UKMSR64);
+
+  /* DSP Extension: Signed 16bit Multiply with 64bit Add/Subtract.  */
+  ADD_NDS32_BUILTIN3 ("smalbb", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALBB);
+  ADD_NDS32_BUILTIN3 ("v_smalbb", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALBB);
+  ADD_NDS32_BUILTIN3 ("smalbt", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALBT);
+  ADD_NDS32_BUILTIN3 ("v_smalbt", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALBT);
+  ADD_NDS32_BUILTIN3 ("smaltt", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALTT);
+  ADD_NDS32_BUILTIN3 ("v_smaltt", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALTT);
+  ADD_NDS32_BUILTIN3 ("smalda", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALDA);
+  ADD_NDS32_BUILTIN3 ("v_smalda", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALDA);
+  ADD_NDS32_BUILTIN3 ("smalxda", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALXDA);
+  ADD_NDS32_BUILTIN3 ("v_smalxda", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALXDA);
+  ADD_NDS32_BUILTIN3 ("smalds", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALDS);
+  ADD_NDS32_BUILTIN3 ("v_smalds", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALDS);
+  ADD_NDS32_BUILTIN3 ("smaldrs", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALDRS);
+  ADD_NDS32_BUILTIN3 ("v_smaldrs", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALDRS);
+  ADD_NDS32_BUILTIN3 ("smalxds", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMALXDS);
+  ADD_NDS32_BUILTIN3 ("v_smalxds", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMALXDS);
+  ADD_NDS32_BUILTIN3 ("smslda", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMSLDA);
+  ADD_NDS32_BUILTIN3 ("v_smslda", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMSLDA);
+  ADD_NDS32_BUILTIN3 ("smslxda", long_long_integer,
+		      long_long_integer, unsigned, unsigned, SMSLXDA);
+  ADD_NDS32_BUILTIN3 ("v_smslxda", long_long_integer,
+		      long_long_integer, v2hi, v2hi, V_SMSLXDA);
+
+  /* DSP Extension: augmented baseline.  */
+  ADD_NDS32_BUILTIN2 ("uclip32", unsigned, integer, unsigned, UCLIP32);
+  ADD_NDS32_BUILTIN2 ("sclip32", integer, integer, unsigned, SCLIP32);
+  ADD_NDS32_BUILTIN1 ("kabs", integer, integer, KABS);
+
+  /* The builtin to turn off hwloop optimization.  */
+  ADD_NDS32_BUILTIN0 ("no_ext_zol", void, NO_HWLOOP);
+
+  /* DSP Extension: vector type unaligned Load/Store  */
+  ADD_NDS32_BUILTIN1 ("get_unaligned_u16x2", u_v2hi, ptr_ushort, UALOAD_U16);
+  ADD_NDS32_BUILTIN1 ("get_unaligned_s16x2", v2hi, ptr_short, UALOAD_S16);
+  ADD_NDS32_BUILTIN1 ("get_unaligned_u8x4", u_v4qi, ptr_uchar, UALOAD_U8);
+  ADD_NDS32_BUILTIN1 ("get_unaligned_s8x4", v4qi, ptr_char, UALOAD_S8);
+  ADD_NDS32_BUILTIN2 ("put_unaligned_u16x2", void, ptr_ushort,
+		      u_v2hi, UASTORE_U16);
+  ADD_NDS32_BUILTIN2 ("put_unaligned_s16x2", void, ptr_short,
+		      v2hi, UASTORE_S16);
+  ADD_NDS32_BUILTIN2 ("put_unaligned_u8x4", void, ptr_uchar,
+		      u_v4qi, UASTORE_U8);
+  ADD_NDS32_BUILTIN2 ("put_unaligned_s8x4", void, ptr_char,
+		      v4qi, UASTORE_S8);
+}
 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-intrinsic.md b/gcc/config/nds32/nds32-intrinsic.md
index 53876c5..6f8b3eb 100644
--- a/gcc/config/nds32/nds32-intrinsic.md
+++ b/gcc/config/nds32/nds32-intrinsic.md
@@ -40,6 +40,26 @@
    (set_attr "length"    "4")]
 )

+(define_expand "mtsr_isb"  ;; Expander: emit unspec_volatile_mtsr, then unspec_volatile_isb.
+  [(set (match_operand:SI 0 "register_operand" "")  ;; Operand 0: register, forwarded to the mtsr insn.
+	(match_operand:SI 1 "immediate_operand" ""))]  ;; Operand 1: immediate, forwarded to the mtsr insn.
+  ""  ;; Empty condition: the expander is always enabled.
+{
+  emit_insn (gen_unspec_volatile_mtsr (operands[0], operands[1]));
+  emit_insn (gen_unspec_volatile_isb());  /* Emitted directly after the mtsr.  */
+  DONE;  /* Only the insns emitted above are generated, not the template.  */
+})
+
+(define_expand "mtsr_dsb"  ;; Expander: emit unspec_volatile_mtsr, then unspec_dsb.
+  [(set (match_operand:SI 0 "register_operand" "")  ;; Operand 0: register, forwarded to the mtsr insn.
+	(match_operand:SI 1 "immediate_operand" ""))]  ;; Operand 1: immediate, forwarded to the mtsr insn.
+  ""  ;; Empty condition: the expander is always enabled.
+{
+  emit_insn (gen_unspec_volatile_mtsr (operands[0], operands[1]));
+  emit_insn (gen_unspec_dsb());  /* Emitted directly after the mtsr.  */
+  DONE;  /* Only the insns emitted above are generated, not the template.  */
+})
+
 (define_insn "unspec_volatile_mtsr"
   [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")
 			(match_operand:SI 1 "immediate_operand" "i")] UNSPEC_VOLATILE_MTSR)]
@@ -58,6 +78,74 @@
    (set_attr "length"    "4")]
 )

+;; FPU Register Transfer.
+
+(define_insn "unspec_fcpynsd"  ;; DFmode unspec that outputs "fcpynsd" on three registers.
+   [(set (match_operand:DF 0 "register_operand" "=f")  ;; Operand 0: result; the "f" constraint is defined outside this chunk.
+	 (unspec:DF [(match_operand:DF 1 "register_operand" "f")
+		     (match_operand:DF 2 "register_operand" "f")] UNSPEC_FCPYNSD))]
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fcpynsd\t%0, %1, %2"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fcpynss"  ;; SFmode unspec that outputs "fcpynss" on three registers.
+   [(set (match_operand:SF 0 "register_operand" "=f")  ;; Operand 0: result; the "f" constraint is defined outside this chunk.
+	 (unspec:SF [(match_operand:SF 1 "register_operand" "f")
+		     (match_operand:SF 2 "register_operand" "f")] UNSPEC_FCPYNSS))]
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fcpynss\t%0, %1, %2"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fcpysd"  ;; DFmode unspec that outputs "fcpysd" on three registers.
+   [(set (match_operand:DF 0 "register_operand" "=f")  ;; Operand 0: result; the "f" constraint is defined outside this chunk.
+	 (unspec:DF [(match_operand:DF 1 "register_operand" "f")
+		     (match_operand:DF 2 "register_operand" "f")] UNSPEC_FCPYSD))]
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fcpysd\t%0, %1, %2"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fcpyss"  ;; SFmode unspec that outputs "fcpyss" on three registers.
+   [(set (match_operand:SF 0 "register_operand" "=f")  ;; Operand 0: result; the "f" constraint is defined outside this chunk.
+	 (unspec:SF [(match_operand:SF 1 "register_operand" "f")
+		     (match_operand:SF 2 "register_operand" "f")] UNSPEC_FCPYSS))]
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fcpyss\t%0, %1, %2"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fmfcsr"  ;; Outputs "fmfcsr" with an SImode general register as destination.
+   [(set (match_operand:SI 0 "register_operand" "=r")
+	 (unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_FMFCSR))]  ;; No inputs: (const_int 0) is a placeholder operand.
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fmfcsr\t%0"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fmtcsr"  ;; Outputs "fmtcsr" with an SImode general register as its only operand.
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_FMTCSR)]  ;; No (set ...): operand 0 is an input only.
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fmtcsr\t%0"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_fmfcfg"  ;; Outputs "fmfcfg" with an SImode general register as destination.
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_FMFCFG))]  ;; No inputs: (const_int 0) is a placeholder operand.
+  ""  ;; NOTE(review): no target condition; presumably FPU availability is checked by the caller - confirm.
+  "fmfcfg\t%0"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
 ;; ------------------------------------------------------------------------

 ;; Interrupt Instructions.
@@ -76,6 +164,445 @@
   [(set_attr "type" "misc")]
 )

+(define_expand "unspec_enable_int"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")] UNSPEC_VOLATILE_ENABLE_INT)]
+  ""
+{
+  rtx system_reg;
+  rtx temp_reg = gen_reg_rtx (SImode);
+
+  /* Set system register from nds32_intrinsic_register_names[].  */
+  if ((INTVAL (operands[0]) >= NDS32_INT_H16)
+      && (INTVAL (operands[0]) <= NDS32_INT_H31))
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK2__);
+      operands[0] = GEN_INT (1 << (INTVAL (operands[0])));
+    }
+  else if ((INTVAL (operands[0]) >= NDS32_INT_H32)
+	   && (INTVAL (operands[0]) <= NDS32_INT_H63))
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK3__);
+      operands[0] = GEN_INT (1 << (INTVAL (operands[0]) - 32));
+    }
+  else
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK__);
+
+      if (INTVAL (operands[0]) == NDS32_INT_SWI)
+        operands[0] = GEN_INT (1 << 16);
+      else if ((INTVAL (operands[0]) >= NDS32_INT_ALZ)
+	       && (INTVAL (operands[0]) <= NDS32_INT_DSSIM))
+	operands[0] = GEN_INT (1 << (INTVAL (operands[0]) - 4));
+      else
+	operands[0] = GEN_INT (1 << (INTVAL (operands[0])));
+    }
+  /* Read-modify-write: OR the bit into the mask register, then dsb.  */
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_insn (gen_iorsi3 (temp_reg, temp_reg, operands[0]));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_disable_int"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")] UNSPEC_VOLATILE_DISABLE_INT)]
+  ""
+{
+  rtx system_reg;
+  rtx temp_reg = gen_reg_rtx (SImode);
+
+  /* Set system register from nds32_intrinsic_register_names[].  */
+  if ((INTVAL (operands[0]) >= NDS32_INT_H16)
+      && (INTVAL (operands[0]) <= NDS32_INT_H31))
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK2__);
+      operands[0] = GEN_INT (~(1 << INTVAL (operands[0])));
+    }
+  else if ((INTVAL (operands[0]) >= NDS32_INT_H32)
+	   && (INTVAL (operands[0]) <= NDS32_INT_H63))
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK3__);
+      operands[0] = GEN_INT (~(1 << (INTVAL (operands[0]) - 32)));
+    }
+  else
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_MASK__);
+
+      if (INTVAL (operands[0]) == NDS32_INT_SWI)
+        operands[0] = GEN_INT (~(1 << 16));
+      else if ((INTVAL (operands[0]) >= NDS32_INT_ALZ)
+	       && (INTVAL (operands[0]) <= NDS32_INT_DSSIM))
+	operands[0] = GEN_INT (~(1 << (INTVAL (operands[0]) - 4)));
+      else
+	operands[0] = GEN_INT (~(1 << INTVAL (operands[0])));
+    }
+  /* Read-modify-write: AND the bit out of the mask register, then dsb.  */
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_insn (gen_andsi3 (temp_reg, temp_reg, operands[0]));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_set_pending_swint"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_SET_PENDING_SWINT)]
+  ""
+{
+  /* Get $INT_PEND system register from nds32_intrinsic_register_names[].  */
+  rtx system_reg =  GEN_INT (__NDS32_REG_INT_PEND__);
+  rtx temp_reg = gen_reg_rtx (SImode);
+
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_insn (gen_iorsi3 (temp_reg, temp_reg, GEN_INT (65536)));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_clr_pending_swint"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_CLR_PENDING_SWINT)]
+  ""
+{
+  /* Get $INT_PEND system register from nds32_intrinsic_register_names[].  */
+  rtx system_reg =  GEN_INT (__NDS32_REG_INT_PEND__);
+  rtx temp_reg = gen_reg_rtx (SImode);
+
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_insn (gen_andsi3 (temp_reg, temp_reg, GEN_INT (~(1 << 16))));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_clr_pending_hwint"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")] UNSPEC_VOLATILE_CLR_PENDING_HWINT)]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx clr_hwint;
+  unsigned offset = 0;
+
+  /* Set system register from nds32_intrinsic_register_names[].  */
+  if ((INTVAL (operands[0]) >= NDS32_INT_H0)
+      && (INTVAL (operands[0]) <= NDS32_INT_H15))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND__);
+    }
+  else if ((INTVAL (operands[0]) >= NDS32_INT_H16)
+	   && (INTVAL (operands[0]) <= NDS32_INT_H31))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND2__);
+    }
+  else if ((INTVAL (operands[0]) >= NDS32_INT_H32)
+	   && (INTVAL (operands[0]) <= NDS32_INT_H63))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND3__);
+      offset = 32;
+    }
+  else
+    error ("__nds32__clr_pending_hwint not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      /* $INT_PEND type is write one clear.  Build the bit only for a valid
+	 interrupt number so the host-side shift count stays in range.  */
+      clr_hwint = GEN_INT (1 << (INTVAL (operands[0]) - offset));
+      emit_move_insn (temp_reg, clr_hwint);
+      emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+      emit_insn (gen_unspec_dsb ());
+    }
+  DONE;
+})
+
+(define_expand "unspec_get_all_pending_int"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_GET_ALL_PENDING_INT))]
+  ""
+{
+  emit_insn (gen_unspec_volatile_mfsr (operands[0],
+				       GEN_INT (__NDS32_REG_INT_PEND__)));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_get_pending_int"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SI [(match_operand:SI 1 "immediate_operand" "")] UNSPEC_VOLATILE_GET_PENDING_INT))]
+  ""
+{
+  rtx system_reg = NULL_RTX, bit_shift = NULL_RTX;
+
+  /* Select the pending register; bit_shift moves the wanted bit to bit 31.  */
+  if ((INTVAL (operands[1]) >= NDS32_INT_H0)
+      && (INTVAL (operands[1]) <= NDS32_INT_H15))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND__);
+      bit_shift = GEN_INT (31 - INTVAL (operands[1]));
+    }
+  else if (INTVAL (operands[1]) == NDS32_INT_SWI)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND__);
+      bit_shift = GEN_INT (15);
+    }
+  else if ((INTVAL (operands[1]) >= NDS32_INT_H16)
+	   && (INTVAL (operands[1]) <= NDS32_INT_H31))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND2__);
+      bit_shift = GEN_INT (31 - INTVAL (operands[1]));
+    }
+  else if ((INTVAL (operands[1]) >= NDS32_INT_H32)
+	   && (INTVAL (operands[1]) <= NDS32_INT_H63))
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_PEND3__);
+      bit_shift = GEN_INT (31 - (INTVAL (operands[1]) - 32));
+    }
+  else
+    error ("get_pending_int not support NDS32_INT_ALZ,"
+	   " NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  /* bit_shift is a local: the pattern only declares operands 0 and 1.  */
+  if (system_reg != NULL_RTX)
+    {
+      emit_insn (gen_unspec_volatile_mfsr (operands[0], system_reg));
+      emit_insn (gen_ashlsi3 (operands[0], operands[0], bit_shift));
+      emit_insn (gen_lshrsi3 (operands[0], operands[0], GEN_INT (31)));
+      emit_insn (gen_unspec_dsb ());
+    }
+  DONE;
+})
+
+(define_expand "unspec_set_int_priority"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")
+			(match_operand:SI 1 "immediate_operand" "")] UNSPEC_VOLATILE_SET_INT_PRIORITY)]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx priority = NULL_RTX;
+  rtx mask = NULL_RTX;
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx mask_reg = gen_reg_rtx (SImode);
+  rtx set_reg = gen_reg_rtx (SImode);
+  unsigned offset = 0;
+
+  /* Get system register from nds32_intrinsic_register_names[].  */
+  if (INTVAL (operands[0]) >= NDS32_INT_H0
+      && INTVAL (operands[0]) <= NDS32_INT_H15)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI__);
+    }
+  else if (INTVAL (operands[0]) >= NDS32_INT_H16
+	   && INTVAL (operands[0]) <= NDS32_INT_H31)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI2__);
+      /* The $INT_PRI2 first field corresponds to H16, so we need to
+	 subtract 16.  */
+      offset = 16;
+    }
+  else if (INTVAL (operands[0]) >= NDS32_INT_H32
+	   && INTVAL (operands[0]) <= NDS32_INT_H47)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI3__);
+      /* The $INT_PRI3 first field corresponds to H32, so we need to
+	 subtract 32.  */
+      offset = 32;
+    }
+  else if (INTVAL (operands[0]) >= NDS32_INT_H48
+	   && INTVAL (operands[0]) <= NDS32_INT_H63)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI4__);
+      /* The $INT_PRI4 first field corresponds to H48, so we need to
+	 subtract 48.  */
+      offset = 48;
+    }
+  else
+    error ("set_int_priority not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      /* Shift only for a valid interrupt; each priority is a 2-bit field.  */
+      mask = GEN_INT (~(3 << 2 * (INTVAL (operands[0]) - offset)));
+      priority = GEN_INT ((int) (INTVAL (operands[1])
+				 << ((INTVAL (operands[0]) - offset) * 2)));
+      emit_move_insn (mask_reg, mask);
+      emit_move_insn (set_reg, priority);
+      emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+      emit_insn (gen_andsi3 (temp_reg, temp_reg, mask_reg));
+      emit_insn (gen_iorsi3 (temp_reg, temp_reg, set_reg));
+      emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+      emit_insn (gen_unspec_dsb ());
+    }
+  DONE;
+})
+
+(define_expand "unspec_get_int_priority"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SI [(match_operand:SI 1 "immediate_operand" "")] UNSPEC_VOLATILE_GET_INT_PRIORITY))]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx priority = NULL_RTX;
+  unsigned offset = 0;
+
+  /* Get system register from nds32_intrinsic_register_names[].  */
+  if (INTVAL (operands[1]) >= NDS32_INT_H0
+      && INTVAL (operands[1]) <= NDS32_INT_H15)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI__);
+    }
+  else if (INTVAL (operands[1]) >= NDS32_INT_H16
+	   && INTVAL (operands[1]) <= NDS32_INT_H31)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI2__);
+      /* The $INT_PRI2 first field corresponds to H16, so we need to
+	 subtract 16.  */
+      offset = 16;
+    }
+  else if (INTVAL (operands[1]) >= NDS32_INT_H32
+	   && INTVAL (operands[1]) <= NDS32_INT_H47)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI3__);
+      /* The $INT_PRI3 first field corresponds to H32, so we need to
+	 subtract 32.  */
+      offset = 32;
+    }
+  else if (INTVAL (operands[1]) >= NDS32_INT_H48
+	   && INTVAL (operands[1]) <= NDS32_INT_H63)
+    {
+      system_reg =  GEN_INT (__NDS32_REG_INT_PRI4__);
+      /* The $INT_PRI4 first field corresponds to H48, so we need to
+	 subtract 48.  */
+      offset = 48;
+    }
+  else
+    error ("get_int_priority not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      /* Move the 2-bit field [2k+1:2k] to bits 31:30, then down to 1:0.  */
+      priority = GEN_INT (30 - 2 * (INTVAL (operands[1]) - offset));
+      emit_insn (gen_unspec_volatile_mfsr (operands[0], system_reg));
+      emit_insn (gen_ashlsi3 (operands[0], operands[0], priority));
+      emit_insn (gen_lshrsi3 (operands[0], operands[0], GEN_INT (30)));
+      emit_insn (gen_unspec_dsb ());
+    }
+  DONE;
+})
+
+(define_expand "unspec_set_trig_level"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")] UNSPEC_VOLATILE_SET_TRIG_LEVEL)]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx set_level;
+  unsigned offset = 0;
+
+  if (INTVAL (operands[0]) >= NDS32_INT_H0
+      && INTVAL (operands[0]) <= NDS32_INT_H31)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER__);
+      offset = 0;
+    }
+  else if (INTVAL (operands[0]) >= NDS32_INT_H32
+	   && INTVAL (operands[0]) <= NDS32_INT_H63)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER2__);
+      offset = 32;
+    }
+  else
+    error ("__nds32__set_trig_type_level not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      /* In the TRIGGER register, 0 means level triggered, 1 edge triggered.  */
+      set_level = GEN_INT (~(1 << (INTVAL (operands[0]) - offset)));
+
+      emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+      emit_insn (gen_andsi3 (temp_reg, temp_reg, set_level));
+      emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+    }
+  DONE;
+})
+
+(define_expand "unspec_set_trig_edge"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "")] UNSPEC_VOLATILE_SET_TRIG_EDGE)]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx set_level;
+  unsigned offset = 0;
+
+  if (INTVAL (operands[0]) >= NDS32_INT_H0
+      && INTVAL (operands[0]) <= NDS32_INT_H31)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER__);
+      offset = 0;
+    }
+  else if (INTVAL (operands[0]) >= NDS32_INT_H32
+	   && INTVAL (operands[0]) <= NDS32_INT_H63)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER2__);
+      offset = 32;
+    }
+  else
+    error ("__nds32__set_trig_type_edge not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      /* In the TRIGGER register, 0 means level triggered, 1 edge triggered.  */
+      set_level = GEN_INT ((1 << (INTVAL (operands[0]) - offset)));
+
+      emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+      emit_insn (gen_iorsi3 (temp_reg, temp_reg, set_level));
+      emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+    }
+  DONE;
+})
+
+(define_expand "unspec_get_trig_type"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SI [(match_operand:SI 1 "immediate_operand" "")] UNSPEC_VOLATILE_GET_TRIG_TYPE))]
+  ""
+{
+  rtx system_reg = NULL_RTX;
+  rtx trig_type;
+  unsigned offset = 0;
+
+  if (INTVAL (operands[1]) >= NDS32_INT_H0
+      && INTVAL (operands[1]) <= NDS32_INT_H31)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER__);
+      offset = 0;
+    }
+  else if (INTVAL (operands[1]) >= NDS32_INT_H32
+	   && INTVAL (operands[1]) <= NDS32_INT_H63)
+    {
+      system_reg = GEN_INT (__NDS32_REG_INT_TRIGGER2__);
+      offset = 32;
+    }
+  else
+    error ("__nds32__get_trig_type not support NDS32_INT_SWI,"
+	   " NDS32_INT_ALZ, NDS32_INT_IDIVZE, NDS32_INT_DSSIM");
+
+  if (system_reg != NULL_RTX)
+    {
+      trig_type = GEN_INT (31 - (INTVAL (operands[1]) - offset));
+      /* Move the selected trigger bit to bit 31, then down to bit 0.  */
+      emit_insn (gen_unspec_volatile_mfsr (operands[0], system_reg));
+      emit_insn (gen_ashlsi3 (operands[0], operands[0], trig_type));
+      emit_insn (gen_lshrsi3 (operands[0], operands[0], GEN_INT (31)));
+      emit_insn (gen_unspec_dsb ());
+    }
+  DONE;
+})
+
 ;; ------------------------------------------------------------------------

 ;; Cache Synchronization Instructions
@@ -84,7 +611,7 @@
   [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_ISYNC)]
   ""
   "isync\t%0"
-  [(set_attr "type" "misc")]
+  [(set_attr "type" "mmu")]
 )

 (define_insn "unspec_volatile_isb"
@@ -94,4 +621,1077 @@
   [(set_attr "type" "misc")]
 )

+(define_insn "unspec_dsb"
+  [(unspec_volatile [(const_int 0)] UNSPEC_VOLATILE_DSB)]
+  ""
+  "dsb"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_msync"
+  [(unspec_volatile [(match_operand:SI 0 "immediate_operand" "i")] UNSPEC_VOLATILE_MSYNC)]
+  ""
+  "msync\t%0"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_msync_all"
+  [(unspec_volatile [(const_int 0)] UNSPEC_VOLATILE_MSYNC_ALL)]
+  ""
+  "msync\tall"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_msync_store"
+  [(unspec_volatile [(const_int 0)] UNSPEC_VOLATILE_MSYNC_STORE)]
+  ""
+  "msync\tstore"
+  [(set_attr "type" "misc")]
+)
+
+;; Load and Store
+
+(define_insn "unspec_volatile_llw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(mem:SI (plus:SI (match_operand:SI 1 "register_operand" "r")
+					      (match_operand:SI 2 "register_operand" "r")))] UNSPEC_VOLATILE_LLW))]
+  ""
+  "llw\t%0, [%1 + %2]"
+  [(set_attr "length"    "4")]
+)
+
+(define_insn "unspec_lwup"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(mem:SI (plus:SI (match_operand:SI 1 "register_operand" "r")
+					      (match_operand:SI 2 "register_operand" "r")))] UNSPEC_LWUP))]
+  ""
+  "lwup\t%0, [%1 + %2]"
+  [(set_attr "length"    "4")]
+)
+
+(define_insn "unspec_lbup"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(mem:SI (plus:SI (match_operand:SI 1 "register_operand" "r")
+					      (match_operand:SI 2 "register_operand" "r")))] UNSPEC_LBUP))]
+  ""
+  "lbup\t%0, [%1 + %2]"
+  [(set_attr "length"    "4")]
+)
+
+(define_insn "unspec_volatile_scw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(mem:SI (plus:SI (match_operand:SI 1 "register_operand" "r")
+					      (match_operand:SI 2 "register_operand" "r")))
+			     (match_operand:SI 3 "register_operand" "0")] UNSPEC_VOLATILE_SCW))]
+  ""
+  "scw\t%0, [%1 + %2]"
+  [(set_attr "length"     "4")]
+)
+
+(define_insn "unspec_swup"
+  [(set (mem:SI (plus:SI (match_operand:SI 0 "register_operand" "r")
+			 (match_operand:SI 1 "register_operand" "r")))
+	(unspec:SI [(match_operand:SI 2 "register_operand" "r")] UNSPEC_SWUP))]
+  ""
+  "swup\t%2, [%0 + %1]"
+  [(set_attr "length"     "4")]
+)
+
+(define_insn "unspec_sbup"
+  [(set (mem:SI (plus:SI (match_operand:SI 0 "register_operand" "r")
+			 (match_operand:SI 1 "register_operand" "r")))
+	(unspec:SI [(match_operand:SI 2 "register_operand" "r")] UNSPEC_SBUP))]
+  ""
+  "sbup\t%2, [%0 + %1]"
+  [(set_attr "length"     "4")]
+)
+
+;; CCTL
+
+(define_insn "cctl_l1d_invalall"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_CCTL_L1D_INVALALL)]
+  ""
+  "cctl\tL1D_INVALALL"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_l1d_wball_alvl"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_CCTL_L1D_WBALL_ALVL)]
+  ""
+  "cctl\tL1D_WBALL, alevel"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_l1d_wball_one_lvl"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_CCTL_L1D_WBALL_ONE_LVL)]
+  ""
+  "cctl\tL1D_WBALL, 1level"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_idx_read"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(match_operand:SI 1 "immediate_operand" "i")
+			     (match_operand:SI 2 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_IDX_READ))]
+  ""
+  "cctl\t%0, %2, %X1"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_idx_write"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")
+			(match_operand:SI 1 "register_operand" "r")
+			(match_operand:SI 2 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_IDX_WRITE)]
+  ""
+  "cctl\t%1, %2, %W0"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_va_wbinval_l1"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")
+			(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_VA_WBINVAL_L1)]
+  ""
+  "cctl\t%1, %U0, 1level"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_va_wbinval_la"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")
+			(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_VA_WBINVAL_LA)]
+  ""
+  "cctl\t%1, %U0, alevel"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_idx_wbinval"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")
+			(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_IDX_WBINVAL)]
+  ""
+  "cctl\t%1, %T0"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "cctl_va_lck"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")
+			(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_CCTL_VA_LCK)]
+  ""
+  "cctl\t%1, %R0"
+  [(set_attr "type" "mmu")]
+)
+
+;; PREFETCH
+
+(define_insn "prefetch_qw"
+  [(unspec_volatile:QI [(match_operand:SI 0 "register_operand" "r")
+			(match_operand:SI 1 "nonmemory_operand" "r")
+			(match_operand:SI 2 "immediate_operand" "i")] UNSPEC_VOLATILE_DPREF_QW)]
+  ""
+  "dpref\t%Z2, [%0 + %1]"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "prefetch_hw"
+  [(unspec_volatile:HI [(match_operand:SI 0 "register_operand" "r")
+			(match_operand:SI 1 "nonmemory_operand" "r")
+			(match_operand:SI 2 "immediate_operand" "i")] UNSPEC_VOLATILE_DPREF_HW)]
+  ""
+  "dpref\t%Z2, [%0 + (%1<<1)]"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "prefetch_w"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "    r, r")
+			(match_operand:SI 1 "nonmemory_operand" "Is15, r")
+			(match_operand:SI 2 "immediate_operand" "   i, i")] UNSPEC_VOLATILE_DPREF_W)]
+  ""
+  "@
+  dprefi.w\t%Z2, [%0 + %1]
+  dpref\t%Z2, [%0 + (%1<<2)]"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "prefetch_dw"
+  [(unspec_volatile:DI [(match_operand:SI 0 "register_operand"  "   r, r")
+			(match_operand:SI 1 "nonmemory_operand" "Is15, r")
+			(match_operand:SI 2 "immediate_operand" "   i, i")] UNSPEC_VOLATILE_DPREF_DW)]
+  ""
+  "@
+  dprefi.d\t%Z2, [%0 + %1]
+  dpref\t%Z2, [%0 + (%1<<3)]"
+  [(set_attr "type" "misc")]
+)
+
+;; Performance Extension
+
+(define_expand "unspec_ave"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "register_operand" "")]
+  ""
+{
+  emit_insn (gen_ave (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "unspec_bclr"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  ""
+{
+  unsigned HOST_WIDE_INT val = ~(1u << UINTVAL (operands[2]));
+  emit_insn (gen_andsi3 (operands[0], operands[1], gen_int_mode (val, SImode)));
+  DONE;
+})
+
+(define_expand "unspec_bset"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  ""
+{
+  unsigned HOST_WIDE_INT val = 1u << UINTVAL (operands[2]);
+  emit_insn (gen_iorsi3 (operands[0], operands[1], gen_int_mode (val, SImode)));
+  DONE;
+})
+
+(define_expand "unspec_btgl"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  ""
+{
+  unsigned HOST_WIDE_INT val = 1u << UINTVAL (operands[2]);
+  emit_insn (gen_xorsi3 (operands[0], operands[1], gen_int_mode (val, SImode)));
+  DONE;
+})
+
+(define_expand "unspec_btst"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")]
+  ""
+{
+  emit_insn (gen_btst (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "unspec_clip"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_CLIP))]
+  ""
+  "clip\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "unspec_clips"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "immediate_operand" "i")] UNSPEC_CLIPS))]
+  ""
+  "clips\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "unspec_clo"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_CLO))]
+  ""
+  "clo\t%0, %1"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "unspec_ssabssi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ss_abs:SI (match_operand:SI 1 "register_operand" "r")))]
+  ""
+  "abs\t%0, %1"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+;; Performance extension 2
+
+(define_insn "unspec_pbsad"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_PBSAD))]
+  ""
+  "pbsad\t%0, %1, %2"
+  [(set_attr "type" "pbsad")
+   (set_attr "length"   "4")]
+)
+
+(define_insn "unspec_pbsada"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "0")
+		    (match_operand:SI 2 "register_operand" "r")
+		    (match_operand:SI 3 "register_operand" "r")] UNSPEC_PBSADA))]
+  ""
+  "pbsada\t%0, %2, %3"
+  [(set_attr "type" "pbsada")
+   (set_attr "length"    "4")]
+)
+
+(define_expand "bse"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "register_operand" "")]
+  ""
+  {
+    rtx temp0 = gen_reg_rtx (SImode);
+    rtx temp2 = gen_reg_rtx (SImode);
+
+    emit_move_insn (temp0, gen_rtx_MEM (Pmode, operands[0]));
+    emit_move_insn (temp2, gen_rtx_MEM (Pmode, operands[2]));
+    emit_insn (gen_unspec_bse (temp0, operands[1], temp2, temp0, temp2));
+    emit_move_insn (gen_rtx_MEM (Pmode, operands[0]), temp0);
+    emit_move_insn (gen_rtx_MEM (Pmode, operands[2]), temp2);
+    DONE;
+  }
+)
+
+(define_insn "unspec_bse"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")
+		    (match_operand:SI 3 "register_operand" "0")] UNSPEC_BSE))
+   (set (match_operand:SI 4 "register_operand" "=2")
+	(unspec:SI [(match_dup 1)
+		    (match_dup 2)
+		    (match_dup 0)] UNSPEC_BSE_2))]
+  ""
+  "bse\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_expand "bsp"
+  [(match_operand:SI 0 "register_operand" "")
+   (match_operand:SI 1 "register_operand" "")
+   (match_operand:SI 2 "register_operand" "")]
+  ""
+  {
+    rtx temp0 = gen_reg_rtx (SImode);
+    rtx temp2 = gen_reg_rtx (SImode);
+
+    emit_move_insn (temp0, gen_rtx_MEM (Pmode, operands[0]));
+    emit_move_insn (temp2, gen_rtx_MEM (Pmode, operands[2]));
+    emit_insn (gen_unspec_bsp (temp0, operands[1], temp2, temp0, temp2));
+    emit_move_insn (gen_rtx_MEM (Pmode, operands[0]), temp0);
+    emit_move_insn (gen_rtx_MEM (Pmode, operands[2]), temp2);
+    DONE;
+  }
+)
+
+(define_insn "unspec_bsp"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")
+		    (match_operand:SI 3 "register_operand" "0")] UNSPEC_BSP))
+   (set (match_operand:SI 4 "register_operand" "=2")
+	(unspec:SI [(match_dup 1)
+		    (match_dup 2)
+		    (match_dup 0)] UNSPEC_BSP_2))]
+  ""
+  "bsp\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+;; String Extension
+
+(define_insn "unspec_ffb"
+  [(set (match_operand:SI 0 "register_operand" "=r, r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r, r")
+		    (match_operand:SI 2 "nonmemory_operand" "Iu08, r")] UNSPEC_FFB))]
+  ""
+  "@
+  ffbi\t%0, %1, %2
+  ffb\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "unspec_ffmism"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_FFMISM))]
+  ""
+  "ffmism\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_insn "unspec_flmism"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_FLMISM))]
+  ""
+  "flmism\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+;; SATURATION
+
+(define_insn "unspec_kaddw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ss_plus:SI (match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")))]
+  ""
+  "kaddw\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_ksubw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ss_minus:SI (match_operand:SI 1 "register_operand" "r")
+		     (match_operand:SI 2 "register_operand" "r")))]
+  ""
+  "ksubw\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kaddh"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(plus:SI (match_operand:SI 1 "register_operand" "r")
+			     (match_operand:SI 2 "register_operand" "r"))
+		    (const_int 15)] UNSPEC_CLIPS))]
+  ""
+  "kaddh\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_ksubh"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(minus:SI (match_operand:SI 1 "register_operand" "r")
+			      (match_operand:SI 2 "register_operand" "r"))
+		    (const_int 15)] UNSPEC_CLIPS))]
+  ""
+  "ksubh\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kdmbb"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KDMBB))]
+  ""
+  "kdmbb\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kdmbt"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KDMBT))]
+  ""
+  "kdmbt\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kdmtb"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KDMTB))]
+  ""
+  "kdmtb\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kdmtt"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KDMTT))]
+  ""
+  "kdmtt\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_khmbb"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KHMBB))]
+  ""
+  "khmbb\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_khmbt"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KHMBT))]
+  ""
+  "khmbt\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_khmtb"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KHMTB))]
+  ""
+  "khmtb\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_khmtt"
+  [(set (match_operand:V2HI 0 "register_operand" "=r")
+	(unspec:V2HI [(match_operand:V2HI 1 "register_operand" "r")
+		      (match_operand:V2HI 2 "register_operand" "r")] UNSPEC_KHMTT))]
+  ""
+  "khmtt\t%0, %1, %2"
+  [(set_attr "type"    "mul")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kslraw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_KSLRAW))]
+  ""
+  "kslraw\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_kslrawu"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_KSLRAWU))]
+  ""
+  "kslraw.u\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_volatile_rdov"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_RDOV))]
+  ""
+  "rdov\t%0"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_volatile_clrov"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_CLROV)]
+  ""
+  "clrov"
+  [(set_attr "type"   "misc")
+   (set_attr "length"    "4")]
+)
+
+;; System
+
+(define_insn "unspec_sva"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_SVA))]
+  ""
+  "sva\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_svs"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")
+		    (match_operand:SI 2 "register_operand" "r")] UNSPEC_SVS))]
+  ""
+  "svs\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_insn "unspec_jr_itoff"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_JR_ITOFF)]
+  ""
+  "jr.itoff\t%0"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_jr_toff"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_JR_TOFF)]
+  ""
+  "jr.toff\t%0"
+  [(set_attr "type" "branch")]
+)
+
+(define_insn "unspec_jral_iton"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_JRAL_ITON)]
+  ""
+  "jral.iton\t%0"
+  [(set_attr "type" "branch")]
+)
+
+(define_insn "unspec_jral_ton"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_JRAL_TON)]
+  ""
+  "jral.ton\t%0"
+  [(set_attr "type" "branch")]
+)
+
+(define_insn "unspec_ret_itoff"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_RET_ITOFF)]
+  ""
+  "ret.itoff\t%0"
+  [(set_attr "type" "branch")]
+)
+
+(define_insn "unspec_ret_toff"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_RET_TOFF)]
+  ""
+  "ret.toff\t%0"
+  [(set_attr "type" "branch")]
+)
+
+(define_insn "unspec_standby_no_wake_grant"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_STANDBY_NO_WAKE_GRANT)]
+  ""
+  "standby\tno_wake_grant"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_standby_wake_grant"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_STANDBY_WAKE_GRANT)]
+  ""
+  "standby\twake_grant"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_standby_wait_done"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_STANDBY_WAKE_DONE)]
+  ""
+  "standby\twait_done"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_teqz"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")
+			(match_operand:SI 1 "immediate_operand" "i")] UNSPEC_VOLATILE_TEQZ)]
+  ""
+  "teqz\t%0, %1"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_tnez"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")
+			(match_operand:SI 1 "immediate_operand" "i")] UNSPEC_VOLATILE_TNEZ)]
+  ""
+  "tnez\t%0, %1"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_trap"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")] UNSPEC_VOLATILE_TRAP)]
+  ""
+  "trap\t%0"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_setend_big"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_SETEND_BIG)]
+  ""
+  "setend.b"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_setend_little"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_SETEND_LITTLE)]
+  ""
+  "setend.l"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_break"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")] UNSPEC_VOLATILE_BREAK)]
+  ""
+  "break\t%0"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_syscall"
+  [(unspec_volatile:SI [(match_operand:SI 0 "immediate_operand" "i")] UNSPEC_VOLATILE_SYSCALL)]
+  ""
+  "syscall\t%0"
+  [(set_attr "type" "misc")]
+)
+
+(define_insn "unspec_nop"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_NOP)]
+  ""
+  "nop"
+  [(set_attr "type" "misc")]
+)
+
+(define_expand "unspec_get_current_sp"
+  [(match_operand:SI 0 "register_operand" "")]
+  ""
+{
+  emit_move_insn (operands[0], gen_rtx_REG (SImode, SP_REGNUM));
+  DONE;
+})
+
+(define_expand "unspec_set_current_sp"
+  [(match_operand:SI 0 "register_operand" "")]
+  ""
+{
+  emit_move_insn (gen_rtx_REG (SImode, SP_REGNUM), operands[0]);
+  DONE;
+})
+
+(define_expand "unspec_return_address"
+  [(match_operand:SI 0 "register_operand" "")]
+  ""
+{
+  emit_move_insn (operands[0], gen_rtx_REG (SImode, LP_REGNUM));
+  DONE;
+})
+
+(define_insn "unspec_signature_begin"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_SIGNATURE_BEGIN)]
+  ""
+  "isps"
+  [(set_attr "length" "4")]
+)
+
+(define_insn "unspec_signature_end"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_SIGNATURE_END)]
+  ""
+  "! -----\;.signature_end\;j8 2\;! -----"
+  [(set_attr "length" "2")]
+)
+
+;; Swap
+
+(define_insn "unspec_wsbh"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_WSBH))]
+  ""
+  "wsbh\t%0, %1"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+;; TLBOP Intrinsic
+
+(define_insn "unspec_tlbop_trd"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_TRD)]
+  ""
+  "tlbop\t%0, TRD"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_twr"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_TWR)]
+  ""
+  "tlbop\t%0, TWR"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_rwr"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_RWR)]
+  ""
+  "tlbop\t%0, RWR"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_rwlk"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_RWLK)]
+  ""
+  "tlbop\t%0, RWLK"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_unlk"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_UNLK)]
+  ""
+  "tlbop\t%0, UNLK"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_pb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec_volatile:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_PB))]
+  ""
+  "tlbop\t%0, %1, PB"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_inv"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_TLBOP_INV)]
+  ""
+  "tlbop\t%0, INV"
+  [(set_attr "type" "mmu")]
+)
+
+(define_insn "unspec_tlbop_flua"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_TLBOP_FLUA)]
+  ""
+  "tlbop\tFLUA"
+  [(set_attr "type" "mmu")]
+)
+
+;; Unaligned Load/Store
+
+(define_expand "unaligned_load_hw"
+  [(set (match_operand:HI 0 "register_operand" "")
+	(unspec:HI [(mem:HI (match_operand:SI 1 "register_operand" ""))] UNSPEC_UALOAD_HW))]
+  ""
+{
+  operands[0] = simplify_gen_subreg (SImode, operands[0],
+				     GET_MODE (operands[0]), 0);
+  if (TARGET_ISA_V3M)
+    {
+      nds32_expand_unaligned_load (operands, HImode);
+    }
+  else
+    {
+      emit_insn (gen_unaligned_load_w (operands[0],
+				       gen_rtx_MEM (SImode, operands[1])));
+
+      if (WORDS_BIG_ENDIAN)
+	emit_insn (gen_lshrsi3 (operands[0], operands[0], GEN_INT(16)));
+      else
+	emit_insn (gen_andsi3 (operands[0], operands[0], GEN_INT (0xffff)));
+    }
+
+  DONE;
+})
+
+(define_expand "unaligned_loadsi"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "r"))] UNSPEC_UALOAD_W))]
+  ""
+{
+  /* Three possible expansions; the MEM is built only on the two
+     paths that use it.  */
+  if (!flag_unaligned_access && TARGET_ISA_V3M)
+    nds32_expand_unaligned_load (operands, SImode);
+  else
+    {
+      rtx word_mem = gen_rtx_MEM (SImode, operands[1]);
+      if (flag_unaligned_access)
+	/* flag_unaligned_access is set: emit an ordinary move.  */
+	emit_move_insn (operands[0], word_mem);
+      else
+	emit_insn (gen_unaligned_load_w (operands[0], word_mem));
+    }
+  DONE;
+})
+
+(define_insn "unaligned_load_w"
+  [(set (match_operand:SI 0 "register_operand"                       "=  r")
+	(unspec:SI [(match_operand:SI 1 "nds32_lmw_smw_base_operand" " Umw")] UNSPEC_UALOAD_W))]
+  ""
+{
+  return nds32_output_lmw_single_word (operands);
+}
+  [(set_attr "type"   "load")
+   (set_attr "length"    "4")]
+)
+
+(define_expand "unaligned_loaddi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(mem:DI (match_operand:SI 1 "register_operand" "r"))] UNSPEC_UALOAD_DW))]
+  ""
+{
+  /* V3M goes through nds32_expand_unaligned_load; any other ISA
+     emits the unaligned_load_dw insn.  */
+  if (!TARGET_ISA_V3M)
+    emit_insn (gen_unaligned_load_dw (operands[0], operands[1]));
+  else
+    nds32_expand_unaligned_load (operands, DImode);
+  DONE;
+})
+
+(define_insn "unaligned_load_dw"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(mem:DI (match_operand:SI 1 "register_operand" "r"))] UNSPEC_UALOAD_DW))]
+  ""
+{
+  rtx otherops[3];
+  otherops[0] = gen_rtx_REG (SImode, REGNO (operands[0]));
+  otherops[1] = gen_rtx_REG (SImode, REGNO (operands[0]) + 1);
+  otherops[2] = operands[1];
+
+  output_asm_insn ("lmw.bi\t%0, [%2], %1, 0", otherops);
+  return "";
+}
+  [(set_attr "type"   "load")
+   (set_attr "length"    "4")]
+)
+
+(define_expand "unaligned_store_hw"
+  [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
+	(unspec:HI [(match_operand:HI 1 "register_operand" "")] UNSPEC_UASTORE_HW))]
+  ""
+{
+  operands[1] = simplify_gen_subreg (SImode, operands[1],
+				     GET_MODE (operands[1]), 0);
+  nds32_expand_unaligned_store (operands, HImode);
+  DONE;
+})
+
+(define_expand "unaligned_storesi"
+  [(set (mem:SI (match_operand:SI 0 "register_operand" "r"))
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_UASTORE_W))]
+  ""
+{
+  /* Three possible expansions; the MEM is built only on the two
+     paths that use it.  */
+  if (!flag_unaligned_access && TARGET_ISA_V3M)
+    nds32_expand_unaligned_store (operands, SImode);
+  else
+    {
+      rtx word_mem = gen_rtx_MEM (SImode, operands[0]);
+      if (flag_unaligned_access)
+	/* flag_unaligned_access is set: emit an ordinary move.  */
+	emit_move_insn (word_mem, operands[1]);
+      else
+	emit_insn (gen_unaligned_store_w (word_mem, operands[1]));
+    }
+  DONE;
+})
+
+(define_insn "unaligned_store_w"
+  [(set (match_operand:SI 0 "nds32_lmw_smw_base_operand"   "=Umw")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "   r")] UNSPEC_UASTORE_W))]
+  ""
+{
+  return nds32_output_smw_single_word (operands);
+}
+  [(set_attr "type"   "store")
+   (set_attr "length"     "4")]
+)
+
+(define_expand "unaligned_storedi"
+  [(set (mem:DI (match_operand:SI 0 "register_operand" "r"))
+	(unspec:DI [(match_operand:DI 1 "register_operand" "r")] UNSPEC_UASTORE_DW))]
+  ""
+{
+  if (TARGET_ISA_V3M)
+    nds32_expand_unaligned_store (operands, DImode);
+  else
+    emit_insn (gen_unaligned_store_dw (gen_rtx_MEM (DImode, operands[0]),
+				       operands[1]));
+  DONE;
+})
+
+(define_insn "unaligned_store_dw"
+  [(set (match_operand:DI 0 "nds32_lmw_smw_base_operand"   "=Umw")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "   r")] UNSPEC_UASTORE_DW))]
+  ""
+{
+  return nds32_output_smw_double_word (operands);
+}
+  [(set_attr "type"   "store")
+   (set_attr "length"     "4")]
+)
+
+(define_expand "unspec_unaligned_feature"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_UNALIGNED_FEATURE))]
+  ""
+{
+  /* Get $MMU_CTL system register from nds32_intrinsic_register_names[].  */
+  rtx system_reg =  GEN_INT (__NDS32_REG_MMU_CTL__);
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx temp2_reg = gen_reg_rtx (SImode);
+  /* Keep the mfsr result in temp_reg; pass it to mtsr with bit 23 set.  */
+  emit_insn (gen_unspec_volatile_mfsr (operands[0], system_reg));
+  emit_move_insn (temp_reg, operands[0]);
+  emit_move_insn (temp2_reg, GEN_INT (0x800 << 12));
+  emit_insn (gen_iorsi3 (operands[0], operands[0], temp2_reg));
+  emit_insn (gen_unspec_volatile_mtsr (operands[0], system_reg));
+  emit_insn (gen_unspec_dsb ());
+  /* Fetch the register again into operands[0], then put temp_reg back.  */
+  emit_insn (gen_unspec_volatile_mfsr (operands[0], system_reg));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  /* Leave only bit 23 in operands[0]: shift it to bit 31, then to bit 0.  */
+  emit_insn (gen_ashlsi3 (operands[0], operands[0], GEN_INT (8)));
+  emit_insn (gen_lshrsi3 (operands[0], operands[0], GEN_INT (31)));
+  DONE;
+})
+
+(define_expand "unspec_enable_unaligned"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_UNALIGNED_FEATURE)]
+  ""
+{
+  /* Get $MMU_CTL system register from nds32_intrinsic_register_names[].  */
+  rtx system_reg =  GEN_INT (__NDS32_REG_MMU_CTL__);
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx temp2_reg = gen_reg_rtx (SImode);
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_move_insn (temp2_reg, GEN_INT (0x800 << 12));
+  emit_insn (gen_iorsi3 (temp_reg, temp_reg, temp2_reg));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+(define_expand "unspec_disable_unaligned"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_UNALIGNED_FEATURE)]
+  ""
+{
+  /* Get $MMU_CTL system register from nds32_intrinsic_register_names[].  */
+  rtx system_reg =  GEN_INT (__NDS32_REG_MMU_CTL__);
+  rtx temp_reg = gen_reg_rtx (SImode);
+  rtx temp2_reg = gen_reg_rtx (SImode);
+  emit_insn (gen_unspec_volatile_mfsr (temp_reg, system_reg));
+  emit_move_insn (temp2_reg, GEN_INT (0x800 << 12));
+  emit_insn (gen_one_cmplsi2 (temp2_reg, temp2_reg));
+  emit_insn (gen_andsi3 (temp_reg, temp_reg, temp2_reg));
+  emit_insn (gen_unspec_volatile_mtsr (temp_reg, system_reg));
+  emit_insn (gen_unspec_dsb ());
+  DONE;
+})
+
+;; abs alias kabs
+
+(define_insn "unspec_kabs"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_KABS))]
+  ""
+  "kabs\t%0, %1"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")]
+)
+
+(define_expand "no_hwloop"
+  [(const_int 0)]
+  ""
+{
+  /* A plain nop is emitted unless NDS32_HW_LOOP_P () holds.  */
+  if (!NDS32_HW_LOOP_P ())
+    emit_insn (gen_nop ());
+  else
+    emit_insn (gen_unspec_no_hwloop ());
+  DONE;
+})
+
+(define_insn "unspec_no_hwloop"
+  [(unspec_volatile [(const_int 0)] UNSPEC_VOLATILE_NO_HWLOOP)]
+  ""
+  ""
+  [(set_attr "type" "misc")]
+)
 ;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-isr.c b/gcc/config/nds32/nds32-isr.c
index 79be27e..be82609 100644
--- a/gcc/config/nds32/nds32-isr.c
+++ b/gcc/config/nds32/nds32-isr.c
@@ -24,11 +24,41 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
-#include "diagnostic-core.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
 #include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"

 /* ------------------------------------------------------------------------ */

@@ -39,7 +69,260 @@
    We use an array to record essential information for each vector.  */
 static struct nds32_isr_info nds32_isr_vectors[NDS32_N_ISR_VECTORS];

-/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------- */
+/* FIXME:
+   FOR BACKWARD COMPATIBILITY, we need to support following patterns:
+
+       __attribute__((interrupt("XXX;YYY;id=ZZZ")))
+       __attribute__((exception("XXX;YYY;id=ZZZ")))
+       __attribute__((reset("vectors=XXX;nmi_func=YYY;warm_func=ZZZ")))
+
+   We provide several functions to parse the strings.  */
+
+/* Parse the legacy interrupt("XXX;YYY;id=ZZZ") attribute string
+   ORIGINAL_STR for function FUNC_NAME and record the result, together
+   with security level S_LEVEL, in nds32_isr_vectors[].  */
+static void
+nds32_interrupt_attribute_parse_string (const char *original_str,
+					const char *func_name,
+					unsigned int s_level)
+{
+  char target_str[100];
+  enum nds32_isr_save_reg save_reg;
+  enum nds32_isr_nested_type nested_type;
+  const char *id_str;
+  char *value_str;
+  size_t id_len;
+
+  /* 1. Detect 'save_all_regs'    : NDS32_SAVE_ALL
+	       'save_caller_regs' : NDS32_PARTIAL_SAVE
+     If neither is found, use NDS32_PARTIAL_SAVE by default.  */
+  if (strstr (original_str, "save_all_regs"))
+    save_reg = NDS32_SAVE_ALL;
+  else
+    save_reg = NDS32_PARTIAL_SAVE;
+
+  /* 2. Detect 'nested'       : NDS32_NESTED
+	       'not_nested'   : NDS32_NOT_NESTED
+	       'ready_nested' : NDS32_NESTED_READY
+	       'critical'     : NDS32_CRITICAL
+     If none is found, use NDS32_NOT_NESTED by default.
+     Since 'not_nested' and 'ready_nested' both contain the
+     'nested' string, 'nested' is checked after them.  */
+  if (strstr (original_str, "not_nested"))
+    nested_type = NDS32_NOT_NESTED;
+  else if (strstr (original_str, "ready_nested"))
+    nested_type = NDS32_NESTED_READY;
+  else if (strstr (original_str, "nested"))
+    nested_type = NDS32_NESTED;
+  else if (strstr (original_str, "critical"))
+    nested_type = NDS32_CRITICAL;
+  else
+    nested_type = NDS32_NOT_NESTED;
+
+  /* 3. Traverse each id value and set corresponding information.
+     If user forgets to assign 'id', issue an error message and stop,
+     because there is nothing to traverse.  */
+  id_str = strstr (original_str, "id=");
+  if (id_str == NULL)
+    {
+      error ("require id argument in the string");
+      return;
+    }
+
+  /* The id list runs up to the next ';' (or the end of the string).
+     Copy just that list into a writable buffer for strtok, and make
+     sure neither it nor the function name overflows its buffer.  */
+  id_str += strlen ("id=");
+  id_len = strcspn (id_str, ";");
+  if (id_len >= sizeof (target_str)
+      || strlen (func_name) >= sizeof (nds32_isr_vectors[0].func_name))
+    {
+      error ("id list or function name is too long for interrupt attribute");
+      return;
+    }
+  memcpy (target_str, id_str, id_len);
+  target_str[id_len] = '\0';
+
+  for (value_str = strtok (target_str, ",");
+       value_str != NULL;
+       value_str = strtok (NULL, ","))
+    {
+      /* For interrupt(0..63), the actual vector number is (9..72).
+	 Range-check before use so that nds32_isr_vectors[] is never
+	 indexed with a bogus id.  */
+      long id = strtol (value_str, NULL, 10);
+      int i;
+
+      if (id < 0 || id > 63)
+	{
+	  error ("invalid id value for interrupt attribute");
+	  continue;
+	}
+      i = (int) id + 9;
+
+      /* Setup nds32_isr_vectors[] array.  */
+      nds32_isr_vectors[i].category = NDS32_ISR_INTERRUPT;
+      strcpy (nds32_isr_vectors[i].func_name, func_name);
+      nds32_isr_vectors[i].save_reg = save_reg;
+      nds32_isr_vectors[i].nested_type = nested_type;
+      nds32_isr_vectors[i].security_level = s_level;
+    }
+}
+
+/* Parse the legacy exception("XXX;YYY;id=ZZZ") attribute string
+   ORIGINAL_STR for function FUNC_NAME and record the result, together
+   with security level S_LEVEL, in nds32_isr_vectors[].  */
+static void
+nds32_exception_attribute_parse_string (const char *original_str,
+					const char *func_name,
+					unsigned int s_level)
+{
+  char target_str[100];
+  enum nds32_isr_save_reg save_reg;
+  enum nds32_isr_nested_type nested_type;
+  const char *id_str;
+  char *value_str;
+  size_t id_len;
+
+  /* 1. Detect 'save_all_regs'    : NDS32_SAVE_ALL
+	       'save_caller_regs' : NDS32_PARTIAL_SAVE
+     If neither is found, use NDS32_PARTIAL_SAVE by default.  */
+  if (strstr (original_str, "save_all_regs"))
+    save_reg = NDS32_SAVE_ALL;
+  else
+    save_reg = NDS32_PARTIAL_SAVE;
+
+  /* 2. Detect 'nested'       : NDS32_NESTED
+	       'not_nested'   : NDS32_NOT_NESTED
+	       'ready_nested' : NDS32_NESTED_READY
+	       'critical'     : NDS32_CRITICAL
+     If none is found, use NDS32_NOT_NESTED by default.
+     Since 'not_nested' and 'ready_nested' both contain the
+     'nested' string, 'nested' is checked after them.  */
+  if (strstr (original_str, "not_nested"))
+    nested_type = NDS32_NOT_NESTED;
+  else if (strstr (original_str, "ready_nested"))
+    nested_type = NDS32_NESTED_READY;
+  else if (strstr (original_str, "nested"))
+    nested_type = NDS32_NESTED;
+  else if (strstr (original_str, "critical"))
+    nested_type = NDS32_CRITICAL;
+  else
+    nested_type = NDS32_NOT_NESTED;
+
+  /* 3. Traverse each id value and set corresponding information.
+     If user forgets to assign 'id', issue an error message and stop,
+     because there is nothing to traverse.  */
+  id_str = strstr (original_str, "id=");
+  if (id_str == NULL)
+    {
+      error ("require id argument in the string");
+      return;
+    }
+
+  /* The id list runs up to the next ';' (or the end of the string).
+     Copy just that list into a writable buffer for strtok, and make
+     sure neither it nor the function name overflows its buffer.  */
+  id_str += strlen ("id=");
+  id_len = strcspn (id_str, ";");
+  if (id_len >= sizeof (target_str)
+      || strlen (func_name) >= sizeof (nds32_isr_vectors[0].func_name))
+    {
+      error ("id list or function name is too long for exception attribute");
+      return;
+    }
+  memcpy (target_str, id_str, id_len);
+  target_str[id_len] = '\0';
+
+  for (value_str = strtok (target_str, ",");
+       value_str != NULL;
+       value_str = strtok (NULL, ","))
+    {
+      /* For exception(1..8), the actual vector number is (1..8).
+	 Range-check before use so that nds32_isr_vectors[] is never
+	 indexed with a bogus id; strtol also avoids the overflow
+	 behavior that atoi leaves undefined.  */
+      long i = strtol (value_str, NULL, 10);
+
+      if (i < 1 || i > 8)
+	{
+	  error ("invalid id value for exception attribute");
+	  continue;
+	}
+
+      /* Setup nds32_isr_vectors[] array.  */
+      nds32_isr_vectors[i].category = NDS32_ISR_EXCEPTION;
+      strcpy (nds32_isr_vectors[i].func_name, func_name);
+      nds32_isr_vectors[i].save_reg = save_reg;
+      nds32_isr_vectors[i].nested_type = nested_type;
+      nds32_isr_vectors[i].security_level = s_level;
+    }
+}
+
+/* Parse the legacy reset("vectors=XXX;nmi_func=YYY;warm_func=ZZZ")
+   attribute string ORIGINAL_STR for function FUNC_NAME and record the
+   result in nds32_isr_vectors[0].  */
+static void
+nds32_reset_attribute_parse_string (const char *original_str,
+				    const char *func_name)
+{
+  const char *vectors_str, *nmi_str, *warm_str;
+  size_t len;
+
+  /* Deal with reset attribute.  Its vector number is always 0.  */
+  nds32_isr_vectors[0].category = NDS32_ISR_RESET;
+
+  /* The names below are stored in fixed-size buffers, so a name that
+     does not fit is diagnosed instead of being copied.  */
+  if (strlen (func_name) >= sizeof (nds32_isr_vectors[0].func_name))
+    {
+      error ("function name is too long for reset attribute");
+      return;
+    }
+  strcpy (nds32_isr_vectors[0].func_name, func_name);
+
+  /* 1. Parse 'vectors=XXXX'.
+     The total vectors = interrupt + exception numbers + reset.
+     There are 8 exception and 1 reset in nds32 architecture.
+     If user forgets to assign 'vectors' or leaves its value empty,
+     use default 16 interrupts.  */
+  nds32_isr_vectors[0].total_n_vectors = 16 + 8 + 1;
+  vectors_str = strstr (original_str, "vectors=");
+  if (vectors_str != NULL)
+    {
+      /* The value runs up to the next ';', where atoi stops anyway.  */
+      vectors_str += strlen ("vectors=");
+      if (strcspn (vectors_str, ";") > 0)
+	nds32_isr_vectors[0].total_n_vectors = atoi (vectors_str) + 8 + 1;
+    }
+
+  /* 2. Parse 'nmi_func=YYYY'.  The name runs up to the next ';'.
+     snprintf copies at most LEN characters and terminates the result.  */
+  nmi_str = strstr (original_str, "nmi_func=");
+  if (nmi_str != NULL)
+    {
+      nmi_str += strlen ("nmi_func=");
+      len = strcspn (nmi_str, ";");
+      if (len >= sizeof (nds32_isr_vectors[0].nmi_name))
+	error ("nmi_func name is too long for reset attribute");
+      else if (len > 0)
+	snprintf (nds32_isr_vectors[0].nmi_name, len + 1, "%s", nmi_str);
+    }
+
+  /* 3. Parse 'warm_func=ZZZZ' in the same way.  */
+  warm_str = strstr (original_str, "warm_func=");
+  if (warm_str != NULL)
+    {
+      warm_str += strlen ("warm_func=");
+      len = strcspn (warm_str, ";");
+      if (len >= sizeof (nds32_isr_vectors[0].warm_name))
+	error ("warm_func name is too long for reset attribute");
+      else if (len > 0)
+	snprintf (nds32_isr_vectors[0].warm_name, len + 1, "%s", warm_str);
+    }
+}
+/* ------------------------------------------------------------- */

 /* A helper function to emit section head template.  */
 static void
@@ -75,6 +358,15 @@ nds32_emit_isr_jmptbl_section (int vector_id)
   char section_name[100];
   char symbol_name[100];

+  /* A critical isr does not need jump table section because
+     its behavior is not performed by two-level handler.  */
+  if (nds32_isr_vectors[vector_id].nested_type == NDS32_CRITICAL)
+    {
+      fprintf (asm_out_file, "\t! The vector %02d is a critical isr !\n",
+			     vector_id);
+      return;
+    }
+
   /* Prepare jmptbl section and symbol name.  */
   snprintf (section_name, sizeof (section_name),
 	    ".nds32_jmptbl.%02d", vector_id);
@@ -95,7 +387,6 @@ nds32_emit_isr_vector_section (int vector_id)
   const char *c_str = "CATEGORY";
   const char *sr_str = "SR";
   const char *nt_str = "NT";
-  const char *vs_str = "VS";
   char first_level_handler_name[100];
   char section_name[100];
   char symbol_name[100];
@@ -143,46 +434,63 @@ nds32_emit_isr_vector_section (int vector_id)
     case NDS32_NESTED_READY:
       nt_str = "nr";
       break;
+    case NDS32_CRITICAL:
+      /* The critical isr is not performed by two-level handler.  */
+      nt_str = "";
+      break;
     }

-  /* Currently we have 4-byte or 16-byte size for each vector.
-     If it is 4-byte, the first level handler name has suffix string "_4b".  */
-  vs_str = (nds32_isr_vector_size == 4) ? "_4b" : "";
-
   /* Now we can create first level handler name.  */
-  snprintf (first_level_handler_name, sizeof (first_level_handler_name),
-	    "_nds32_%s_%s_%s%s", c_str, sr_str, nt_str, vs_str);
+  if (nds32_isr_vectors[vector_id].security_level == 0)
+    {
+      /* For security level 0, use normal first level handler name.  */
+      snprintf (first_level_handler_name, sizeof (first_level_handler_name),
+		"_nds32_%s_%s_%s", c_str, sr_str, nt_str);
+    }
+  else
+    {
+      /* For security level 1-3, use corresponding spl_1, spl_2, or spl_3.  */
+      snprintf (first_level_handler_name, sizeof (first_level_handler_name),
+		"_nds32_spl_%d", nds32_isr_vectors[vector_id].security_level);
+    }

   /* Prepare vector section and symbol name.  */
   snprintf (section_name, sizeof (section_name),
 	    ".nds32_vector.%02d", vector_id);
   snprintf (symbol_name, sizeof (symbol_name),
-	    "_nds32_vector_%02d%s", vector_id, vs_str);
+	    "_nds32_vector_%02d", vector_id);


   /* Everything is ready.  We can start emit vector section content.  */
   nds32_emit_section_head_template (section_name, symbol_name,
 				    floor_log2 (nds32_isr_vector_size), false);

-  /* According to the vector size, the instructions in the
-     vector section may be different.  */
-  if (nds32_isr_vector_size == 4)
+  /* First we check if it is a critical isr.
+     If so, jump to user handler directly; otherwise, the instructions
+     in the vector section may be different according to the vector size.  */
+  if (nds32_isr_vectors[vector_id].nested_type == NDS32_CRITICAL)
+    {
+      /* This block is for critical isr.  Jump to user handler directly.  */
+      fprintf (asm_out_file, "\tj\t%s ! jump to user handler directly\n",
+			     nds32_isr_vectors[vector_id].func_name);
+    }
+  else if (nds32_isr_vector_size == 4)
     {
       /* This block is for 4-byte vector size.
-         Hardware $VID support is necessary and only one instruction
-         is needed in vector section.  */
+	 Hardware $VID support is necessary and only one instruction
+	 is needed in vector section.  */
       fprintf (asm_out_file, "\tj\t%s ! jump to first level handler\n",
 			     first_level_handler_name);
     }
   else
     {
       /* This block is for 16-byte vector size.
-         There is NO hardware $VID so that we need several instructions
-         such as pushing GPRs and preparing software vid at vector section.
-         For pushing GPRs, there are four variations for
-         16-byte vector content and we have to handle each combination.
-         For preparing software vid, note that the vid need to
-         be substracted vector_number_offset.  */
+	 There is NO hardware $VID so that we need several instructions
+	 such as pushing GPRs and preparing software vid at vector section.
+	 For pushing GPRs, there are four variations for
+	 16-byte vector content and we have to handle each combination.
+	 For preparing software vid, note that the vid need to
+	 be substracted vector_number_offset.  */
       if (TARGET_REDUCED_REGS)
 	{
 	  if (nds32_isr_vectors[vector_id].save_reg == NDS32_SAVE_ALL)
@@ -235,13 +543,11 @@ nds32_emit_isr_reset_content (void)
 {
   unsigned int i;
   unsigned int total_n_vectors;
-  const char *vs_str;
   char reset_handler_name[100];
   char section_name[100];
   char symbol_name[100];

   total_n_vectors = nds32_isr_vectors[0].total_n_vectors;
-  vs_str = (nds32_isr_vector_size == 4) ? "_4b" : "";

   fprintf (asm_out_file, "\t! RESET HANDLER CONTENT - BEGIN !\n");

@@ -257,7 +563,7 @@ nds32_emit_isr_reset_content (void)
   /* Emit vector references.  */
   fprintf (asm_out_file, "\t ! references to vector section entries\n");
   for (i = 0; i < total_n_vectors; i++)
-    fprintf (asm_out_file, "\t.word\t_nds32_vector_%02d%s\n", i, vs_str);
+    fprintf (asm_out_file, "\t.word\t_nds32_vector_%02d\n", i);

   /* Emit jmptbl_00 section.  */
   snprintf (section_name, sizeof (section_name), ".nds32_jmptbl.00");
@@ -271,9 +577,9 @@ nds32_emit_isr_reset_content (void)

   /* Emit vector_00 section.  */
   snprintf (section_name, sizeof (section_name), ".nds32_vector.00");
-  snprintf (symbol_name, sizeof (symbol_name), "_nds32_vector_00%s", vs_str);
+  snprintf (symbol_name, sizeof (symbol_name), "_nds32_vector_00");
   snprintf (reset_handler_name, sizeof (reset_handler_name),
-	    "_nds32_reset%s", vs_str);
+	    "_nds32_reset");

   fprintf (asm_out_file, "\t! ....................................\n");
   nds32_emit_section_head_template (section_name, symbol_name,
@@ -319,12 +625,12 @@ void
 nds32_check_isr_attrs_conflict (tree func_decl, tree func_attrs)
 {
   int save_all_p, partial_save_p;
-  int nested_p, not_nested_p, nested_ready_p;
+  int nested_p, not_nested_p, nested_ready_p, critical_p;
   int intr_p, excp_p, reset_p;

   /* Initialize variables.  */
   save_all_p = partial_save_p = 0;
-  nested_p = not_nested_p = nested_ready_p = 0;
+  nested_p = not_nested_p = nested_ready_p = critical_p = 0;
   intr_p = excp_p = reset_p = 0;

   /* We must check at MOST one attribute to set save-reg.  */
@@ -343,8 +649,10 @@ nds32_check_isr_attrs_conflict (tree func_decl, tree func_attrs)
     not_nested_p = 1;
   if (lookup_attribute ("nested_ready", func_attrs))
     nested_ready_p = 1;
+  if (lookup_attribute ("critical", func_attrs))
+    critical_p = 1;

-  if ((nested_p + not_nested_p + nested_ready_p) > 1)
+  if ((nested_p + not_nested_p + nested_ready_p + critical_p) > 1)
     error ("multiple nested types attributes to function %qD", func_decl);

   /* We must check at MOST one attribute to
@@ -358,6 +666,17 @@ nds32_check_isr_attrs_conflict (tree func_decl, tree func_attrs)

   if ((intr_p + excp_p + reset_p) > 1)
     error ("multiple interrupt attributes to function %qD", func_decl);
+
+  /* Do not allow isr attributes under linux toolchain.  */
+  if (TARGET_LINUX_ABI && intr_p)
+      error ("cannot use interrupt attributes to function %qD "
+	     "under linux toolchain", func_decl);
+  if (TARGET_LINUX_ABI && excp_p)
+      error ("cannot use exception attributes to function %qD "
+	     "under linux toolchain", func_decl);
+  if (TARGET_LINUX_ABI && reset_p)
+      error ("cannot use reset attributes to function %qD "
+	     "under linux toolchain", func_decl);
 }

 /* Function to construct isr vectors information array.
@@ -369,15 +688,21 @@ nds32_construct_isr_vectors_information (tree func_attrs,
 					 const char *func_name)
 {
   tree save_all, partial_save;
-  tree nested, not_nested, nested_ready;
+  tree nested, not_nested, nested_ready, critical;
   tree intr, excp, reset;

+  tree secure;
+  tree security_level_list;
+  tree security_level;
+  unsigned int s_level;
+
   save_all     = lookup_attribute ("save_all", func_attrs);
   partial_save = lookup_attribute ("partial_save", func_attrs);

   nested       = lookup_attribute ("nested", func_attrs);
   not_nested   = lookup_attribute ("not_nested", func_attrs);
   nested_ready = lookup_attribute ("nested_ready", func_attrs);
+  critical     = lookup_attribute ("critical", func_attrs);

   intr  = lookup_attribute ("interrupt", func_attrs);
   excp  = lookup_attribute ("exception", func_attrs);
@@ -387,6 +712,63 @@ nds32_construct_isr_vectors_information (tree func_attrs,
   if (!intr && !excp && !reset)
     return;

+  /* At first, we need to retrieve security level.  */
+  secure = lookup_attribute ("secure", func_attrs);
+  if (secure != NULL)
+    {
+      security_level_list = TREE_VALUE (secure);
+      security_level = TREE_VALUE (security_level_list);
+      s_level = TREE_INT_CST_LOW (security_level);
+    }
+  else
+    {
+      /* If there is no secure attribute, the security level is set by
+	 nds32_isr_secure_level, which is controlled by -misr-secure=X option.
+	 By default nds32_isr_secure_level should be 0.  */
+      s_level = nds32_isr_secure_level;
+    }
+
+  /* ------------------------------------------------------------- */
+  /* FIXME:
+     FOR BACKWARD COMPATIBILITY, we need to support following patterns:
+
+	 __attribute__((interrupt("XXX;YYY;id=ZZZ")))
+	 __attribute__((exception("XXX;YYY;id=ZZZ")))
+	 __attribute__((reset("vectors=XXX;nmi_func=YYY;warm_func=ZZZ")))
+
+     If interrupt/exception/reset appears and its argument is a
+     STRING_CST, we will parse string with some auxiliary functions
+     which set necessary isr information in the nds32_isr_vectors[] array.
+     After that, we can return immediately to avoid new-syntax isr
+     information construction.  */
+  if (intr != NULL_TREE
+      && TREE_CODE (TREE_VALUE (TREE_VALUE (intr))) == STRING_CST)
+    {
+      tree string_arg = TREE_VALUE (TREE_VALUE (intr));
+      nds32_interrupt_attribute_parse_string (TREE_STRING_POINTER (string_arg),
+					      func_name,
+					      s_level);
+      return;
+    }
+  if (excp != NULL_TREE
+      && TREE_CODE (TREE_VALUE (TREE_VALUE (excp))) == STRING_CST)
+    {
+      tree string_arg = TREE_VALUE (TREE_VALUE (excp));
+      nds32_exception_attribute_parse_string (TREE_STRING_POINTER (string_arg),
+					      func_name,
+					      s_level);
+      return;
+    }
+  if (reset != NULL_TREE
+      && TREE_CODE (TREE_VALUE (TREE_VALUE (reset))) == STRING_CST)
+    {
+      tree string_arg = TREE_VALUE (TREE_VALUE (reset));
+      nds32_reset_attribute_parse_string (TREE_STRING_POINTER (string_arg),
+					  func_name);
+      return;
+    }
+  /* ------------------------------------------------------------- */
+
   /* If we are here, either we have interrupt/exception,
      or reset attribute.  */
   if (intr || excp)
@@ -413,6 +795,9 @@ nds32_construct_isr_vectors_information (tree func_attrs,
 	  /* Add vector_number_offset to get actual vector number.  */
 	  vector_id = TREE_INT_CST_LOW (id) + vector_number_offset;

+	  /* Set security level.  */
+	  nds32_isr_vectors[vector_id].security_level = s_level;
+
 	  /* Enable corresponding vector and set function name.  */
 	  nds32_isr_vectors[vector_id].category = (intr)
 						  ? (NDS32_ISR_INTERRUPT)
@@ -432,6 +817,8 @@ nds32_construct_isr_vectors_information (tree func_attrs,
 	    nds32_isr_vectors[vector_id].nested_type = NDS32_NOT_NESTED;
 	  else if (nested_ready)
 	    nds32_isr_vectors[vector_id].nested_type = NDS32_NESTED_READY;
+	  else if (critical)
+	    nds32_isr_vectors[vector_id].nested_type = NDS32_CRITICAL;

 	  /* Advance to next id.  */
 	  id_list = TREE_CHAIN (id_list);
@@ -447,12 +834,12 @@ nds32_construct_isr_vectors_information (tree func_attrs,
       nds32_isr_vectors[0].category = NDS32_ISR_RESET;

       /* Prepare id_list and identify id value so that
-         we can set total number of vectors.  */
+	 we can set total number of vectors.  */
       id_list = TREE_VALUE (reset);
       id = TREE_VALUE (id_list);

       /* The total vectors = interrupt + exception numbers + reset.
-         There are 8 exception and 1 reset in nds32 architecture.  */
+	 There are 8 exception and 1 reset in nds32 architecture.  */
       nds32_isr_vectors[0].total_n_vectors = TREE_INT_CST_LOW (id) + 8 + 1;
       strcpy (nds32_isr_vectors[0].func_name, func_name);

@@ -488,7 +875,6 @@ nds32_construct_isr_vectors_information (tree func_attrs,
     }
 }

-/* A helper function to handle isr stuff at the beginning of asm file.  */
 void
 nds32_asm_file_start_for_isr (void)
 {
@@ -501,15 +887,14 @@ nds32_asm_file_start_for_isr (void)
       strcpy (nds32_isr_vectors[i].func_name, "");
       nds32_isr_vectors[i].save_reg = NDS32_PARTIAL_SAVE;
       nds32_isr_vectors[i].nested_type = NDS32_NOT_NESTED;
+      nds32_isr_vectors[i].security_level = 0;
       nds32_isr_vectors[i].total_n_vectors = 0;
       strcpy (nds32_isr_vectors[i].nmi_name, "");
       strcpy (nds32_isr_vectors[i].warm_name, "");
     }
 }

-/* A helper function to handle isr stuff at the end of asm file.  */
-void
-nds32_asm_file_end_for_isr (void)
+void nds32_asm_file_end_for_isr (void)
 {
   int i;

@@ -543,6 +928,8 @@ nds32_asm_file_end_for_isr (void)
 	  /* Found one vector which is interupt or exception.
 	     Output its jmptbl and vector section content.  */
 	  fprintf (asm_out_file, "\t! interrupt/exception vector %02d\n", i);
+	  fprintf (asm_out_file, "\t! security level: %d\n",
+		   nds32_isr_vectors[i].security_level);
 	  fprintf (asm_out_file, "\t! ------------------------------------\n");
 	  nds32_emit_isr_jmptbl_section (i);
 	  fprintf (asm_out_file, "\t! ....................................\n");
@@ -576,4 +963,65 @@ nds32_isr_function_p (tree func)
 	  || (t_reset != NULL_TREE));
 }

-/* ------------------------------------------------------------------------ */
+/* Return true if FUNC is an isr function of the critical type.
+
+   FUNC must be a FUNCTION_DECL; anything else aborts.  The function is
+   critical when it carries an interrupt or exception attribute and either
+   the separate 'critical' attribute is present or, for the backward
+   compatible string syntax, the text 'critical' occurs in the string
+   argument of the interrupt/exception attribute.  */
+bool
+nds32_isr_function_critical_p (tree func)
+{
+  tree t_intr;
+  tree t_excp;
+  tree t_critical;
+  tree t_check;
+  tree t_args;
+  tree attrs;
+
+  if (TREE_CODE (func) != FUNCTION_DECL)
+    abort ();
+
+  attrs = DECL_ATTRIBUTES (func);
+
+  t_intr = lookup_attribute ("interrupt", attrs);
+  t_excp = lookup_attribute ("exception", attrs);
+
+  t_critical = lookup_attribute ("critical", attrs);
+
+  /* If neither the interrupt nor the exception attribute appears,
+     FUNC is not an isr function at all, so return false immediately.  */
+  if ((t_intr == NULL_TREE) && (t_excp == NULL_TREE))
+    return false;
+
+  /* Here either the interrupt or the exception attribute does exist,
+     so an explicit critical attribute settles the question.  */
+  if (t_critical != NULL_TREE)
+    return true;
+
+  /* ------------------------------------------------------------- */
+  /* FIXME:
+     FOR BACKWARD COMPATIBILITY, we need to handle string type.
+     If the string 'critical' appears in the interrupt/exception
+     string argument, we can return true.
+
+     The interrupt attribute takes precedence when both are present.
+     Search the attribute string in place: it is supplied by the user
+     and may be arbitrarily long, so it must not be copied into a
+     fixed-size buffer.  */
+  t_check = t_intr ? t_intr : t_excp;
+  t_args = TREE_VALUE (t_check);
+
+  if (t_args != NULL_TREE
+      && TREE_CODE (TREE_VALUE (t_args)) == STRING_CST
+      && strstr (TREE_STRING_POINTER (TREE_VALUE (t_args)),
+		 "critical") != NULL)
+    return true;
+  /* ------------------------------------------------------------- */
+
+  /* Other cases, this isr function is not critical type.  */
+  return false;
+}
+
+/* ------------------------------------------------------------- */
diff --git a/gcc/config/nds32/nds32-linux.opt b/gcc/config/nds32/nds32-linux.opt
new file mode 100644
index 0000000..75ccd76
--- /dev/null
+++ b/gcc/config/nds32/nds32-linux.opt
@@ -0,0 +1,16 @@
+mcmodel=
+Target RejectNegative Joined Enum(nds32_cmodel_type) Var(nds32_cmodel_option) Init(CMODEL_LARGE)
+Specify the address generation strategy for code model.
+
+Enum
+Name(nds32_cmodel_type) Type(enum nds32_cmodel_type)
+Known cmodel types (for use with the -mcmodel= option):
+
+EnumValue
+Enum(nds32_cmodel_type) String(small) Value(CMODEL_SMALL)
+
+EnumValue
+Enum(nds32_cmodel_type) String(medium) Value(CMODEL_MEDIUM)
+
+EnumValue
+Enum(nds32_cmodel_type) String(large) Value(CMODEL_LARGE)
diff --git a/gcc/config/nds32/nds32-lmwsmw.c b/gcc/config/nds32/nds32-lmwsmw.c
new file mode 100644
index 0000000..e3b66bf
--- /dev/null
+++ b/gcc/config/nds32/nds32-lmwsmw.c
@@ -0,0 +1,1998 @@
+
+/* lmwsmw pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ------------------------------------------------------------------------ */
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "target-globals.h"
+#include "ira.h"
+#include "ira-int.h"
+#include "regrename.h"
+#include "nds32-load-store-opt.h"
+#include "nds32-reg-utils.h"
+#include <set>
+#include <vector>
+#include <algorithm>
+
+#define NDS32_GPR_NUM 32
+
+/* qsort callback: order load/store records A and B by ascending 'order'.
+   Returns 0 for equal keys so the ordering relation stays consistent.  */
+static int
+compare_order (const void *a, const void *b)
+{
+  const load_store_info_t *lhs = (const load_store_info_t *) a;
+  const load_store_info_t *rhs = (const load_store_info_t *) b;
+
+  return (lhs->order > rhs->order) - (lhs->order < rhs->order);
+}
+
+/* qsort callback: order load/store records A and B by ascending 'offset'.
+   Returns 0 for equal keys so the ordering relation stays consistent.  */
+static int
+compare_offset (const void *a, const void *b)
+{
+  const load_store_info_t *lhs = (const load_store_info_t *) a;
+  const load_store_info_t *rhs = (const load_store_info_t *) b;
+
+  return (lhs->offset > rhs->offset) - (lhs->offset < rhs->offset);
+}
+
+static bool /* Sort predicate: groups with a larger 'amount' come first.  */
+compare_amount (available_reg_info_t a, available_reg_info_t b)
+{
+  return b.amount < a.amount;
+}
+
+/* Return true if INSN is a plain SImode load or store of a general
+   purpose register whose address is [reg], [reg + const], a post-increment
+   or a post-decrement, and whose memory is not volatile.  When
+   LOAD_STORE_INFO is not NULL the pieces of the access are stored in it.  */
+static bool
+nds32_load_store_reg_plus_offset (rtx_insn *insn,
+				  load_store_info_t *load_store_info)
+{
+  rtx body = PATTERN (insn);
+  rtx mem = NULL_RTX;
+  rtx reg = NULL_RTX;
+  rtx base_reg = NULL_RTX;
+  rtx addr;
+  HOST_WIDE_INT offset = 0;
+  bool load_p = false;
+  enum nds32_memory_post_type post_type = NDS32_NONE;
+
+  if (GET_CODE (body) != SET)
+    return false;
+
+  /* A memory source means the insn loads into the destination.  */
+  if (MEM_P (SET_SRC (body)))
+    {
+      load_p = true;
+      mem = SET_SRC (body);
+      reg = SET_DEST (body);
+    }
+
+  /* A memory destination means the insn stores the source.  This wins when
+     both sides are memory; that case fails the REG_P test below.  */
+  if (MEM_P (SET_DEST (body)))
+    {
+      load_p = false;
+      mem = SET_DEST (body);
+      reg = SET_SRC (body);
+    }
+
+  if (mem == NULL_RTX || reg == NULL_RTX || !REG_P (reg))
+    return false;
+
+  /* The FPU ISA has no load-store-multiple instruction.  */
+  if (!NDS32_IS_GPR_REGNUM (REGNO (reg)))
+    return false;
+
+  /* Volatile accesses and non-SImode registers are left alone.  */
+  if (MEM_VOLATILE_P (mem))
+    return false;
+  if (GET_MODE (reg) != SImode)
+    return false;
+
+  /* We only care about [reg], [reg+const] and the post-modify forms.  */
+  addr = XEXP (mem, 0);
+  switch (GET_CODE (addr))
+    {
+    case REG:
+      base_reg = addr;
+      break;
+    case PLUS:
+      if (!CONST_INT_P (XEXP (addr, 1)))
+	return false;
+      base_reg = XEXP (addr, 0);
+      offset = INTVAL (XEXP (addr, 1));
+      if (!REG_P (base_reg))
+	return false;
+      break;
+    case POST_INC:
+      base_reg = XEXP (addr, 0);
+      post_type = NDS32_POST_INC;
+      break;
+    case POST_DEC:
+      base_reg = XEXP (addr, 0);
+      post_type = NDS32_POST_DEC;
+      break;
+    default:
+      return false;
+    }
+
+  /* A hard (non-pseudo) base register must not lie beyond the last GPR.  */
+  if ((REGNO (base_reg) > NDS32_LAST_GPR_REGNUM)
+      && (REGNO (base_reg) < FIRST_PSEUDO_REGISTER))
+    return false;
+
+  if (load_store_info != NULL)
+    {
+      load_store_info->insn      = insn;
+      load_store_info->mem       = mem;
+      load_store_info->reg       = reg;
+      load_store_info->base_reg  = base_reg;
+      load_store_info->offset    = offset;
+      load_store_info->load_p    = load_p;
+      load_store_info->post_type = post_type;
+    }
+
+  return true;
+}
+
+/* Return true if insn pattern X may touch memory that aliases MEMREF.
+   A PARALLEL is checked element by element.  Any other pattern that is
+   not a SET is conservatively reported as aliasing, while a SET with no
+   MEM operand never aliases.  */
+static bool
+nds32_insn_alias_p (rtx memref, rtx x)
+{
+  rtx mem;
+
+  if (GET_CODE (x) == PARALLEL)
+    {
+      int op, elt;
+
+      for (op = GET_RTX_LENGTH (GET_CODE (x)) - 1; op >= 0; op--)
+	for (elt = XVECLEN (x, op) - 1; elt >= 0; elt--)
+	  {
+	    if (nds32_insn_alias_p (memref, XVECEXP (x, op, elt)))
+	      return true;
+	  }
+
+      return false;
+    }
+
+  /* Unknown pattern shape: be conservative.  */
+  if (GET_CODE (x) != SET)
+    return true;
+
+  /* Look at the source MEM first, otherwise at the destination MEM.  */
+  mem = MEM_P (SET_SRC (x)) ? SET_SRC (x) : SET_DEST (x);
+  if (!MEM_P (mem))
+    return false;
+
+  return may_alias_p (memref, mem);
+}
+
+/* Emit, before PLACE, one PARALLEL that performs every access recorded in
+   MULTIPLE_INSN as consecutive SImode words starting at [BASE_REG].
+   The direction (load or store) is taken from the first record.  When
+   UPDATE_P is true the PARALLEL begins with an add of
+   4 * <number of records> to BASE_REG.  */
+static void
+nds32_emit_multiple_insn (load_store_infos_t *multiple_insn,
+			  rtx base_reg, rtx place, bool update_p)
+{
+  const unsigned int n_access = multiple_insn->length ();
+  const bool load_p = (*multiple_insn)[0].load_p;
+  unsigned int n_elts = n_access;
+  unsigned int idx;
+  /* Index of the next free element of the PARALLEL.  */
+  int slot = 0;
+  int byte_off = 0;
+  rtx par;
+
+  /* In addition to the accessed registers,
+     one more slot is needed for the (set base base+x) rtx.  */
+  if (update_p)
+    n_elts++;
+
+  par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (n_elts));
+
+  /* The base update, if requested, occupies the first slot.  */
+  if (update_p)
+    {
+      rtx step = GEN_INT (n_access * 4);
+
+      XVECEXP (par, 0, slot) = gen_addsi3 (base_reg, base_reg, step);
+      slot++;
+    }
+
+  /* Create one (set reg mem) or (set mem reg) per record, each access
+     at an offset 4 larger than the previous one.  */
+  for (idx = 0; idx < n_access; ++idx)
+    {
+      rtx reg = (*multiple_insn)[idx].reg;
+      rtx mem = gen_frame_mem (SImode,
+			       plus_constant (Pmode, base_reg, byte_off));
+
+      /* Carry over the memory attributes of the original access.  */
+      MEM_COPY_ATTRIBUTES (mem, (*multiple_insn)[idx].mem);
+
+      XVECEXP (par, 0, slot) = load_p ? gen_rtx_SET (reg, mem)
+				      : gen_rtx_SET (mem, reg);
+      slot++;
+      byte_off += 4;
+    }
+
+  /* Place the combined insn and report it in the dump file.  */
+  emit_insn_before (par, place);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "lmw/smw instruction:\n");
+      print_rtl_single (dump_file, par);
+    }
+}
+
+/* Emit BASE_REG = INSN.base_reg + INSN.offset before PLACE; the offset
+   is negated first when ADD_P is false.  */
+static void
+nds32_emit_add_insn (load_store_info_t insn, rtx base_reg,
+		     rtx place, bool add_p)
+{
+  HOST_WIDE_INT delta = insn.offset;
+
+  delta = add_p ? delta : -delta;
+  emit_insn_before (gen_addsi3 (base_reg, insn.base_reg, GEN_INT (delta)),
+		    place);
+}
+
+/* Move every record of SRC whose group is ID to the end of DST, keeping
+   the relative order of the moved and of the remaining records.  */
+static void
+nds32_fetch_group_insn (load_store_infos_t *src,
+			load_store_infos_t *dst, int id)
+{
+  unsigned int i = 0;
+
+  while (i < src->length ())
+    if (id == (*src)[i].group)
+      {
+	/* ordered_remove shifts the tail into slot I, so examine I again;
+	   records before I were already rejected, no need to restart.  */
+	dst->safe_push ((*src)[i]);
+	src->ordered_remove (i);
+      }
+    else
+      i++;
+}
+
+/* Dump helper for nds32_lmwsmw_insert_place: report that MODIFIER uses or
+   defines a register of the group while CAND was tried as the place.  */
+static void
+nds32_dump_insert_place_fail (load_store_info_t *cand, rtx_insn *modifier)
+{
+  if (!dump_file)
+    return;
+
+  fprintf (dump_file, "Fail:register has modify\n");
+  fprintf (dump_file, "insn uid:%d, reg: r%d,\n",
+	   INSN_UID (cand->insn), REGNO (cand->reg));
+  fprintf (dump_file, "Modify instruction:\n");
+  print_rtl_single (dump_file, modifier);
+}
+
+/* Find the member of INSN_SET before which the combined lmw/smw can be
+   placed.  The members are examined in ascending 'order'.  A member
+   qualifies when, for every earlier member, that member's register is
+   neither used nor defined after it up to the candidate, and for every
+   later member, its register is neither used nor defined from the
+   candidate up to that member.  Return the first qualifying insn, or
+   NULL_RTX if there is none.  INSN_SET itself is not reordered.
+     example:
+	 lwi $r0, [$r2 + 4]
+	 lwi $r1, [$r2 + 8]
+
+     Check $r0 and $r1 are not used and defined.  */
+static rtx
+nds32_lmwsmw_insert_place (load_store_infos_t *insn_set)
+{
+  unsigned int k, pos;
+  rtx_insn *scan;
+  auto_vec<load_store_info_t, 64> ordered;
+
+  /* Work on a copy so that sorting leaves INSN_SET alone.  */
+  for (k = 0; k < insn_set->length (); k++)
+    ordered.safe_push ((*insn_set)[k]);
+
+  ordered.qsort (compare_order);
+
+  for (pos = 0; pos < ordered.length (); ++pos)
+    {
+      bool combine_p = true;
+
+      /* Members before the candidate: scan from just after each of them
+	 up to the candidate.  */
+      for (k = 0; k < pos; k++)
+	for (scan = NEXT_INSN (ordered[k].insn);
+	     scan != ordered[pos].insn;
+	     scan = NEXT_INSN (scan))
+	  {
+	    if (!NONDEBUG_INSN_P (scan))
+	      continue;
+	    if (df_reg_used (scan, ordered[k].reg)
+		|| df_reg_defined (scan, ordered[k].reg))
+	      {
+		nds32_dump_insert_place_fail (&ordered[pos], scan);
+		combine_p = false;
+		break;
+	      }
+	  }
+
+      /* Members after the candidate: scan from the candidate itself up
+	 to each of them.  */
+      for (k = pos + 1; k < ordered.length (); k++)
+	for (scan = ordered[pos].insn;
+	     scan != ordered[k].insn;
+	     scan = NEXT_INSN (scan))
+	  {
+	    if (!NONDEBUG_INSN_P (scan))
+	      continue;
+	    if (df_reg_used (scan, ordered[k].reg)
+		|| df_reg_defined (scan, ordered[k].reg))
+	      {
+		nds32_dump_insert_place_fail (&ordered[pos], scan);
+		combine_p = false;
+		break;
+	      }
+	  }
+
+      if (combine_p)
+	return ordered[pos].insn;
+    }
+
+  return NULL_RTX;
+}
+
+/* Return true if the accesses in INSN_SET can be combined as far as the
+   base register and memory are concerned.  Every non-debug insn lying
+   between two neighbouring members (in ascending 'order') is inspected:
+     - it must not alias the memory of the first member;
+     - for a group of loads it must not define the base register;
+     - for a group of stores it must not use the base register.
+   Return false as soon as one check fails.
+   example:
+     lwi $r0, [$r3 + 4]
+	  ...		  <- check here
+     lwi $r1, [$r3 + 8]
+	  ...		  <- check here
+     lwi $r2, [$r3 + 12]
+
+   For the loads above, check $r3 is not defined between the first insn
+   and the last insn.
+   INSN_SET itself is not reordered; a sorted copy is used.  */
+static bool
+nds32_base_reg_safe_p (load_store_infos_t *insn_set)
+{
+  unsigned int i;
+  rtx_insn *insn;
+  bool conflict_p;
+  auto_vec<load_store_info_t, 64> temp_set;
+
+  /* The checks need the members sorted, so work on a copy and leave
+     the caller's element order alone.  */
+  for (i = 0; i < insn_set->length (); i++)
+    temp_set.safe_push ((*insn_set)[i]);
+
+  /* Fewer than two members leave nothing in between to check; this also
+     keeps 'length () - 1' below from wrapping around on an empty set.  */
+  if (temp_set.length () < 2)
+    return true;
+
+  /* Scan instruction from top to bottom,
+     so need to sort by order.  */
+  temp_set.qsort (compare_order);
+
+  for (i = 0; i < temp_set.length () - 1; ++i)
+    {
+      for (insn = NEXT_INSN (temp_set[i].insn);
+	   insn != temp_set[i + 1].insn;
+	   insn = NEXT_INSN (insn))
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+
+	  if (nds32_insn_alias_p (temp_set[0].mem, PATTERN (insn)))
+	    {
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "Memory alias:\n");
+		  print_rtl_single (dump_file, insn);
+		}
+	      return false;
+	    }
+
+	  /* Loads only care about redefinitions of the base register,
+	     stores about any use of it.  */
+	  if (temp_set[0].load_p)
+	    conflict_p = df_reg_defined (insn, temp_set[0].base_reg);
+	  else
+	    conflict_p = df_reg_used (insn, temp_set[0].base_reg);
+
+	  if (conflict_p)
+	    {
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "Fail: base register has modify\n");
+		  /* Report the base register itself, not the data
+		     register of the access.  */
+		  fprintf (dump_file, "insn uid:%d, base reg: r%d,\n",
+			   INSN_UID (temp_set[i].insn),
+			   REGNO (temp_set[0].base_reg));
+		  fprintf (dump_file, "Modify instruction:\n");
+		  print_rtl_single (dump_file, insn);
+		}
+	      return false;
+	    }
+	}
+    }
+
+  return true;
+}
+
+/* Return true if replacing the accesses in INSN by one lmw/smw is
+   estimated to shrink the code.  The old size counts 2 for every access
+   matching one of the short forms named below and 4 otherwise.  The new
+   size is 4, plus, when the first offset is not zero, 2 or 4 for the add
+   that forms the base and another 4 (subri) unless NEW_BASE_P.  */
+static bool
+nds32_gain_size_p (load_store_infos_t *insn, bool new_base_p)
+{
+  unsigned int idx;
+  unsigned int old_cost = 0;
+  unsigned int new_cost = 4;
+  rtx base_reg = (*insn)[0].base_reg;
+  HOST_WIDE_INT offset;
+
+  for (idx = 0; idx < insn->length (); ++idx)
+    {
+      rtx reg = (*insn)[idx].reg;
+      bool short_p;
+
+      offset = (*insn)[idx].offset;
+      if (in_reg_class_p (reg, LOW_REGS))
+	{
+	  bool aligned_p = (offset >= 0 && (offset % 4 == 0));
+	  /* lwi37.sp/swi37.sp/lwi37/swi37 */
+	  if ((REGNO (base_reg) == SP_REGNUM
+	       || REGNO (base_reg) == FP_REGNUM)
+	      && aligned_p && offset < 512)
+	    short_p = true;
+	  /* lwi333/swi333 */
+	  else
+	    short_p = (in_reg_class_p (base_reg, LOW_REGS)
+		       && aligned_p && offset < 32);
+	}
+      else
+	/* lwi450/swi450 */
+	short_p = in_reg_class_p (reg, MIDDLE_REGS) && offset == 0;
+
+      old_cost += short_p ? 2 : 4;
+    }
+
+  /* Size of the replacement: a start offset that is not zero needs an add.  */
+  offset = (*insn)[0].offset;
+  if (offset != 0)
+    {
+      /* addi333 (low base) or addi45 (middle base) when Iu05 is satisfied.  */
+      if ((in_reg_class_p (base_reg, LOW_REGS)
+	   || in_reg_class_p (base_reg, MIDDLE_REGS))
+	  && satisfies_constraint_Iu05 (GEN_INT (offset)))
+	new_cost += 2;
+      else
+	new_cost += 4;
+
+      /* subri */
+      if (!new_base_p)
+	new_cost += 4;
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "Code size compare: old code size is %d,"
+			" new code size is %d\n", old_cost, new_cost);
+
+  return new_cost < old_cost;
+}
+
+/* Return true if replacing the accesses in INSN by one lmw/smw is
+   estimated to be faster.  The old cost is one per access.  On the
+   Graywolf pipeline the new cost is half the accesses (rounded up), plus
+   one for the add needed by a non-zero first offset and one more unless
+   NEW_BASE_P.  On other pipelines the new cost is 0, but a non-zero
+   first offset is refused outright.  */
+static bool
+nds32_gain_speed_p (load_store_infos_t *insn, bool new_base_p)
+{
+  const unsigned int n_access = insn->length ();
+  const unsigned int old_cost = n_access;
+  const bool need_add_p = ((*insn)[0].offset != 0);
+  unsigned int new_cost = 0;
+
+  if (TARGET_PIPELINE_GRAYWOLF)
+    {
+      new_cost = n_access / 2 + n_access % 2;
+
+      /* Need addi instruction, plus a subri unless NEW_BASE_P.  */
+      if (need_add_p)
+	new_cost += new_base_p ? 1 : 2;
+    }
+  else if (need_add_p)
+    return false;
+
+  return new_cost < old_cost;
+}
+
+/* Check whether the accesses in INSN_SET can be combined into one
+   multiple-word instruction.  The group must have 2 to 25 members, pass
+   the size or speed estimate selected by the lmw/smw options, not
+   transfer its own base register when the start offset is not zero, and
+   have a start offset satisfying Is15.
+   NEW_BASE_P is passed on to the estimate.  INSN_SET is not reordered;
+   a copy sorted by offset is examined.  */
+static bool
+nds32_combine_multiple_p (load_store_infos_t *insn_set, bool new_base_p)
+{
+  unsigned int idx;
+  auto_vec<load_store_info_t, 64> sorted;
+
+  /* Sorting changes the element order, so sort a private copy.  */
+  for (idx = 0; idx < insn_set->length (); idx++)
+    sorted.safe_push ((*insn_set)[idx]);
+
+  /* The start offset is looked up at index 0, so sort by offset.  */
+  sorted.qsort (compare_offset);
+
+  /* The lmw/smw pattern needs at least two instructions and allows at
+     most 25 of them.  */
+  if (sorted.length () < 2 || sorted.length () > 25)
+    return false;
+
+  if (TARGET_LMWSMW_OPT_SIZE
+      || (TARGET_LMWSMW_OPT_AUTO && optimize_size))
+    {
+      /* Combine only when the multiple instruction is smaller than
+	 the original instructions.  */
+      if (!nds32_gain_size_p (&sorted, new_base_p))
+	return false;
+    }
+  else if (TARGET_LMWSMW_OPT_SPEED
+	   || (TARGET_LMWSMW_OPT_AUTO && !optimize_size))
+    {
+      /* A start offset that is not zero costs an extra instruction,
+	 which may not be worth it at -O2 or -O3.  */
+      if (!nds32_gain_speed_p (&sorted, new_base_p))
+	return false;
+    }
+
+  /* When the start offset is not zero, none of the
+     transferred registers may be the base register.  */
+  if (sorted[0].offset != 0)
+    {
+      const unsigned int base_regno = REGNO (sorted[0].base_reg);
+
+      for (idx = 0; idx < sorted.length (); ++idx)
+	if (REGNO (sorted[idx].reg) == base_regno)
+	  return false;
+    }
+
+  /* Don't combine when the start offset does not satisfy Is15,
+     because that would need an extra register.  */
+  return satisfies_constraint_Is15 (GEN_INT (sorted[0].offset));
+}
+
+/* Return true if the group INSN_SET may be emitted in .bim form ahead of
+   the group REF_SET.  This requires that REF_SET's first offset is 4
+   past INSN_SET's last offset, that INSN_SET's group number is not
+   greater than REF_SET's, and that no non-debug insn between the two
+   groups aliases INSN_SET's memory or uses/defines its base register.
+   Once the scan has run, both vectors are left sorted by offset.  */
+static bool
+nds32_use_bim_p (load_store_infos_t *insn_set,
+		 load_store_infos_t *ref_set)
+{
+  rtx_insn *scan, *stop;
+  rtx first_mem, base_reg;
+  bool combine_p = true;
+
+  /* Generate .bim form, need offset is continuous.  */
+  if (insn_set->last ().offset != ((*ref_set)[0].offset - 4))
+    return false;
+
+  /* INSN_SET must not belong to a later group than REF_SET.  */
+  if ((*insn_set)[0].group > (*ref_set)[0].group)
+    return false;
+
+  /* Scan instruction from top to bottom, so sort by order.  */
+  insn_set->qsort (compare_order);
+  ref_set->qsort (compare_order);
+
+  first_mem = (*insn_set)[0].mem;
+  base_reg = (*insn_set)[0].base_reg;
+  stop = (*ref_set)[0].insn;
+
+  /* The base register must be neither used nor defined between this
+     multiple-insn and the next one, e.g. $r2 in
+       lmw.bim $r0, [$r2], $r1
+		...		       <- check here
+       lmw.bi  $r3, [$r2], $r4  */
+  for (scan = NEXT_INSN (insn_set->last ().insn); scan != stop;
+       scan = NEXT_INSN (scan))
+    {
+      if (!NONDEBUG_INSN_P (scan))
+	continue;
+
+      if (nds32_insn_alias_p (first_mem, PATTERN (scan)))
+	{
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Have memory instruction:\n");
+	      print_rtl_single (dump_file, scan);
+	    }
+	  combine_p = false;
+	  break;
+	}
+
+      if (df_reg_used (scan, base_reg)
+	  || df_reg_defined (scan, base_reg))
+	{
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Use .bi form: Base reg is"
+		       " used or defined between multiple-insn"
+		       " and next multiple-insn\n");
+	      fprintf (dump_file, "Base register: r%d,\n",
+		       REGNO (base_reg));
+	      fprintf (dump_file, "use or def instruction:\n");
+	      print_rtl_single (dump_file, scan);
+	    }
+	  combine_p = false;
+	  break;
+	}
+    }
+
+  /* Restore element order.  */
+  insn_set->qsort (compare_offset);
+  ref_set->qsort (compare_offset);
+
+  return combine_p;
+}
+
+/* Add HEAD's hard conflicts and its conflicting chains' regs to *PSET.  */
+static void
+nds32_merge_overlapping_regs (HARD_REG_SET *pset, struct du_head *head)
+{
+  bitmap_iterator iter;
+  unsigned chain_id, k;
+  IOR_HARD_REG_SET (*pset, head->hard_conflicts);
+  EXECUTE_IF_SET_IN_BITMAP (&head->conflicts, 0, chain_id, iter)
+    {
+      du_head_p other = regrename_chain_from_id (chain_id);
+      gcc_assert (other != head);
+      for (k = other->nregs; k > 0; k--)
+	SET_HARD_REG_BIT (*pset, other->regno + k - 1);
+    }
+}
+
+/* Return true if NEW_REG can be the candidate register to rename for
+   REG in THIS_HEAD chain.  THIS_UNAVAILABLE is a set of hard registers
+   that must not be chosen.  */
+static bool
+nds32_check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg,
+		       struct du_head *this_head, HARD_REG_SET this_unavailable)
+{
+  enum machine_mode mode = GET_MODE (*this_head->first->loc);
+  int nregs = hard_regno_nregs[new_reg][mode];
+  int i;
+  struct du_chain *tmp;
+  /* Reject NEW_REG if any hard register it occupies in MODE is unusable.  */
+  for (i = nregs - 1; i >= 0; --i)
+    if (TEST_HARD_REG_BIT (this_unavailable, new_reg + i)
+	|| fixed_regs[new_reg + i]
+	|| global_regs[new_reg + i]
+	/* Can't use regs which aren't saved by the prologue.  */
+	|| (! df_regs_ever_live_p (new_reg + i)
+	    && ! call_used_regs[new_reg + i])
+#ifdef LEAF_REGISTERS
+	/* We can't use a non-leaf register if we're in a
+	   leaf function.  */
+	|| (crtl->is_leaf
+	    && !LEAF_REGISTERS[new_reg + i])
+#endif
+#ifdef HARD_REGNO_RENAME_OK
+	|| ! HARD_REGNO_RENAME_OK (reg + i, new_reg + i)
+#endif
+	)
+      return false;
+
+  /* See whether it accepts all modes that occur in definition and uses,
+     and adds no call-part-clobbering when a caller-save reg is needed.  */
+  for (tmp = this_head->first; tmp; tmp = tmp->next_use)
+    if ((! HARD_REGNO_MODE_OK (new_reg, GET_MODE (*tmp->loc))
+	 && ! DEBUG_INSN_P (tmp->insn))
+	|| (this_head->need_caller_save_reg
+	    && ! (HARD_REGNO_CALL_PART_CLOBBERED
+		  (reg, GET_MODE (*tmp->loc)))
+	    && (HARD_REGNO_CALL_PART_CLOBBERED
+		(new_reg, GET_MODE (*tmp->loc)))))
+      return false;
+
+  return true;
+}
+
+/* Return NEW_REG if the chain THIS_HEAD can be renamed to it, otherwise
+   OLD_REG.  Registers outside GENERAL_REGS, call-used registers (when the
+   chain needs a call-saved one) and conflicting registers are excluded.  */
+static int
+nds32_find_best_rename_reg (du_head_p this_head, int new_reg, int old_reg)
+{
+  HARD_REG_SET unavailable;
+
+  /* Only GENERAL_REGS qualify; the chain's own register stays allowed.  */
+  COMPL_HARD_REG_SET (unavailable, reg_class_contents[GENERAL_REGS]);
+  CLEAR_HARD_REG_BIT (unavailable, this_head->regno);
+
+  /* If the chain needs a call-saved register, mark the call-used
+     registers as unavailable.  */
+  if (this_head->need_caller_save_reg)
+    IOR_HARD_REG_SET (unavailable, call_used_reg_set);
+
+  /* Mark registers that overlap this chain's lifetime as unavailable.  */
+  nds32_merge_overlapping_regs (&unavailable, this_head);
+
+  return (nds32_check_new_reg_p (old_reg, new_reg, this_head, unavailable)
+	  ? new_reg : old_reg);
+}
+
+/* Return true if nds32_find_best_rename_reg, asked about BEST_REG for the
+   chain of operand OP_POS of INSN, picks a register different from the
+   one the chain uses now.  Return false when there is no operand info,
+   no chain, or the chain cannot be renamed.  This function itself does
+   not rename anything.  */
+static bool
+nds32_try_rename_reg (rtx_insn *insn, unsigned op_pos, unsigned best_reg)
+{
+  insn_rr_info *info = &insn_rr[INSN_UID (insn)];
+  du_head_p op_chain;
+  unsigned oldreg, newreg;
+
+  /* No operand information or no chain for this operand: give up.  */
+  if (info->op_info == NULL || info->op_info[op_pos].n_chains == 0)
+    return false;
+
+  /* Look up the chain through the id of the operand's first head.  */
+  op_chain = regrename_chain_from_id (info->op_info[op_pos].heads[0]->id);
+
+  if (op_chain->cannot_rename)
+    return false;
+
+  /* The helper hands back the old register when BEST_REG is rejected.  */
+  oldreg = op_chain->regno;
+  newreg = nds32_find_best_rename_reg (op_chain, best_reg, oldreg);
+
+  return newreg != oldreg;
+}
+
+/* Fill AVAILABLE_GROUP with runs of two or more consecutive usable regs.  */
+static void
+nds32_group_available_reg (HARD_REG_SET *available_regset, enum reg_class clazz,
+			   std::vector <available_reg_info_t> *available_group)
+{
+  hard_reg_set_iterator hrsi;
+  unsigned regno, pre_regno = 0;
+  unsigned count = 0;
+  available_reg_info_t reg_info;
+  std::vector<available_reg_info_t>::iterator it;
+
+  if (!available_group->empty ())
+    available_group->clear ();
+  /* Walk the registers of CLAZZ, starting the search at regno 2.
+     NOTE(review): this was documented as "$r16 to $r31" -- confirm.  */
+  EXECUTE_IF_SET_IN_HARD_REG_SET (reg_class_contents[clazz], 2, regno, hrsi)
+    {
+      /* Usable: in AVAILABLE_REGSET and either call-used or ever live.  */
+      if (TEST_HARD_REG_BIT (*available_regset, regno)
+	  && (call_used_regs[regno] || df_regs_ever_live_p (regno)))
+	{
+	  if (pre_regno == 0
+	      || (pre_regno + 1) == regno)
+	    count++;
+	}
+      else
+	{
+	  if (count >= 2)
+	    {
+	      reg_info.amount = count;
+	      reg_info.end = pre_regno;
+	      reg_info.start = pre_regno - count + 1;
+	      available_group->push_back (reg_info);
+	    }
+	  count = 0;
+	}
+      pre_regno = regno;
+    }
+  /* NOTE(review): a run still open at loop end is never pushed -- confirm.  */
+  sort (available_group->begin(), available_group->end(), compare_amount);
+
+  if (dump_file)
+    {
+      for (it = available_group->begin();
+	   it != available_group->end(); ++it)
+	fprintf (dump_file,
+		 "available amount = %d start = %d "
+		 "end = %d \n", it->amount, it->start,
+		 it->end);
+    }
+}
+
+/* Choose renames that make INSN's registers consecutive, into RENAME_INSN.  */
+static void
+nds32_find_reg (load_store_infos_t *insn, load_store_infos_t *rename_insn,
+		HARD_REG_SET *available_regset)
+{
+  int can_rename_number;
+  unsigned i, regno, amount;
+  unsigned op_pos = (*insn)[0].load_p ? 0 : 1;
+  auto_vec<load_store_info_t, 64> temp_set;
+  std::vector<available_reg_info_t> available_group;
+  std::vector<available_reg_info_t>::iterator it;
+  auto_vec<load_store_info_t, 64> down_set, up_set;
+  unsigned int down_num = 0, up_num = 0;
+  long offset;
+  int m;
+
+  /* Work on a copy ('temp_set') so that sorting below does not change
+     the element order of 'insn'.  */
+  for (i = 0; i < insn->length (); i++)
+    temp_set.safe_push ((*insn)[i]);
+
+  if (temp_set[0].post_type == NDS32_NONE)
+    temp_set.qsort (compare_offset);
+
+  nds32_group_available_reg (available_regset, GENERAL_REGS, &available_group);
+
+  /* Check rename register from top insn to bottom insn,
+     and avoid using fp, sp, lp, gp registers.  */
+  regno = REGNO (temp_set[0].reg);
+  can_rename_number = regno + temp_set.length () - 1;
+  offset = temp_set[0].offset;
+
+  if (can_rename_number < FP_REGNUM)
+    for (i = 1; i < temp_set.length (); ++i)
+      {
+	/* Find this case:
+	     lwi $r0, [$r2 + 4]
+	     lwi $r3, [$r2 + 8]
+
+	   Rename $r3 to $r1.  */
+	down_num++;
+	if ((regno + i) != REGNO (temp_set[i].reg))
+	  {
+	    if (nds32_try_rename_reg (temp_set[i].insn, op_pos, regno + i))
+	      {
+		/* Store in temporary set.  */
+		down_set.safe_push (temp_set[i]);
+		down_set.last ().new_reg = regno + i;
+	      }
+	    else
+	      /* Stop when the register sequence is broken.  */
+	      break;
+	  }
+      }
+
+  /* Check rename register from bottom insn to top insn,
+     and avoid using fp, sp, lp, gp registers.  */
+  regno = REGNO (temp_set.last ().reg);
+  can_rename_number = regno - temp_set.length () + 1;
+
+  if (can_rename_number > 0 && regno < FP_REGNUM)
+    for (i = temp_set.length () - 1; i > 0; --i)
+      {
+	/* Find this case:
+	     lwi $r1, [$r2 + 4]
+	     lwi $r4, [$r2 + 8]
+
+	   Rename $r1 to $r3.  */
+	up_num++;
+	if ((regno - i) != REGNO (temp_set[i - 1].reg))
+	  {
+	    if (nds32_try_rename_reg (temp_set[i - 1].insn, op_pos, regno - i))
+	      {
+		/* Store in temporary set.  */
+		up_set.safe_push (temp_set[i - 1]);
+		up_set.last ().new_reg = regno - i;
+	      }
+	    else
+	      /* Stop when the register sequence is broken.  */
+	      break;
+	  }
+      }
+
+  /* Rename for the longest sequence.  */
+  /* The overhead of zero offset instruction is lowest, so try it first.  */
+  if ((offset == 0 || down_num >= up_num) && !down_set.is_empty ())
+    {
+      for (m = down_set.length () - 1; m >= 0; --m)
+	{
+	  regno = REGNO (down_set[m].reg);
+	  CLEAR_HARD_REG_BIT (*available_regset, regno);
+	  rename_insn->safe_push (down_set[m]);
+	}
+      nds32_group_available_reg (available_regset, GENERAL_REGS,
+				 &available_group);
+      return;
+    }
+  else if (up_num >= down_num && !up_set.is_empty ())
+    {
+      for (m = up_set.length () - 1; m >= 0; --m)
+	{
+	  regno = REGNO (up_set[m].reg);
+	  CLEAR_HARD_REG_BIT (*available_regset, regno);
+	  rename_insn->safe_push (up_set[m]);
+	}
+      nds32_group_available_reg (available_regset, GENERAL_REGS,
+				 &available_group);
+      return;
+    }
+  /* Otherwise fall back to the available table; give up if it is empty.  */
+  else if (available_group.empty ())
+    return;
+
+  amount = available_group.begin ()->amount;
+  /* Using the minimum number, as the rename amount.  */
+  if (amount > temp_set.length ())
+    amount = temp_set.length ();
+
+  /* Using most available register number to rename.  */
+  regno = available_group.begin ()->start;
+  for (i = 0; i < amount; ++i)
+    {
+      if (nds32_try_rename_reg (temp_set[i].insn, op_pos, regno))
+	{
+	  rename_insn->safe_push (temp_set[i]);
+	  rename_insn->last ().new_reg = regno;
+	  CLEAR_HARD_REG_BIT (*available_regset, regno);
+	  regno++;
+	}
+      else
+	/* Stop when the register sequence is broken.  */
+	break;
+    }
+
+  /* Check length here because the whole sequence entries
+     have to be renamed.  */
+  if (rename_insn->length () > 1)
+    {
+      /* Update available table.  */
+      nds32_group_available_reg (available_regset, GENERAL_REGS,
+				 &available_group);
+      return;
+    }
+
+  /* Using all available register to rename each insn.  */
+  for (i = 0; i < (temp_set.length () - 1); i += 2)
+    {
+      for (it = available_group.begin();
+	   it != available_group.end(); ++it)
+	{
+	  bool change_p = false;
+	  unsigned int j;
+	  regno = it->start;
+
+	  /* Two insns per step.  NOTE(review): J is unused in the body.  */
+	  for (j = regno; j < (it->end + 1); j += 2)
+	    {
+	      if (nds32_try_rename_reg (temp_set[i].insn, op_pos, regno)
+		  && nds32_try_rename_reg (temp_set[i + 1].insn,
+					   op_pos, regno + 1))
+		{
+		  rename_insn->safe_push (temp_set[i]);
+		  rename_insn->last ().new_reg = regno;
+		  CLEAR_HARD_REG_BIT (*available_regset, regno);
+
+		  rename_insn->safe_push (temp_set[i + 1]);
+		  rename_insn->last ().new_reg = regno + 1;
+		  CLEAR_HARD_REG_BIT (*available_regset, regno + 1);
+		  change_p = true;
+		  break;
+		}
+	    }
+
+	  if (change_p)
+	    {
+	      nds32_group_available_reg (available_regset, GENERAL_REGS,
+					 &available_group);
+	      break;
+	    }
+	}
+    }
+}
+
+static void
+nds32_rename_reg (rtx_insn *insn, unsigned op_pos, unsigned newreg)
+{
+  /* Rename the register chain headed by operand OP_POS of INSN to hard
+     register NEWREG, dumping INSN before and after when requested.  */
+  insn_rr_info *rr = &insn_rr[INSN_UID (insn)];
+  du_head_p chain
+    = regrename_chain_from_id (rr->op_info[op_pos].heads[0]->id);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Try to rename operand %d to %d:\n",
+	       op_pos, newreg);
+      print_rtl_single (dump_file, insn);
+    }
+
+  /* Hand the chain to regrename to perform the replacement.  */
+  regrename_do_replace (chain, newreg);
+
+  /* Print the insn again after the replacement call.  */
+  if (dump_file)
+    print_rtl_single (dump_file, insn);
+}
+
+/* Combine pairs of load/store insns in LOAD_STORE_INFO into lmw/smw insns.  */
+static void
+nds32_combine_bi_insn (load_store_infos_t *load_store_info)
+{
+  auto_vec<load_store_info_t, 64> candidate_set, bi_set;
+  unsigned int i, j, regno;
+
+  bool load_insn_p;
+  enum nds32_memory_post_type post_type;
+
+  for (i = 0; i < load_store_info->length (); ++i)
+    {
+      /* Record the original order and initial place of each insn.  */
+      (*load_store_info)[i].order = i;
+      (*load_store_info)[i].place = false;
+      candidate_set.safe_push ((*load_store_info)[i]);
+    }
+
+  for (i = 0; i < candidate_set.length (); ++i)
+    {
+      load_insn_p = candidate_set[i].load_p;
+      post_type = candidate_set[i].post_type;
+      regno = REGNO (candidate_set[i].reg);
+
+      for (j = i + 1; j < candidate_set.length (); ++j)
+	{
+	  if ((post_type == candidate_set[j].post_type)
+	      && (load_insn_p == candidate_set[j].load_p)
+	      && ((regno + 1) == REGNO (candidate_set[j].reg)))
+	    {
+	      bi_set.safe_push (candidate_set[i]);
+	      bi_set.safe_push (candidate_set[j]);
+
+	      if (nds32_combine_multiple_p (&bi_set, false)
+		  && nds32_base_reg_safe_p (&bi_set)
+		  && nds32_lmwsmw_insert_place (&bi_set) != NULL_RTX)
+		{
+		  rtx place = nds32_lmwsmw_insert_place (&bi_set);
+		  rtx base_reg = bi_set[0].base_reg;
+		  /* I and J index candidate_set, not the two-entry bi_set.  */
+		  nds32_emit_multiple_insn (&bi_set, base_reg, place, true);
+		  delete_insn (candidate_set[i].insn);
+		  delete_insn (candidate_set[j].insn);
+		  candidate_set.ordered_remove (j);
+		  bi_set.block_remove (0, bi_set.length ());
+		  break;
+		}
+
+	      bi_set.block_remove (0, bi_set.length ());
+	    }
+	}
+    }
+}
+
+/* Combine multiple load/store insns into lmw/smw insns.  */
+static void
+nds32_combine_load_store_insn (load_store_infos_t *load_store_info,
+			       HARD_REG_SET *available_regset)
+{
+  auto_vec<load_store_info_t, 64> candidate_set, main_set, temp_set;
+  auto_vec<load_store_info_t, 64> first_set, second_set;
+  HOST_WIDE_INT current_offset, last_offset = 0, add_offset = 0;
+  unsigned int i, j, regno;
+  int group_num = 0, group_id;
+  bool load_insn_p;
+  bool new_base_p = false;
+  bool prev_bim_p = false;
+  bool inc_p = true, dec_p = true;
+  rtx new_base_reg = NULL_RTX;
+  rtx base_reg = (*load_store_info)[0].base_reg;
+  rtx place;
+  unsigned new_base_regnum;
+
+  /* Get available register to add offset for first instruction.  */
+  new_base_regnum = find_available_reg (available_regset, GENERAL_REGS);
+  if (new_base_regnum != INVALID_REGNUM)
+    {
+      CLEAR_HARD_REG_BIT (*available_regset, new_base_regnum);
+      new_base_reg = gen_rtx_REG (Pmode, new_base_regnum);
+      /* Copy attributes from base register to new base register.  */
+      ORIGINAL_REGNO (new_base_reg) =
+	ORIGINAL_REGNO ((*load_store_info)[0].base_reg);
+      REG_ATTRS (new_base_reg) = REG_ATTRS ((*load_store_info)[0].base_reg);
+      new_base_p = true;
+
+      if (dump_file)
+	fprintf (dump_file, "Have new base register: %d\n", new_base_regnum);
+    }
+
+  /* Record the original order and initial place of each instruction.  */
+  for (i = 0; i < load_store_info->length (); ++i)
+    {
+      (*load_store_info)[i].order = i;
+      (*load_store_info)[i].place = false;
+    }
+
+  /* Fetch first instruction information from 'load_store_info',
+     we will use first instruction as base, to search next instruction.  */
+  candidate_set.safe_push ((*load_store_info)[0]);
+  /* Set offset, regno, load_p state from candidate_set.  */
+  current_offset = candidate_set[0].offset;
+  regno = REGNO (candidate_set[0].reg);
+  load_insn_p = candidate_set[0].load_p;
+  /* Set first instruction group ID,
+     the group ID mark instruction for the same group.  */
+  candidate_set[0].group = group_num;
+
+  /* Search instructions can be combined to a lmw/smw instruction.  */
+  for (i = 1; i < load_store_info->length (); ++i)
+    {
+      /* Collect insns whose register number and offset increase,
+	 for example:
+
+	   lwi $r0, [$r22 + 4]  <- base instruction
+	   lwi $r1, [$r22 + 8]  <- collect object
+
+	 The collect object (regno + 1), (offset + 4)
+	 from base instruction.  */
+      if ((current_offset == (*load_store_info)[i].offset - 4)
+	  && ((regno + 1) == REGNO ((*load_store_info)[i].reg))
+	  && (load_insn_p == (*load_store_info)[i].load_p)
+	  && inc_p)
+	{
+	  /* Give instruction group ID.  */
+	  (*load_store_info)[i].group = group_num;
+	  /* Save instruction.  */
+	  candidate_set.safe_push ((*load_store_info)[i]);
+	  /* Update state, next register number and offset.  */
+	  regno = REGNO ((*load_store_info)[i].reg);
+	  current_offset += 4;
+	  /* Close decrease type, search increase type.  */
+	  dec_p = false;
+	}
+      /* Collect insns whose register number and offset decrease,
+	 for example:
+
+	   lwi $r2, [$r22 + 8]  <- base instruction
+	   lwi $r1, [$r22 + 4]  <- collect object
+
+	 The collect object (regno - 1), (offset - 4)
+	 from base instruction.  */
+      else if ((current_offset == (*load_store_info)[i].offset + 4)
+	       && ((regno - 1) == REGNO ((*load_store_info)[i].reg))
+	       && (load_insn_p == (*load_store_info)[i].load_p)
+	       && dec_p)
+	{
+	  /* Give instruction group ID.  */
+	  (*load_store_info)[i].group = group_num;
+	  /* Save instruction.  */
+	  candidate_set.safe_push ((*load_store_info)[i]);
+
+	  /* Update state, next register number and offset.  */
+	  regno = REGNO ((*load_store_info)[i].reg);
+	  current_offset -= 4;
+	  /* Close increase type, search decrease type.  */
+	  inc_p = false;
+	}
+      else
+	{
+	  inc_p = true;
+	  dec_p = true;
+	}
+
+      /* Instruction collection is complete.  */
+      if ((inc_p && dec_p)
+          || (i + 1) == load_store_info->length ())
+	{
+	  /* Filter candidate instructions.  */
+	  if (nds32_combine_multiple_p (&candidate_set, new_base_p)
+	      && nds32_base_reg_safe_p (&candidate_set)
+	      && nds32_lmwsmw_insert_place (&candidate_set) != NULL_RTX)
+	    {
+	      /* Store candidate instructions to 'main_set'.  */
+	      for (j = 0; j < candidate_set.length (); j++)
+		main_set.safe_push (candidate_set[j]);
+	    }
+
+	  /* Scan to the last instruction, it is complete.  */
+	  if ((i + 1) == load_store_info->length ())
+	    break;
+
+	  /* Clean candidate_set sequence.  */
+	  candidate_set.block_remove (0, candidate_set.length ());
+	  /* Reinitialize first instruction information
+	     to search next instruction.  */
+	  candidate_set.safe_push ((*load_store_info)[i]);
+	  /* Update group number for next sequence.  */
+	  group_num ++;
+	  /* Set offset, regno, load_p state from candidate_set.  */
+	  current_offset = candidate_set.last ().offset;
+	  regno = REGNO (candidate_set.last ().reg);
+	  load_insn_p = candidate_set.last ().load_p;
+	  candidate_set.last ().group = group_num;
+	}
+      else if (!nds32_base_reg_safe_p (&candidate_set)
+	       || nds32_lmwsmw_insert_place (&candidate_set) == NULL_RTX)
+	{
+	  /* The newest instruction made the group unsafe: keep the first
+	     (n - 1) instructions as a group, and let the last
+	     instruction start the next group.  */
+	  for (j = 0; j < (candidate_set.length () - 1); j++)
+	    temp_set.safe_push (candidate_set[j]);
+
+	  /* Store candidate instructions to 'main_set'.  */
+	  if (nds32_combine_multiple_p (&temp_set, new_base_p))
+	    {
+	      for (j = 0; j < (temp_set.length ()); j++)
+		main_set.safe_push (temp_set[j]);
+	    }
+
+	  /* Clean temp_set sequence.  */
+	  temp_set.block_remove (0, temp_set.length ());
+	  /* Clean candidate_set sequence.  */
+	  candidate_set.block_remove (0, (candidate_set.length () - 1));
+	  /* Update group number for next sequence.  */
+	  group_num ++;
+	  /* Set offset, regno, load_p state from candidate_set.  */
+	  current_offset = candidate_set.last ().offset;
+	  regno = REGNO (candidate_set.last ().reg);
+	  load_insn_p = candidate_set.last ().load_p;
+	  candidate_set.last ().group = group_num;
+	  /* Reset it for search increase and decrease type.  */
+	  inc_p = true;
+	  dec_p = true;
+	}
+    }
+
+  if (dump_file)
+    {
+      if (!main_set.is_empty ())
+	fprintf (dump_file,"Do lmwsmw instructions:\n");
+      for (i = 0; i < main_set.length (); ++i)
+	{
+	  fprintf (dump_file,
+		   "regno = %d base_regno = %d "
+		   "offset = " HOST_WIDE_INT_PRINT_DEC " "
+		   "load_p = %d UID = %u group = %d,"
+		   " order = %d, place = %d\n",
+		   REGNO (main_set[i].reg),
+		   REGNO (main_set[i].base_reg),
+		   main_set[i].offset,
+		   main_set[i].load_p,
+		   INSN_UID (main_set[i].insn),
+		   main_set[i].group,
+		   main_set[i].order,
+		   main_set[i].place);
+	}
+    }
+
+  /* Fetch first group instruction from main_set.  */
+  if (!main_set.is_empty ())
+    {
+      /* Sort main_set by offset.  */
+      main_set.qsort (compare_offset);
+
+      group_id = main_set[0].group;
+      nds32_fetch_group_insn (&main_set, &first_set, group_id);
+      last_offset = first_set.last ().offset;
+    }
+
+  /* Main loop emitting lmw/smw instructions.  */
+  while (!main_set.is_empty ())
+    {
+      /* Get second group ID.  */
+      group_id = main_set[0].group;
+      for (i = 0; i < main_set.length (); ++i)
+	{
+	  /* Prefer a second group whose offset
+	     continues the first group.  */
+	  if ((last_offset + 4) == main_set[i].offset)
+	    {
+	      group_id = main_set[i].group;
+	      break;
+	    }
+	}
+
+      /* Fetch second instruction group.  */
+      nds32_fetch_group_insn (&main_set, &second_set, group_id);
+      /* Get lmw/smw insert place.  */
+      place = nds32_lmwsmw_insert_place (&first_set);
+
+      /* Adjust address offset, because lmw/smw instructions
+	 only allow a zero offset.
+	   example:
+	    lwi $r0, [$r3 + 4]
+	    lwi $r1, [$r3 + 8]
+	    lwi $r2, [$r3 + 12]
+
+	    combine into
+
+	    addi $r3, $r3, 4
+	    lwm.bi(m) $r0, [$r3], $r2
+
+	 Need addi instruction to handle offset.  */
+      if (first_set[0].offset != 0 && !prev_bim_p)
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Use addi insn handle offset: "
+		     "" HOST_WIDE_INT_PRINT_DEC "\n",
+		     first_set[0].offset);
+	  /* Use available register to process offset,
+	     and don't recover base register value.  */
+	  if (new_base_p)
+	    {
+	      base_reg = new_base_reg;
+	      add_offset = 0;
+	      CLEAR_HARD_REG_BIT (*available_regset, new_base_regnum);
+	    }
+	  else
+	    add_offset = first_set[0].offset;
+
+	  nds32_emit_add_insn (first_set[0], base_reg, place, true);
+	}
+
+      if (nds32_use_bim_p (&first_set, &second_set))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Generate BIM form.\n");
+
+	  nds32_emit_multiple_insn (&first_set, base_reg, place, true);
+
+	  /* Update status, for next instruction sequence.
+	     The add_offset need add 4, because the instruction
+	     is post increase.  */
+	  add_offset = first_set.last ().offset + 4;
+	  prev_bim_p = true;
+	}
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Generate BI form.\n");
+
+	  nds32_emit_multiple_insn (&first_set, base_reg, place, false);
+
+	  if (add_offset != 0)
+	    {
+	      if (dump_file)
+		fprintf (dump_file, "Use addi insn handle -offset: "
+			 "" HOST_WIDE_INT_PRINT_DEC "\n",
+			 add_offset);
+
+	      nds32_emit_add_insn (first_set[0], base_reg, place, false);
+	      add_offset = 0;
+	    }
+	  prev_bim_p = false;
+
+	  /* Recover base register for next instruction sequence.  */
+	  if (REGNO (base_reg) != REGNO (first_set[0].base_reg))
+	    base_reg = first_set[0].base_reg;
+	}
+
+      /* Delete insn, replace by lmw/smw instruction.  */
+      for (i = 0; i < first_set.length (); ++i)
+	delete_insn (first_set[i].insn);
+
+      /* Clean first_set for store next instruction group.  */
+      first_set.block_remove (0, first_set.length ());
+      /* Store next instruction group.  */
+      for (i = 0; i < second_set.length (); ++i)
+	first_set.safe_insert (i, second_set[i]);
+
+      /* Clean second_set.  */
+      second_set.block_remove (0, second_set.length ());
+
+      /* Update last_offset for search next group.  */
+      last_offset = first_set.last ().offset;
+    }
+
+  /* Processing the last instruction group.  */
+  if (!first_set.is_empty ())
+    {
+      /* Get lmw/smw insert place.  */
+      place = nds32_lmwsmw_insert_place (&first_set);
+
+      if (first_set[0].offset != 0 && !prev_bim_p)
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Use addi insn handle offset: "
+		     "" HOST_WIDE_INT_PRINT_DEC "\n",
+		     first_set[0].offset);
+
+	  if (new_base_p)
+	    {
+	      base_reg = new_base_reg;
+	      add_offset = 0;
+	    }
+	  else
+	    add_offset = first_set[0].offset;
+
+	  nds32_emit_add_insn (first_set[0], base_reg, place, true);
+	}
+
+      if (dump_file)
+	fprintf (dump_file, "Generate BI form.\n");
+
+      nds32_emit_multiple_insn (&first_set, base_reg, place, false);
+
+      if (add_offset != 0)
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Use addi insn handle -offset: "
+		     "" HOST_WIDE_INT_PRINT_DEC "\n",
+		     -add_offset);
+
+	  nds32_emit_add_insn (first_set[0], base_reg, place, false);
+	}
+
+      /* Delete insn, replace by lmw/smw instruction.  */
+      for (i = 0; i < first_set.length (); ++i)
+	delete_insn (first_set[i].insn);
+    }
+}
+
+/* Rename registers of paired load/store insns found in LOAD_STORE_INFO.  */
+static void
+nds32_rename_bi_insn (load_store_infos_t *load_store_info,
+		       HARD_REG_SET *available_regset)
+{
+  auto_vec<load_store_info_t, 64> candidate_set, bi_set, replace_set;
+  unsigned int i, j;
+
+  bool load_insn_p;
+  enum nds32_memory_post_type post_type;
+
+  for (i = 0; i < load_store_info->length (); ++i)
+    {
+      /* Record the original order and initial place of each insn.  */
+      (*load_store_info)[i].order = i;
+      (*load_store_info)[i].place = false;
+      candidate_set.safe_push ((*load_store_info)[i]);
+    }
+
+  for (i = 0; i < candidate_set.length (); ++i)
+    {
+      load_insn_p = candidate_set[i].load_p;
+      post_type = candidate_set[i].post_type;
+
+      for (j = i + 1; j < candidate_set.length (); ++j)
+	{
+	  if ((post_type == candidate_set[j].post_type)
+	      && (load_insn_p == candidate_set[j].load_p))
+	    {
+	      bi_set.safe_push (candidate_set[i]);
+	      bi_set.safe_push (candidate_set[j]);
+
+	      if (nds32_combine_multiple_p (&bi_set, false)
+		  && nds32_base_reg_safe_p (&bi_set)
+		  && nds32_lmwsmw_insert_place (&bi_set) != NULL_RTX)
+		{
+		  nds32_find_reg (&bi_set, &replace_set, available_regset);
+
+		  if (!replace_set.is_empty ())
+		    {
+		      unsigned k;
+		      unsigned op_pos = replace_set[0].load_p ? 0 : 1;
+
+		      /* Do rename register.  */
+		      for (k = 0; k < replace_set.length (); ++k)
+			nds32_rename_reg (replace_set[k].insn, op_pos,
+					  replace_set[k].new_reg);
+
+		      replace_set.block_remove (0, replace_set.length ());
+		    }
+
+		  candidate_set.ordered_remove (j);
+		  bi_set.block_remove (0, bi_set.length ());
+		  break;
+		}
+
+	      bi_set.block_remove (0, bi_set.length ());
+	    }
+	}
+    }
+}
+
+/* Rename registers so that multiple load/store insns can be combined.  */
+static void
+nds32_rename_load_store_reg (load_store_infos_t *load_store_info,
+			     HARD_REG_SET *available_regset)
+{
+  auto_vec<load_store_info_t, 64> rename_set, temp_set, replace_set;
+  HOST_WIDE_INT current_offset;
+  unsigned int i, j;
+  bool load_insn_p;
+  bool inc_p = true, dec_p = true;
+
+  /* Record the original order and initial place of each instruction.  */
+  for (i = 0; i < load_store_info->length (); ++i)
+    {
+      (*load_store_info)[i].order = i;
+      (*load_store_info)[i].place = false;
+    }
+
+  /* Fetch first instruction information from 'load_store_info',
+     we will use first instruction as base, to search next instruction.  */
+  rename_set.safe_push ((*load_store_info)[0]);
+  /* Set offset, load_p state from rename_set.  */
+  current_offset = rename_set[0].offset;
+  load_insn_p = rename_set[0].load_p;
+
+  /* Search instructions can be combined to a lmw/smw instruction.  */
+  for (i = 1; i < load_store_info->length (); ++i)
+    {
+      /* Collect insns whose offset increases, for example:
+
+	   lwi pseudo_reg, [$r22 + 4]  <- base instruction
+	   lwi pseudo_reg, [$r22 + 8]  <- collect object
+
+	 The collect object (offset + 4) from base instruction.  */
+      if ((current_offset == (*load_store_info)[i].offset - 4)
+	  && (load_insn_p == (*load_store_info)[i].load_p)
+	  && inc_p)
+	{
+	  /* Save instruction.  */
+	  rename_set.safe_push ((*load_store_info)[i]);
+	  /* Update offset.  */
+	  current_offset += 4;
+	  /* Close decrease type, search increase type.  */
+	  dec_p = false;
+	}
+      /* Collect insns whose offset decreases, for example:
+
+	   lwi pseudo_reg, [$r22 + 8]  <- base instruction
+	   lwi pseudo_reg, [$r22 + 4]  <- collect object
+
+	 The collect object (offset - 4) from base instruction.  */
+      else if ((current_offset == (*load_store_info)[i].offset + 4)
+	       && (load_insn_p == (*load_store_info)[i].load_p)
+	       && dec_p)
+	{
+	  /* Save instruction.  */
+	  rename_set.safe_push ((*load_store_info)[i]);
+
+	  /* Update offset.  */
+	  current_offset -= 4;
+	  /* Close increase type, search decrease type.  */
+	  inc_p = false;
+	}
+      else
+	{
+	  inc_p = true;
+	  dec_p = true;
+	}
+
+      /* Instruction collection is complete.  */
+      if ((inc_p && dec_p)
+	  || (i + 1) == load_store_info->length ())
+	{
+	  /* Check whether the registers can be renamed.  */
+	  if (nds32_combine_multiple_p (&rename_set, false)
+	      && nds32_base_reg_safe_p (&rename_set)
+	      && nds32_lmwsmw_insert_place (&rename_set) != NULL_RTX)
+	    {
+	      /* Find renameable instructions, store them in 'replace_set'.  */
+	      nds32_find_reg (&rename_set, &replace_set, available_regset);
+
+	      if (!replace_set.is_empty ())
+		{
+		  unsigned op_pos = replace_set[0].load_p ? 0 : 1;
+
+		  /* Do rename register.  */
+		  for (j = 0; j < replace_set.length (); ++j)
+		    nds32_rename_reg (replace_set[j].insn, op_pos,
+				      replace_set[j].new_reg);
+
+		  replace_set.block_remove (0, replace_set.length ());
+		}
+	    }
+
+	  /* Scan to the last instruction, it is complete.  */
+	  if ((i + 1) == load_store_info->length ())
+	    break;
+
+	  /* Clean rename_set sequence.  */
+	  rename_set.block_remove (0, rename_set.length ());
+	  /* Reinitialize first instruction information
+	     to search next instruction.  */
+	  rename_set.safe_push ((*load_store_info)[i]);
+	  /* Set offset, load_p state from rename_set.  */
+	  current_offset = rename_set.last ().offset;
+	  load_insn_p = rename_set.last ().load_p;
+	}
+      else if (!nds32_base_reg_safe_p (&rename_set)
+	       || nds32_lmwsmw_insert_place (&rename_set) == NULL_RTX)
+	{
+	  /* The newest instruction made the group unsafe: keep the first
+	     (n - 1) instructions as a group, and let the last
+	     instruction be the first instruction of the next group.  */
+	  for (j = 0; j < (rename_set.length () - 1); j++)
+	    temp_set.safe_push (rename_set[j]);
+
+	  if (nds32_combine_multiple_p (&temp_set, false))
+	    {
+	      /* Find renameable instructions, store them in 'replace_set'.  */
+	      nds32_find_reg (&temp_set, &replace_set, available_regset);
+
+	      if (!replace_set.is_empty ())
+		{
+		  unsigned op_pos = replace_set[0].load_p ? 0 : 1;
+
+		  /* Do rename register.  */
+		  for (j = 0; j < replace_set.length (); ++j)
+		    nds32_rename_reg (replace_set[j].insn, op_pos,
+				      replace_set[j].new_reg);
+
+		  replace_set.block_remove (0, replace_set.length ());
+		}
+	    }
+
+	  /* Clean temp_set sequence.  */
+	  temp_set.block_remove (0, temp_set.length ());
+	  /* Clean rename_set sequence.  */
+	  rename_set.block_remove (0, (rename_set.length () - 1));
+	  /* Set offset, load_p state from rename_set.  */
+	  current_offset = rename_set.last ().offset;
+	  load_insn_p = rename_set.last ().load_p;
+	  /* Reset it for search increase and decrease type.  */
+	  inc_p = true;
+	  dec_p = true;
+	}
+    }
+}
+
+/* Chain load/store insns of BB per base register, then rename or combine.  */
+static void
+nds32_do_lmwsmw_opt (basic_block bb, bool rename_p)
+{
+  rtx_insn *insn;
+  HARD_REG_SET available_regset;
+  load_store_info_t load_store_info;
+  auto_vec<load_store_info_t, 64> load_store_infos[NDS32_GPR_NUM];
+  auto_vec<load_store_info_t, 64> plus_infos[NDS32_GPR_NUM];
+  auto_vec<load_store_info_t, 64> post_infos[NDS32_GPR_NUM];
+  int i;
+  unsigned j;
+  unsigned regno;
+  unsigned polluting;
+  df_ref def;
+  /* Dirty means a register is defined again after the
+     first load/store instruction.
+     For example:
+     lwi $r2, [$r3 + #0x100]
+     mov $r3, $r4            ! $r3 is dirty after this instruction.
+     lwi $r1, [$r3 + #0x120] ! so this load can't chain with prev load.
+   */
+  bool dirty[NDS32_GPR_NUM];
+
+  if (dump_file)
+    fprintf (dump_file, "scan bb %d\n", bb->index);
+
+  for (i = 0; i < NDS32_GPR_NUM; ++i)
+    dirty[i] = false;
+
+  FOR_BB_INSNS (bb, insn)
+    {
+      if (!INSN_P (insn))
+	continue;
+
+      polluting = INVALID_REGNUM;
+
+      /* Set reg dirty if its chain is not empty.  NOTE(review): uses, not defs?  */
+      FOR_EACH_INSN_USE (def, insn)
+	{
+	  regno = DF_REF_REGNO (def);
+
+	  if (!NDS32_IS_GPR_REGNUM (regno))
+	    continue;
+
+	  if (!load_store_infos[regno].is_empty ())
+	    {
+	      /* Set polluting here because the source register
+		 may be the same one.  */
+	      if (dirty[regno] == false)
+		polluting = regno;
+
+	      dirty[regno] = true;
+	    }
+	}
+
+      /* Mark every caller-saved register with a non-empty chain as dirty.  */
+      if (CALL_P (insn))
+	{
+	  for (i = 0; i < NDS32_GPR_NUM; ++i)
+	    {
+	      if (call_used_regs[i] && !load_store_infos[i].is_empty ())
+		dirty[i] = true;
+	    }
+	}
+
+      if (nds32_load_store_reg_plus_offset (insn, &load_store_info))
+	{
+	  regno = REGNO (load_store_info.base_reg);
+	  gcc_assert (NDS32_IS_GPR_REGNUM (regno));
+
+	  /* Stop scanning the block if this base register is dirty.  */
+	  if (dirty[regno] && polluting != regno)
+	    break;
+
+	  /* If the register is first time to be used and be polluted
+	     right away, we don't push it.  */
+	  if (regno == REGNO (load_store_info.reg) && load_store_info.load_p
+	      && dirty[regno] == false)
+	    continue;
+
+	  load_store_infos[regno].safe_push (load_store_info);
+	}
+    }
+
+   for (i = 0; i < NDS32_GPR_NUM; ++i)
+    {
+      for (j = 0; j < load_store_infos[i].length (); ++j)
+	{
+	  if (load_store_infos[i][j].post_type == NDS32_NONE)
+	    plus_infos[i].safe_push (load_store_infos[i][j]);
+	  else
+	    post_infos[i].safe_push (load_store_infos[i][j]);
+	}
+    }
+
+  for (i = 0; i < NDS32_GPR_NUM; ++i)
+    {
+      if (load_store_infos[i].length () <= 1)
+	{
+	  if (dump_file && load_store_infos[i].length () == 1)
+	    fprintf (dump_file,
+		     "Skip Chain for $r%d since chain size only 1\n",
+		     i);
+	  continue;
+	}
+
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "Chain for $r%d: (size = %u)\n",
+		   i, load_store_infos[i].length ());
+
+	  for (j = 0; j < load_store_infos[i].length (); ++j)
+	    {
+	      fprintf (dump_file,
+		       "regno = %d base_regno = %d "
+		       "offset = " HOST_WIDE_INT_PRINT_DEC " "
+		       "load_p = %d UID = %u place = %d\n",
+		       REGNO (load_store_infos[i][j].reg),
+		       REGNO (load_store_infos[i][j].base_reg),
+		       load_store_infos[i][j].offset,
+		       load_store_infos[i][j].load_p,
+		       INSN_UID (load_store_infos[i][j].insn),
+		       load_store_infos[i][j].place);
+	    }
+	}
+
+      nds32_get_available_reg_set (bb,
+				   load_store_infos[i][0].insn,
+				   load_store_infos[i].last ().insn,
+				   &available_regset);
+      if (dump_file)
+	print_hard_reg_set (dump_file, "", available_regset);
+
+      /* If rename_p is true, rename the registers of the load/store
+	 instructions.  Otherwise combine multiple load/store
+	 instructions into a multiple load/store instruction.  */
+      if (rename_p)
+	{
+          if (plus_infos[i].length () > 1)
+	    nds32_rename_load_store_reg (&plus_infos[i], &available_regset);
+          if (post_infos[i].length () > 1)
+	    nds32_rename_bi_insn (&post_infos[i], &available_regset);
+	}
+      else
+	{
+          if (plus_infos[i].length () > 1)
+	    nds32_combine_load_store_insn (&plus_infos[i], &available_regset);
+          if (post_infos[i].length () > 1)
+	    nds32_combine_bi_insn (&post_infos[i]);
+	}
+    }
+}
+
+static void
+nds32_lmwsmw_opt (bool rename_p)
+{
+  basic_block cur_bb;
+  /* Run the per-block worker on every basic block of the function.  */
+  FOR_EACH_BB_FN (cur_bb, cfun)
+    nds32_do_lmwsmw_opt (cur_bb, rename_p);
+}
+
+/* Pass body: rename registers of load and store instructions.  */
+static unsigned int
+rest_of_handle_rename_lmwsmw_opt (void)
+{
+  init_alias_analysis ();
+
+  df_set_flags (DF_LR_RUN_DCE);
+  df_note_add_problem ();
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  regrename_init (true);
+  regrename_analyze (NULL);
+
+  nds32_lmwsmw_opt (true);
+
+  regrename_finish ();
+
+  /* Alias analysis is no longer needed.  */
+  end_alias_analysis ();
+  return 1;
+}
+
+/* Pass body: combine load/store insns into lmw and smw instructions.  */
+static unsigned int
+rest_of_handle_gen_lmwsmw_opt (void)
+{
+  init_alias_analysis ();
+
+  df_note_add_problem ();
+  df_analyze ();
+  nds32_lmwsmw_opt (false);
+
+  /* Alias analysis is no longer needed.  */
+  end_alias_analysis ();
+  return 1;
+}
+
+/* Descriptor, pass class and factory of the register-renaming pass.  */
+const pass_data pass_data_nds32_rename_lmwsmw_opt =
+{
+  RTL_PASS,				/* type */
+  "rename_lmwsmw_opt",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,			/* todo_flags_finish */
+};
+
+class pass_nds32_rename_lmwsmw_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_rename_lmwsmw_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_rename_lmwsmw_opt, ctxt)
+  {}
+
+  /* opt_pass methods: the pass is gated by flag_nds32_lmwsmw_opt.  */
+  bool gate (function *) { return flag_nds32_lmwsmw_opt; }
+  unsigned int execute (function *) { return rest_of_handle_rename_lmwsmw_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_rename_lmwsmw_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_rename_lmwsmw_opt (ctxt);
+}
+
+/* Descriptor, pass class and factory of the lmw/smw generation pass.  */
+const pass_data pass_data_nds32_gen_lmwsmw_opt =
+{
+  RTL_PASS,				/* type */
+  "gen_lmwsmw_opt",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,			/* todo_flags_finish */
+};
+
+class pass_nds32_gen_lmwsmw_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_gen_lmwsmw_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_gen_lmwsmw_opt, ctxt)
+  {}
+  /* opt_pass methods: the pass is gated by flag_nds32_lmwsmw_opt.  */
+  bool gate (function *) { return flag_nds32_lmwsmw_opt; }
+  unsigned int execute (function *) { return rest_of_handle_gen_lmwsmw_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_gen_lmwsmw_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_gen_lmwsmw_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-load-store-opt.c b/gcc/config/nds32/nds32-load-store-opt.c
new file mode 100644
index 0000000..9e5161e
--- /dev/null
+++ b/gcc/config/nds32/nds32-load-store-opt.c
@@ -0,0 +1,721 @@
+/* load-store-opt pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "target-globals.h"
+#include "nds32-load-store-opt.h"
+#include "nds32-reg-utils.h"
+#include <set>
+
+#define NDS32_GPR_NUM 32
+
+static new_base_reg_info_t gen_new_base (rtx,
+					 offset_info_t,
+					 unsigned,
+					 HOST_WIDE_INT,
+					 HOST_WIDE_INT);
+
+static const load_store_optimize_pass *load_store_optimizes[] =
+{
+  /*    allow_regclass, new_base_regclass,
+	offset_lower_bound, offset_upper_bound,
+	load_only_p, name */
+  new load_store_optimize_pass (
+	LOW_REGS, LOW_REGS,
+	0, (32-4),
+	false, "lswi333"),
+  new load_store_optimize_pass (
+	LOW_REGS, FRAME_POINTER_REG,
+	0, (512-4),
+	false, "lswi37"),
+  new load_store_optimize_pass (
+	MIDDLE_REGS, GENERAL_REGS,
+	0, 0,
+	false, "lswi450"),
+  new load_store_optimize_pass (
+	MIDDLE_REGS, R8_REG,
+	-128, -4,
+	true, "lwi45fe")
+};
+
+static const int N_LOAD_STORE_OPT_TYPE = sizeof (load_store_optimizes)
+					 / sizeof (load_store_optimize_pass*);
+
+load_store_optimize_pass
+::load_store_optimize_pass (enum reg_class allow_regclass,
+			    enum reg_class new_base_regclass,
+			    HOST_WIDE_INT offset_lower_bound,
+			    HOST_WIDE_INT offset_upper_bound,
+			    bool load_only_p,
+			    const char *name)
+  : m_allow_regclass (allow_regclass),
+    m_new_base_regclass (new_base_regclass),
+    m_offset_lower_bound (offset_lower_bound),
+    m_offset_upper_bound (offset_upper_bound),
+    m_load_only_p (load_only_p),
+    m_name (name)
+{
+  gcc_assert (offset_lower_bound <= offset_upper_bound);
+}
+
+/* Return the estimated net gain of rewriting the chain LOAD_STORE_INFO with
+   this pass (insn gain minus new-base setup cost); <= 0 means give up.  */
+int
+load_store_optimize_pass::calc_gain (HARD_REG_SET *available_regset,
+				     offset_info_t offset_info,
+				     load_store_infos_t *load_store_info) const
+{
+  int extra_cost = 0;
+  int gain = 0;
+  HOST_WIDE_INT allow_range = m_offset_upper_bound - m_offset_lower_bound;
+  unsigned new_base_regnum
+    = find_available_reg (available_regset, m_new_base_regclass);
+  unsigned chain_size = load_store_info->length ();
+
+  if (new_base_regnum == INVALID_REGNUM)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "%s has no available register, so give up trying %s\n",
+		 reg_class_names[m_new_base_regclass],
+		 m_name);
+      return 0;
+    }
+  else if (dump_file)
+    fprintf (dump_file,
+	     "%s is available, get %s, try %s, chain size = %u\n",
+	     reg_class_names[m_new_base_regclass],
+	     reg_names[new_base_regnum],
+	     m_name,
+	     chain_size);
+
+  HOST_WIDE_INT range = offset_info.max_offset - offset_info.min_offset;
+
+  if (range > allow_range)
+    {
+      /* TODO: We can perform load-store opt for only part of load store.  */
+      if (dump_file)
+	fprintf (dump_file,
+		 "range is too large for %s"
+		 " (range = " HOST_WIDE_INT_PRINT_DEC ", "
+		 "allow_range = " HOST_WIDE_INT_PRINT_DEC ")\n",
+		 m_name, range, allow_range);
+      return 0;
+    }
+
+  if (offset_info.min_offset >= m_offset_lower_bound
+      && offset_info.max_offset <= m_offset_upper_bound)
+    {
+      /* mov55.  */
+      extra_cost = 2;
+    }
+  else
+    {
+      if (satisfies_constraint_Is15 (GEN_INT (offset_info.min_offset
+						   - m_offset_lower_bound)))
+	{
+	  /* add.  */
+	  extra_cost = 4;
+	}
+      else
+	{
+	  /* TODO: Try m_offset_upper_bound instead of m_offset_lower_bound
+		   again.  */
+	  /* add45 + movi.  */
+	  if (satisfies_constraint_Is20 (GEN_INT (offset_info.min_offset
+						  - m_offset_lower_bound)))
+	    extra_cost = 6;
+	  else
+	    return -1; /* Give up if this constant is too large.  */
+	}
+    }
+
+  for (unsigned i = 0; i < chain_size; ++i)
+    {
+      if (m_load_only_p && !(*load_store_info)[i].load_p)
+	continue;
+
+      if (in_reg_class_p ((*load_store_info)[i].reg, m_allow_regclass))
+	gain += 2;
+    }
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "%s: gain = %d extra_cost = %d\n",
+	     m_name, gain, extra_cost);
+
+  return gain - extra_cost;
+}
+
+
+void
+load_store_optimize_pass::do_optimize (
+  HARD_REG_SET *available_regset,
+  offset_info_t offset_info,
+  load_store_infos_t *load_store_info) const
+{
+  new_base_reg_info_t new_base_reg_info;
+  rtx load_store_insn;
+  unsigned new_base_regnum;
+
+  new_base_regnum  = find_available_reg (available_regset, m_new_base_regclass);
+  gcc_assert (new_base_regnum != INVALID_REGNUM);
+
+  new_base_reg_info =
+    gen_new_base ((*load_store_info)[0].base_reg,
+		  offset_info,
+		  new_base_regnum,
+		  m_offset_lower_bound, m_offset_upper_bound);
+  unsigned i;
+  rtx insn;
+  insn = emit_insn_before (new_base_reg_info.set_insns[0],
+			   (*load_store_info)[0].insn);
+  if (new_base_reg_info.n_set_insns > 1)
+    {
+      gcc_assert (new_base_reg_info.n_set_insns == 2);
+      emit_insn_before (new_base_reg_info.set_insns[1], insn);
+    }
+
+  for (i = 0; i < load_store_info->length (); ++i)
+    {
+      if (m_load_only_p && !(*load_store_info)[i].load_p)
+	continue;  /* This pass rewrites loads only.  */
+
+      if (!in_reg_class_p ((*load_store_info)[i].reg, m_allow_regclass))
+	continue;  /* Register not in the allowed class; keep the insn.  */
+
+      HOST_WIDE_INT offset = (*load_store_info)[i].offset;
+
+      if (new_base_reg_info.need_adjust_offset_p)
+	offset = offset + new_base_reg_info.adjust_offset;
+
+      load_store_insn =
+	gen_reg_plus_imm_load_store ((*load_store_info)[i].reg,
+				     new_base_reg_info.reg,
+				     offset,
+				     (*load_store_info)[i].load_p,
+				     (*load_store_info)[i].mem);
+
+      emit_insn_before (load_store_insn, (*load_store_info)[i].insn);
+
+      delete_insn ((*load_store_info)[i].insn);
+    }
+
+  /* Recompute the BB of each insn, to update the BB_END () instruction.  */
+  compute_bb_for_insn ();
+}
+
+static new_base_reg_info_t
+gen_new_base (rtx original_base_reg,
+	      offset_info_t offset_info,
+	      unsigned new_base_regno,
+	      HOST_WIDE_INT offset_lower,
+	      HOST_WIDE_INT offset_upper)
+{
+  new_base_reg_info_t new_base_reg_info;
+
+  /* Use gen_raw_REG instead of gen_rtx_REG to avoid breaking the reg
+     info of a global rtx.
+     For example, gen_rtx_REG would return frame_pointer_rtx directly,
+     whereas gen_raw_REG (Pmode, FP_REGNUM) creates a new rtx.  */
+  new_base_reg_info.reg = gen_raw_REG (Pmode, new_base_regno);
+
+  /* Setup register info.  */
+  ORIGINAL_REGNO (new_base_reg_info.reg) = ORIGINAL_REGNO (original_base_reg);
+  REG_ATTRS (new_base_reg_info.reg) = REG_ATTRS (original_base_reg);
+
+  if (offset_info.max_offset <= offset_upper
+      && offset_info.min_offset >= offset_lower)
+    {
+      new_base_reg_info.set_insns[0] = gen_movsi (new_base_reg_info.reg,
+						  original_base_reg);
+      new_base_reg_info.n_set_insns = 1;
+      new_base_reg_info.need_adjust_offset_p = false;
+      new_base_reg_info.adjust_offset = 0;
+    }
+  else
+    {
+      /* For example:
+	 lwi45.fe allow -128 ~ -4 range:
+	 offset_lower = #-128
+	 offset_upper = #-4
+
+	 lwi $r2, [$r12 + #10]
+	 ->
+	 addi $r8, $r12, #138       ! $r8 = $r12 + #10 - offset_lower
+				    ! = $r12 + #10 - #-128
+				    ! = $r12 + #138
+	 lwi45.fe $r2, [$r8 - #128] ! [$r8 - #128]
+				    ! = [$r12 + #138 - #128]
+				    ! = [$r12 + #10]
+      */
+      new_base_reg_info.adjust_offset =
+	-(offset_info.min_offset - offset_lower);
+
+      rtx offset = GEN_INT (-new_base_reg_info.adjust_offset);
+
+      /* A single add suffices when the displacement satisfies Is15.  */
+      if (satisfies_constraint_Is15 (offset))
+	{
+	  new_base_reg_info.set_insns[0] =
+	    gen_addsi3(new_base_reg_info.reg,
+		       original_base_reg,
+		       offset);
+
+	  new_base_reg_info.n_set_insns = 1;
+	}
+      else
+	{
+	  if (!satisfies_constraint_Is20 (offset))
+	    gcc_unreachable ();  /* calc_gain rejects such offsets.  */
+	  /* set_insns[1] is emitted ahead of set_insns[0] by do_optimize.  */
+	  new_base_reg_info.set_insns[1] =
+	    gen_rtx_SET (new_base_reg_info.reg,
+			 GEN_INT (-new_base_reg_info.adjust_offset));
+
+	  new_base_reg_info.set_insns[0] =
+	    gen_addsi3 (new_base_reg_info.reg,
+			new_base_reg_info.reg,
+			original_base_reg);
+
+	  new_base_reg_info.n_set_insns = 2;
+	}
+
+      new_base_reg_info.need_adjust_offset_p = true;
+    }
+
+  return new_base_reg_info;
+}
+
+/* Return true if INSN is an SImode register load/store through [reg + const]
+   that passes the checks below; fill in *LOAD_STORE_INFO if non-null.  */
+static bool
+nds32_4byte_load_store_reg_plus_offset (rtx_insn *insn,
+					load_store_info_t *load_store_info)
+{
+  if (!INSN_P (insn))
+    return false;
+
+  rtx pattern = PATTERN (insn);
+  rtx mem = NULL_RTX;
+  rtx reg = NULL_RTX;
+  rtx base_reg = NULL_RTX;
+  rtx addr;
+  HOST_WIDE_INT offset = 0;
+  bool load_p = false;
+
+  if (GET_CODE (pattern) != SET)
+    return false;
+
+  if (MEM_P (SET_SRC (pattern)))
+    {
+      mem = SET_SRC (pattern);
+      reg = SET_DEST (pattern);
+      load_p = true;
+    }
+
+  if (MEM_P (SET_DEST (pattern)))
+    {
+      mem = SET_DEST (pattern);
+      reg = SET_SRC (pattern);
+      load_p = false;
+    }
+
+  if (mem == NULL_RTX || reg == NULL_RTX || !REG_P (reg))
+    return false;
+
+  /* Reject non-SImode accesses before filling in any output.  */
+  if (GET_MODE (reg) != SImode)
+    return false;
+
+  addr = XEXP (mem, 0);
+
+  /* We only care about [reg] and [reg+const].  */
+  if (REG_P (addr))
+    {
+      base_reg = addr;
+      offset = 0;
+    }
+  else if (GET_CODE (addr) == PLUS
+	   && CONST_INT_P (XEXP (addr, 1)))
+    {
+      base_reg = XEXP (addr, 0);
+      offset = INTVAL (XEXP (addr, 1));
+      if (!REG_P (base_reg))
+	return false;
+    }
+  else
+    return false;
+
+  /* At least need MIDDLE_REGS.  */
+  if (!in_reg_class_p (reg, MIDDLE_REGS))
+    return false;
+
+  /* lwi450/swi450 */
+  if (offset == 0)
+    return false;
+
+  if (in_reg_class_p (reg, LOW_REGS))
+    {
+      /* lwi37.sp/swi37.sp/lwi37/swi37 */
+      if ((REGNO (base_reg) == SP_REGNUM
+	   || REGNO (base_reg) == FP_REGNUM)
+	  && (offset >= 0 && offset < 512 && (offset % 4 == 0)))
+	return false;
+
+      /* lwi333/swi333 */
+      if (in_reg_class_p (base_reg, LOW_REGS)
+	  && (offset >= 0 && offset < 32 && (offset % 4 == 0)))
+	return false;
+    }
+
+  if (load_store_info)
+    {
+      load_store_info->load_p   = load_p;
+      load_store_info->offset   = offset;
+      load_store_info->reg      = reg;
+      load_store_info->base_reg = base_reg;
+      load_store_info->insn     = insn;
+      load_store_info->mem      = mem;
+    }
+
+  return true;
+}
+
+static bool
+nds32_4byte_load_store_reg_plus_offset_p (rtx_insn *insn)
+{
+  return nds32_4byte_load_store_reg_plus_offset (insn, NULL);
+}
+
+/* Return true if BB holds at least two load/store candidates, i.e. enough
+   for a chain to be worth looking for.  */
+static bool
+nds32_load_store_opt_profitable_p (basic_block bb)
+{
+  const int min_candidates = 2;
+  int n_found = 0;
+  rtx_insn *insn;
+
+  if (dump_file)
+    fprintf (dump_file, "scan bb %d\n", bb->index);
+
+  FOR_BB_INSNS (bb, insn)
+    if (nds32_4byte_load_store_reg_plus_offset_p (insn))
+      n_found += 1;
+
+  if (dump_file)
+    fprintf (dump_file, " candidate = %d\n", n_found);
+
+  return n_found >= min_candidates;
+}
+
+/* Summarize the offsets used by the chain LOAD_STORE_INFO: the smallest and
+   largest offset and the number of distinct offsets.  An empty chain yields
+   an all-zero summary.  */
+static offset_info_t
+nds32_get_offset_info (auto_vec<load_store_info_t, 64> *load_store_info)
+{
+  offset_info_t summary;
+  std::set<HOST_WIDE_INT> distinct;
+  unsigned n_insns = load_store_info->length ();
+
+  summary.max_offset = 0;
+  summary.min_offset = 0;
+  summary.num_offset = 0;
+
+  for (unsigned idx = 0; idx < n_insns; idx++)
+    {
+      HOST_WIDE_INT cur = (*load_store_info)[idx].offset;
+
+      /* The first element seeds both bounds.  */
+      if (idx == 0 || cur > summary.max_offset)
+	summary.max_offset = cur;
+      if (idx == 0 || cur < summary.min_offset)
+	summary.min_offset = cur;
+      distinct.insert (cur);
+    }
+
+  summary.num_offset = distinct.size ();
+  return summary;
+}
+
+static void
+nds32_do_load_store_opt (basic_block bb)
+{
+  rtx_insn *insn;
+  load_store_info_t load_store_info;
+  auto_vec<load_store_info_t, 64> load_store_infos[NDS32_GPR_NUM];
+  HARD_REG_SET available_regset;
+  int i;
+  unsigned j;
+  unsigned regno;
+  unsigned polluting;
+  df_ref def;
+  /* Dirty means a register is defined again after
+     the first load/store instruction of its chain.
+     For example:
+
+     lwi $r2, [$r3 + #0x100]
+     mov $r3, $r4            ! $r3 is dirty after this instruction.
+     lwi $r1, [$r3 + #0x120] ! so this load can't chain with prev load.
+   */
+  bool dirty[NDS32_GPR_NUM];
+
+  if (dump_file)
+    fprintf (dump_file, "try load store opt for bb %d\n", bb->index);
+
+  for (i = 0; i < NDS32_GPR_NUM; ++i)
+    dirty[i] = false;
+
+  FOR_BB_INSNS (bb, insn)
+    {
+      if (!INSN_P (insn))
+	continue;
+
+      polluting = INVALID_REGNUM;
+
+      /* Mark each defined GPR dirty if its chain is not empty.  */
+      FOR_EACH_INSN_DEF (def, insn)
+	{
+	  regno = DF_REF_REGNO (def);
+
+	  if (!NDS32_IS_GPR_REGNUM (regno))
+	    continue;
+
+	  if (!load_store_infos[regno].is_empty ())
+	    {
+	      /* Set polluting here because the source register
+		 may be the same one.  */
+	      if (dirty[regno] == false)
+		polluting = regno;
+
+	      dirty[regno] = true;
+	    }
+	}
+
+      /* Mark every call-used register dirty if its chain is not empty.  */
+      if (CALL_P (insn))
+	{
+	  for (i = 0; i < NDS32_GPR_NUM; ++i)
+	    {
+	      if (call_used_regs[i] && !load_store_infos[i].is_empty ())
+		dirty[i] = true;
+	    }
+	}
+
+      if (nds32_4byte_load_store_reg_plus_offset (insn, &load_store_info))
+	{
+	  regno = REGNO (load_store_info.base_reg);
+	  gcc_assert (NDS32_IS_GPR_REGNUM (regno));
+
+	  /* Don't add to chain if this reg is dirty.  */
+	  if (dirty[regno] && polluting != regno)
+	    break;  /* NOTE(review): this ends the whole BB scan.  */
+
+	  /* If a load overwrites its own base register and that register
+	     is not dirty, we don't push it.  */
+	  if (regno == REGNO (load_store_info.reg) && load_store_info.load_p
+	      && dirty[regno] == false)
+	    continue;
+
+	  load_store_infos[regno].safe_push (load_store_info);
+	}
+    }
+  for (i = 0; i < NDS32_GPR_NUM; ++i)
+    {
+      if (load_store_infos[i].length () <= 1)
+	{
+	  if (dump_file && load_store_infos[i].length () == 1)
+	    fprintf (dump_file,
+		     "Skip Chain for $r%d since chain size only 1\n",
+		     i);
+	  continue;
+	}
+
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "Chain for $r%d: (size = %u)\n",
+		   i, load_store_infos[i].length ());
+
+	  for (j = 0; j < load_store_infos[i].length (); ++j)
+	    {
+	      fprintf (dump_file,
+		       "regno = %d base_regno = %d "
+		       "offset = " HOST_WIDE_INT_PRINT_DEC " "
+		       "load_p = %d UID = %u\n",
+		       REGNO (load_store_infos[i][j].reg),
+		       REGNO (load_store_infos[i][j].base_reg),
+		       load_store_infos[i][j].offset,
+		       load_store_infos[i][j].load_p,
+		       INSN_UID (load_store_infos[i][j].insn));
+	    }
+	}
+
+      nds32_get_available_reg_set (bb,
+				   load_store_infos[i][0].insn,
+				   load_store_infos[i].last ().insn,
+				   &available_regset);
+
+      if (dump_file)
+	{
+	  print_hard_reg_set (dump_file, "", available_regset);
+	}
+
+      offset_info_t offset_info = nds32_get_offset_info (&load_store_infos[i]);
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "max offset = " HOST_WIDE_INT_PRINT_DEC "\n"
+		   "min offset = " HOST_WIDE_INT_PRINT_DEC "\n"
+		   "num offset = %d\n",
+		   offset_info.max_offset,
+		   offset_info.min_offset,
+		   offset_info.num_offset);
+	}
+
+      int gain;
+      int best_gain = 0;
+      const load_store_optimize_pass *best_load_store_optimize_pass = NULL;
+
+      for (j = 0; j < N_LOAD_STORE_OPT_TYPE; ++j)
+	{
+	  gain = load_store_optimizes[j]->calc_gain (&available_regset,
+						     offset_info,
+						     &load_store_infos[i]);
+
+	  if (dump_file)
+	    fprintf (dump_file, "%s gain = %d\n",
+		     load_store_optimizes[j]->name (), gain);
+
+	  if (gain > best_gain)
+	    {
+	      best_gain = gain;
+	      best_load_store_optimize_pass = load_store_optimizes[j];
+	    }
+	}
+
+      if (best_load_store_optimize_pass)
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "%s is most profit, optimize it!\n",
+		     best_load_store_optimize_pass->name ());
+
+	  best_load_store_optimize_pass->do_optimize (&available_regset,
+						      offset_info,
+						      &load_store_infos[i]);
+
+	  df_insn_rescan_all ();
+	}
+
+    }
+}
+
+/* Pass entry point: refresh dataflow information, then try the load/store
+   optimization on every basic block that looks profitable.  */
+static unsigned int
+nds32_load_store_opt (void)
+{
+  basic_block bb;
+
+  df_set_flags (DF_LR_RUN_DCE);
+  df_note_add_problem ();
+  df_analyze ();
+
+  FOR_EACH_BB_FN (bb, cfun)
+    if (nds32_load_store_opt_profitable_p (bb))
+      nds32_do_load_store_opt (bb);
+
+  return 1;
+}
+
+const pass_data pass_data_nds32_load_store_opt =
+{
+  RTL_PASS,				/* type */
+  "load_store_opt",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,			/* todo_flags_finish */
+};
+
+class pass_nds32_load_store_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_load_store_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_load_store_opt, ctxt)
+  {}
+
+  /* opt_pass methods: runs only with both TARGET_16_BIT and the opt flag.  */
+  bool gate (function *) { return TARGET_16_BIT && TARGET_LOAD_STORE_OPT; }
+  unsigned int execute (function *) { return nds32_load_store_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_load_store_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_load_store_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-load-store-opt.h b/gcc/config/nds32/nds32-load-store-opt.h
new file mode 100644
index 0000000..f94b56a
--- /dev/null
+++ b/gcc/config/nds32/nds32-load-store-opt.h
@@ -0,0 +1,117 @@
+/* Prototypes for load-store-opt of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NDS32_LOAD_STORE_OPT_H
+#define NDS32_LOAD_STORE_OPT_H
+
+/* Post-increment/decrement kind stored in load_store_info_t::post_type.  */
+
+enum nds32_memory_post_type
+{
+  NDS32_NONE,
+  NDS32_POST_INC,
+  NDS32_POST_DEC
+};
+
+typedef struct {
+  rtx reg;
+  rtx base_reg;
+  rtx offset;
+  HOST_WIDE_INT shift;
+  bool load_p;
+  rtx insn;
+} rr_load_store_info_t;
+
+typedef struct {
+  rtx reg;
+  rtx base_reg;
+  HOST_WIDE_INT offset;
+  bool load_p;
+  rtx_insn *insn;
+  rtx mem;
+  int new_reg;
+  int order;
+  int group;
+  bool place;
+  enum nds32_memory_post_type post_type;
+} load_store_info_t;
+
+typedef struct {
+  HOST_WIDE_INT max_offset;
+  HOST_WIDE_INT min_offset;
+  /* How many different offset.  */
+  int num_offset;
+} offset_info_t;
+
+typedef struct {
+  rtx set_insns[2];
+  int n_set_insns;
+  rtx reg;
+  bool need_adjust_offset_p;
+  HOST_WIDE_INT adjust_offset;
+} new_base_reg_info_t;
+
+typedef struct {
+  unsigned int amount;
+  unsigned int start;
+  unsigned int end;
+} available_reg_info_t;
+
+typedef auto_vec<load_store_info_t, 64> load_store_infos_t;
+
+class load_store_optimize_pass
+{
+public:
+  load_store_optimize_pass (enum reg_class,	/* allow_regclass */
+			    enum reg_class,	/* new_base_regclass */
+			    HOST_WIDE_INT,	/* offset_lower_bound */
+			    HOST_WIDE_INT,	/* offset_upper_bound */
+			    bool,		/* load_only_p */
+			    const char *);	/* name */
+  const char *name () const { return m_name; };
+  int calc_gain (HARD_REG_SET *,
+		 offset_info_t,
+		 load_store_infos_t *) const;
+  void do_optimize (HARD_REG_SET *,
+		    offset_info_t,
+		    load_store_infos_t *) const;
+private:
+  enum reg_class m_allow_regclass;	/* Class the data register must be in.  */
+  enum reg_class m_new_base_regclass;	/* Class the new base is taken from.  */
+  HOST_WIDE_INT m_offset_lower_bound;	/* Smallest offset accepted.  */
+  HOST_WIDE_INT m_offset_upper_bound;	/* Largest offset accepted.  */
+  bool m_load_only_p;			/* True if stores are left alone.  */
+  const char *m_name;			/* Name printed in dump output.  */
+};
+
+/* Build a SImode move between REG and the memory at BASE_REG + OFFSET: a load
+   into REG when LOAD_P, otherwise a store of REG.  The new MEM copies its
+   attributes from OLDMEM.  */
+static inline rtx
+gen_reg_plus_imm_load_store (rtx reg, rtx base_reg,
+			     HOST_WIDE_INT offset, bool load_p, rtx oldmem)
+{
+  rtx new_mem = gen_rtx_MEM (SImode, plus_constant (Pmode, base_reg, offset));
+
+  MEM_COPY_ATTRIBUTES (new_mem, oldmem);
+  return load_p ? gen_movsi (reg, new_mem) : gen_movsi (new_mem, reg);
+}
+
+#endif /* ! NDS32_LOAD_STORE_OPT_H */
diff --git a/gcc/config/nds32/nds32-md-auxiliary.c b/gcc/config/nds32/nds32-md-auxiliary.c
index def8eda..3881df7 100644
--- a/gcc/config/nds32/nds32-md-auxiliary.c
+++ b/gcc/config/nds32/nds32-md-auxiliary.c
@@ -25,17 +25,74 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
-#include "tm_p.h"
-#include "optabs.h"		/* For GEN_FCN.  */
-#include "recog.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
 #include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
 #include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* This file is divided into three parts:
+
+     PART 1: Auxiliary static function definitions.
+
+     PART 2: Auxiliary function for expand RTL pattern.
+
+     PART 3: Auxiliary function for output asm template.  */

 /* ------------------------------------------------------------------------ */

+/* PART 1: Auxiliary static function definitions.  */
+
+static int
+nds32_regno_to_enable4 (unsigned regno)
+{
+  switch (regno)
+    {
+    case 28: /* $r28/fp */
+      return 0x8;
+    case 29: /* $r29/gp */
+      return 0x4;
+    case 30: /* $r30/lp */
+      return 0x2;
+    case 31: /* $r31/sp */
+      return 0x1;
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* A helper function to return character based on byte size.  */
 static char
 nds32_byte_to_size (int byte)
@@ -54,796 +111,3825 @@ nds32_byte_to_size (int byte)
     }
 }

-/* A helper function to return memory format.  */
-enum nds32_16bit_address_type
-nds32_mem_format (rtx op)
+static int
+nds32_inverse_cond_code (int code)
 {
-  machine_mode mode_test;
-  int val;
-  int regno;
-
-  if (!TARGET_16_BIT)
-    return ADDRESS_NOT_16BIT_FORMAT;
-
-  mode_test = GET_MODE (op);
-
-  op = XEXP (op, 0);
+  switch (code)
+    {
+      case NE:
+	return EQ;
+      case EQ:
+	return NE;
+      case GT:
+	return LE;
+      case LE:
+	return GT;
+      case GE:
+	return LT;
+      case LT:
+	return GE;
+      default:
+	gcc_unreachable ();
+    }
+}

-  /* 45 format.  */
-  if (GET_CODE (op) == REG && (mode_test == SImode))
-    return ADDRESS_REG;
+static const char *
+nds32_cond_code_str (int code)
+{
+  switch (code)
+    {
+      case NE:
+	return "ne";
+      case EQ:
+	return "eq";
+      case GT:
+	return "gt";
+      case LE:
+	return "le";
+      case GE:
+	return "ge";
+      case LT:
+	return "lt";
+      default:
+	gcc_unreachable ();
+    }
+}

-  /* 333 format for QI/HImode.  */
-  if (GET_CODE (op) == REG && (REGNO (op) < R8_REGNUM))
-    return ADDRESS_LO_REG_IMM3U;
+static void
+output_cond_branch (int code, const char *suffix, bool r5_p,
+		    bool long_jump_p, rtx *operands)
+{
+  char pattern[256];
+  const char *cond_code;
+  bool align_p = NDS32_ALIGN_P ();
+  const char *align = align_p ? "\t.align\t2\n" : "";

-  /* post_inc 333 format.  */
-  if ((GET_CODE (op) == POST_INC) && (mode_test == SImode))
+  if (r5_p && REGNO (operands[2]) == 5 && TARGET_16_BIT)
     {
-      regno = REGNO(XEXP (op, 0));
-
-      if (regno < 8)
-	return ADDRESS_POST_INC_LO_REG_IMM3U;
+      /* This is a special case for beqs38 and bnes38:
+	 operand 2 can't be $r5 and such a compare is almost meaningless,
+	 however it may occur after copy propagation.  */
+      if (code == EQ)
+	{
+	  /* $r5 == $r5 always taken! */
+	  if (long_jump_p)
+	    snprintf (pattern, sizeof (pattern),
+		      "j\t%%3");
+	  else
+	    snprintf (pattern, sizeof (pattern),
+		      "j8\t%%3");
+	}
+      else
+	/* Don't output anything since $r5 != $r5 never taken! */
+	pattern[0] = '\0';
     }
-
-  /* post_inc 333 format.  */
-  if ((GET_CODE (op) == POST_MODIFY)
-      && (mode_test == SImode)
-      && (REG_P (XEXP (XEXP (op, 1), 0)))
-      && (CONST_INT_P (XEXP (XEXP (op, 1), 1))))
+  else if (long_jump_p)
     {
-      regno = REGNO (XEXP (XEXP (op, 1), 0));
-      val = INTVAL (XEXP (XEXP (op, 1), 1));
-      if (regno < 8 && val < 32)
-	return ADDRESS_POST_INC_LO_REG_IMM3U;
+      int inverse_code = nds32_inverse_cond_code (code);
+      cond_code = nds32_cond_code_str (inverse_code);
+
+      /*      b<cond><suffix>  $r0, $r1, .L0
+	    =>
+	      b<inverse_cond><suffix>  $r0, $r1, .LCB0
+	      j  .L0
+	    .LCB0:
+
+	    or
+
+	      b<cond>s38  $r1, .L0
+	    =>
+	      b<inverse_cond>s38  $r1, .LCB0
+	      j  .L0
+	    .LCB0:
+      */
+      if (r5_p && TARGET_16_BIT)
+	{
+	  snprintf (pattern, sizeof (pattern),
+		    "b%ss38\t %%2, .LCB%%=\n\tj\t%%3\n%s.LCB%%=:",
+		    cond_code, align);
+	}
+      else
+	{
+	  snprintf (pattern, sizeof (pattern),
+		    "b%s%s\t%%1, %%2, .LCB%%=\n\tj\t%%3\n%s.LCB%%=:",
+		    cond_code, suffix, align);
+	}
     }
-
-  if ((GET_CODE (op) == PLUS)
-      && (GET_CODE (XEXP (op, 0)) == REG)
-      && (GET_CODE (XEXP (op, 1)) == CONST_INT))
+  else
     {
-      val = INTVAL (XEXP (op, 1));
-
-      regno = REGNO(XEXP (op, 0));
-
-      if (regno > 7
-	  && regno != SP_REGNUM
-	  && regno != FP_REGNUM)
-	return ADDRESS_NOT_16BIT_FORMAT;
-
-      switch (mode_test)
+      cond_code = nds32_cond_code_str (code);
+      if (r5_p && TARGET_16_BIT)
 	{
-	case QImode:
-	  /* 333 format.  */
-	  if (val >= 0 && val < 8 && regno < 8)
-	    return ADDRESS_LO_REG_IMM3U;
-	  break;
-
-	case HImode:
-	  /* 333 format.  */
-	  if (val >= 0 && val < 16 && (val % 2 == 0) && regno < 8)
-	    return ADDRESS_LO_REG_IMM3U;
-	  break;
-
-	case SImode:
-	case SFmode:
-	case DFmode:
-	  /* fp imply 37 format.  */
-	  if ((regno == FP_REGNUM) &&
-	      (val >= 0 && val < 512 && (val % 4 == 0)))
-	    return ADDRESS_FP_IMM7U;
-	  /* sp imply 37 format.  */
-	  else if ((regno == SP_REGNUM) &&
-		   (val >= 0 && val < 512 && (val % 4 == 0)))
-	    return ADDRESS_SP_IMM7U;
-	  /* 333 format.  */
-	  else if (val >= 0 && val < 32 && (val % 4 == 0) && regno < 8)
-	    return ADDRESS_LO_REG_IMM3U;
-	  break;
-
-	default:
-	  break;
+	  /* b<cond>s38  $r1, .L0   */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%ss38\t %%2, %%3", cond_code);
+	}
+      else
+	{
+	  /* b<cond><suffix>  $r0, $r1, .L0   */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%s%s\t%%1, %%2, %%3", cond_code, suffix);
 	}
     }

-  return ADDRESS_NOT_16BIT_FORMAT;
+  output_asm_insn (pattern, operands);
 }

-/* Output 16-bit store.  */
-const char *
-nds32_output_16bit_store (rtx *operands, int byte)
+static void
+output_cond_branch_compare_zero (int code, const char *suffix,
+				 bool long_jump_p, rtx *operands,
+				 bool ta_implied_p)
 {
-  char pattern[100];
-  char size;
-  rtx code = XEXP (operands[0], 0);
-
-  size = nds32_byte_to_size (byte);
+  char pattern[256];
+  const char *cond_code;
+  bool align_p = NDS32_ALIGN_P ();
+  const char *align = align_p ? "\t.align\t2\n" : "";
+  if (long_jump_p)
+    {
+      int inverse_code = nds32_inverse_cond_code (code);
+      cond_code = nds32_cond_code_str (inverse_code);

-  switch (nds32_mem_format (operands[0]))
+      if (ta_implied_p && TARGET_16_BIT)
+	{
+	  /*    b<cond>z<suffix>  .L0
+	      =>
+		b<inverse_cond>z<suffix>  .LCB0
+		j  .L0
+	      .LCB0:
+	   */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%sz%s\t.LCB%%=\n\tj\t%%2\n%s.LCB%%=:",
+		    cond_code, suffix, align);
+	}
+      else
+	{
+	  /*      b<cond>z<suffix>  $r0, .L0
+		=>
+		  b<inverse_cond>z<suffix>  $r0, .LCB0
+		  j  .L0
+		.LCB0:
+	   */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%sz%s\t%%1, .LCB%%=\n\tj\t%%2\n%s.LCB%%=:",
+		    cond_code, suffix, align);
+	}
+    }
+  else
     {
-    case ADDRESS_REG:
-      operands[0] = code;
-      output_asm_insn ("swi450\t%1, [%0]", operands);
-      break;
-    case ADDRESS_LO_REG_IMM3U:
-      snprintf (pattern, sizeof (pattern), "s%ci333\t%%1, %%0", size);
-      output_asm_insn (pattern, operands);
-      break;
-    case ADDRESS_POST_INC_LO_REG_IMM3U:
-      snprintf (pattern, sizeof (pattern), "s%ci333.bi\t%%1, %%0", size);
-      output_asm_insn (pattern, operands);
-      break;
-    case ADDRESS_FP_IMM7U:
-      output_asm_insn ("swi37\t%1, %0", operands);
-      break;
-    case ADDRESS_SP_IMM7U:
-      /* Get immediate value and set back to operands[1].  */
-      operands[0] = XEXP (code, 1);
-      output_asm_insn ("swi37.sp\t%1, [ + (%0)]", operands);
-      break;
-    default:
-      break;
+      cond_code = nds32_cond_code_str (code);
+      if (ta_implied_p && TARGET_16_BIT)
+	{
+	  /* b<cond>z<suffix>  .L0  */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%sz%s\t%%2", cond_code, suffix);
+	}
+      else
+	{
+	  /* b<cond>z<suffix>  $r0, .L0  */
+	  snprintf (pattern, sizeof (pattern),
+		    "b%sz%s\t%%1, %%2", cond_code, suffix);
+	}
     }

-  return "";
+  output_asm_insn (pattern, operands);
 }

-/* Output 16-bit load.  */
-const char *
-nds32_output_16bit_load (rtx *operands, int byte)
+static void
+nds32_split_shiftrtdi3 (rtx dst, rtx src, rtx shiftamount, bool logic_shift_p)
 {
-  char pattern[100];
-  unsigned char size;
-  rtx code = XEXP (operands[1], 0);
+  rtx src_high_part;
+  rtx dst_high_part, dst_low_part;

-  size = nds32_byte_to_size (byte);
+  dst_high_part = nds32_di_high_part_subreg (dst);
+  src_high_part = nds32_di_high_part_subreg (src);
+  dst_low_part = nds32_di_low_part_subreg (dst);

-  switch (nds32_mem_format (operands[1]))
+  if (CONST_INT_P (shiftamount))
     {
-    case ADDRESS_REG:
-      operands[1] = code;
-      output_asm_insn ("lwi450\t%0, [%1]", operands);
-      break;
-    case ADDRESS_LO_REG_IMM3U:
-      snprintf (pattern, sizeof (pattern), "l%ci333\t%%0, %%1", size);
-      output_asm_insn (pattern, operands);
-      break;
-    case ADDRESS_POST_INC_LO_REG_IMM3U:
-      snprintf (pattern, sizeof (pattern), "l%ci333.bi\t%%0, %%1", size);
-      output_asm_insn (pattern, operands);
-      break;
-    case ADDRESS_FP_IMM7U:
-      output_asm_insn ("lwi37\t%0, %1", operands);
-      break;
-    case ADDRESS_SP_IMM7U:
-      /* Get immediate value and set back to operands[0].  */
-      operands[1] = XEXP (code, 1);
-      output_asm_insn ("lwi37.sp\t%0, [ + (%1)]", operands);
-      break;
-    default:
-      break;
+      if (INTVAL (shiftamount) < 32)
+	{
+	  if (logic_shift_p)
+	    {
+	      emit_insn (gen_uwext (dst_low_part, src,
+						  shiftamount));
+	      emit_insn (gen_lshrsi3 (dst_high_part, src_high_part,
+						     shiftamount));
+	    }
+	  else
+	    {
+	      emit_insn (gen_wext (dst_low_part, src,
+						 shiftamount));
+	      emit_insn (gen_ashrsi3 (dst_high_part, src_high_part,
+						     shiftamount));
+	    }
+	}
+      else
+	{
+	  rtx new_shift_amout = gen_int_mode(INTVAL (shiftamount) - 32, SImode);
+
+	  if (logic_shift_p)
+	    {
+	      emit_insn (gen_lshrsi3 (dst_low_part, src_high_part,
+						    new_shift_amout));
+	      emit_move_insn (dst_high_part, const0_rtx);
+	    }
+	  else
+	    {
+	      emit_insn (gen_ashrsi3 (dst_low_part, src_high_part,
+						    new_shift_amout));
+	      emit_insn (gen_ashrsi3 (dst_high_part, src_high_part,
+						     GEN_INT (31)));
+	    }
+	}
     }
+  else
+    {
+      rtx dst_low_part_l32, dst_high_part_l32;
+      rtx dst_low_part_g32, dst_high_part_g32;
+      rtx new_shift_amout, select_reg;
+      dst_low_part_l32 = gen_reg_rtx (SImode);
+      dst_high_part_l32 = gen_reg_rtx (SImode);
+      dst_low_part_g32 = gen_reg_rtx (SImode);
+      dst_high_part_g32 = gen_reg_rtx (SImode);
+      new_shift_amout = gen_reg_rtx (SImode);
+      select_reg = gen_reg_rtx (SImode);
+
+      emit_insn (gen_andsi3 (shiftamount, shiftamount, GEN_INT (0x3f)));
+
+      if (logic_shift_p)
+	{
+	  /*
+	     if (shiftamount < 32)
+	       dst_low_part = wext (src, shiftamount)
+	       dst_high_part = src_high_part >> shiftamount
+	     else
+	       dst_low_part = src_high_part >> (shiftamount & 0x1f)
+	       dst_high_part = 0
+	  */
+	  emit_insn (gen_uwext (dst_low_part_l32, src, shiftamount));
+	  emit_insn (gen_lshrsi3 (dst_high_part_l32, src_high_part,
+						     shiftamount));
+
+	  emit_insn (gen_andsi3 (new_shift_amout, shiftamount, GEN_INT (0x1f)));
+	  emit_insn (gen_lshrsi3 (dst_low_part_g32, src_high_part,
+						    new_shift_amout));
+	  emit_move_insn (dst_high_part_g32, const0_rtx);
+	}
+      else
+	{
+	  /*
+	     if (shiftamount < 32)
+	       dst_low_part = wext (src, shiftamount)
+	       dst_high_part = src_high_part >> shiftamount
+	     else
+	       dst_low_part = src_high_part >> (shiftamount & 0x1f)
+	       # shift 31 for sign extend
+	       dst_high_part = src_high_part >> 31
+	  */
+	  emit_insn (gen_wext (dst_low_part_l32, src, shiftamount));
+	  emit_insn (gen_ashrsi3 (dst_high_part_l32, src_high_part,
+						     shiftamount));
+
+	  emit_insn (gen_andsi3 (new_shift_amout, shiftamount, GEN_INT (0x1f)));
+	  emit_insn (gen_ashrsi3 (dst_low_part_g32, src_high_part,
+						    new_shift_amout));
+	  emit_insn (gen_ashrsi3 (dst_high_part_g32, src_high_part,
+						     GEN_INT (31)));
+	}

-  return "";
+      emit_insn (gen_slt_compare (select_reg, shiftamount, GEN_INT (32)));
+
+      emit_insn (gen_cmovnsi (dst_low_part, select_reg,
+			      dst_low_part_l32, dst_low_part_g32));
+      emit_insn (gen_cmovnsi (dst_high_part, select_reg,
+			      dst_high_part_l32, dst_high_part_g32));
+  }
 }

-/* Output 32-bit store.  */
-const char *
-nds32_output_32bit_store (rtx *operands, int byte)
-{
-  char pattern[100];
-  unsigned char size;
-  rtx code = XEXP (operands[0], 0);
+/* ------------------------------------------------------------------------ */

-  size = nds32_byte_to_size (byte);
+/* PART 2: Auxiliary function for expand RTL pattern.  */

-  switch (GET_CODE (code))
+enum nds32_expand_result_type
+nds32_expand_cbranch (rtx *operands)
+{
+  rtx tmp_reg;
+  enum rtx_code code;
+
+  code = GET_CODE (operands[0]);
+
+  /* If operands[2] is (const_int 0),
+     we can use beqz,bnez,bgtz,bgez,bltz,or blez instructions.
+     So we have gcc generate original template rtx.  */
+  if (GET_CODE (operands[2]) == CONST_INT)
+    if (INTVAL (operands[2]) == 0)
+      if ((code != GTU)
+	  && (code != GEU)
+	  && (code != LTU)
+	  && (code != LEU))
+	return EXPAND_CREATE_TEMPLATE;
+
+  /* For other comparison, NDS32 ISA only has slt (Set-on-Less-Than)
+     behavior for the comparison, we might need to generate other
+     rtx patterns to achieve same semantic.  */
+  switch (code)
     {
-    case REG:
-      /* (mem (reg X))
-	 => access location by using register,
-	 use "sbi / shi / swi" */
-      snprintf (pattern, sizeof (pattern), "s%ci\t%%1, %%0", size);
-      break;
-
-    case SYMBOL_REF:
-    case CONST:
-      /* (mem (symbol_ref X))
-	 (mem (const (...)))
-	 => access global variables,
-	 use "sbi.gp / shi.gp / swi.gp" */
-      operands[0] = XEXP (operands[0], 0);
-      snprintf (pattern, sizeof (pattern), "s%ci.gp\t%%1, [ + %%0]", size);
-      break;
+    case GT:
+    case GTU:
+      if (GET_CODE (operands[2]) == CONST_INT)
+	{
+	  /* GT  reg_A, const_int  =>  !(LT  reg_A, const_int + 1) */
+	  if (optimize_size || optimize == 0)
+	    tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+	  else
+	    tmp_reg = gen_reg_rtx (SImode);
+
+	  /* We want to plus 1 into the integer value
+	     of operands[2] to create 'slt' instruction.
+	     This calculation is performed on the host machine,
+	     which may be 64-bit integer.
+	     So the meaning of calculation result may be
+	     different from the 32-bit nds32 target.
+
+	     For example:
+	       0x7fffffff + 0x1 -> 0x80000000,
+	       this value is POSITIVE on 64-bit machine,
+	       but the expected value on 32-bit nds32 target
+	       should be NEGATIVE value.
+
+	     Hence, instead of using GEN_INT(), we use gen_int_mode() to
+	     explicitly create SImode constant rtx.  */
+	  enum rtx_code cmp_code;
+
+	  rtx plus1 = gen_int_mode (INTVAL (operands[2]) + 1, SImode);
+	  if (satisfies_constraint_Is15 (plus1))
+	    {
+	      operands[2] = plus1;
+	      cmp_code = EQ;
+	      if (code == GT)
+		{
+		  /* GT, use slts instruction */
+		  emit_insn (
+		    gen_slts_compare (tmp_reg, operands[1], operands[2]));
+		}
+	      else
+		{
+		  /* GTU, use slt instruction */
+		  emit_insn (
+		    gen_slt_compare  (tmp_reg, operands[1], operands[2]));
+		}
+	    }
+	  else
+	    {
+	      cmp_code = NE;
+	      if (code == GT)
+		{
+		  /* GT, use slts instruction */
+		  emit_insn (
+		    gen_slts_compare (tmp_reg, operands[2], operands[1]));
+		}
+	      else
+		{
+		  /* GTU, use slt instruction */
+		  emit_insn (
+		    gen_slt_compare  (tmp_reg, operands[2], operands[1]));
+		}
+	    }
+
+	  PUT_CODE (operands[0], cmp_code);
+	  operands[1] = tmp_reg;
+	  operands[2] = const0_rtx;
+	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+
+	  return EXPAND_DONE;
+	}
+      else
+	{
+	  /* GT  reg_A, reg_B  =>  LT  reg_B, reg_A */
+	  if (optimize_size || optimize == 0)
+	    tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+	  else
+	    tmp_reg = gen_reg_rtx (SImode);
+
+	  if (code == GT)
+	    {
+	      /* GT, use slts instruction */
+	      emit_insn (gen_slts_compare (tmp_reg, operands[2], operands[1]));
+	    }
+	  else
+	    {
+	      /* GTU, use slt instruction */
+	      emit_insn (gen_slt_compare  (tmp_reg, operands[2], operands[1]));
+	    }
+
+	  PUT_CODE (operands[0], NE);
+	  operands[1] = tmp_reg;
+	  operands[2] = const0_rtx;
+	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+
+	  return EXPAND_DONE;
+	}

-    case POST_INC:
-      /* (mem (post_inc reg))
-	 => access location by using register which will be post increment,
-	 use "sbi.bi / shi.bi / swi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"s%ci.bi\t%%1, %%0, %d", size, byte);
-      break;
+    case GE:
+    case GEU:
+      /* GE  reg_A, reg_B      =>  !(LT  reg_A, reg_B) */
+      /* GE  reg_A, const_int  =>  !(LT  reg_A, const_int) */
+      if (optimize_size || optimize == 0)
+	tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+      else
+	tmp_reg = gen_reg_rtx (SImode);

-    case POST_DEC:
-      /* (mem (post_dec reg))
-	 => access location by using register which will be post decrement,
-	 use "sbi.bi / shi.bi / swi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"s%ci.bi\t%%1, %%0, -%d", size, byte);
-      break;
+      if (code == GE)
+	{
+	  /* GE, use slts instruction */
+	  emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
+	}
+      else
+	{
+	  /* GEU, use slt instruction */
+	  emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
+	}

-    case POST_MODIFY:
-      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+      PUT_CODE (operands[0], EQ);
+      operands[1] = tmp_reg;
+      operands[2] = const0_rtx;
+      emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				 operands[2], operands[3]));
+
+      return EXPAND_DONE;
+
+    case LT:
+    case LTU:
+      /* LT  reg_A, reg_B      =>  LT  reg_A, reg_B */
+      /* LT  reg_A, const_int  =>  LT  reg_A, const_int */
+      if (optimize_size || optimize == 0)
+	tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+      else
+	tmp_reg = gen_reg_rtx (SImode);
+
+      if (code == LT)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (post_modify (reg) (plus (reg) (reg))))
-	     => access location by using register which will be
-	     post modified with reg,
-	     use "sb.bi/ sh.bi / sw.bi" */
-	  snprintf (pattern, sizeof (pattern), "s%c.bi\t%%1, %%0", size);
-	  break;
-	case CONST_INT:
-	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
-	     => access location by using register which will be
-	     post modified with const_int,
-	     use "sbi.bi/ shi.bi / swi.bi" */
-	  snprintf (pattern, sizeof (pattern), "s%ci.bi\t%%1, %%0", size);
-	  break;
-	default:
-	  abort ();
+	  /* LT, use slts instruction */
+	  emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
+	}
+      else
+	{
+	  /* LTU, use slt instruction */
+	  emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
 	}
-      break;

-    case PLUS:
-      switch (GET_CODE (XEXP (code, 1)))
+      PUT_CODE (operands[0], NE);
+      operands[1] = tmp_reg;
+      operands[2] = const0_rtx;
+      emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				 operands[2], operands[3]));
+
+      return EXPAND_DONE;
+
+    case LE:
+    case LEU:
+      if (GET_CODE (operands[2]) == CONST_INT)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
-	     => access location by adding two registers,
-	     use "sb / sh / sw" */
-	  snprintf (pattern, sizeof (pattern), "s%c\t%%1, %%0", size);
-	  break;
-	case CONST_INT:
-	  /* (mem (plus reg const_int))
-	     => access location by adding one register with const_int,
-	     use "sbi / shi / swi" */
-	  snprintf (pattern, sizeof (pattern), "s%ci\t%%1, %%0", size);
-	  break;
-	default:
-	  abort ();
+	  /* LE  reg_A, const_int  =>  LT  reg_A, const_int + 1 */
+	  if (optimize_size || optimize == 0)
+	    tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+	  else
+	    tmp_reg = gen_reg_rtx (SImode);
+
+	  enum rtx_code cmp_code;
+	  /* Note that (le:SI X INT_MAX) is not the same as (lt:SI X INT_MIN).
+	     We better have an assert here in case GCC does not properly
+	     optimize it away.  The INT_MAX here is 0x7fffffff for target.  */
+	  rtx plus1 = gen_int_mode (INTVAL (operands[2]) + 1, SImode);
+	  if (satisfies_constraint_Is15 (plus1))
+	    {
+	      operands[2] = plus1;
+	      cmp_code = NE;
+	      if (code == LE)
+		{
+		  /* LE, use slts instruction */
+		  emit_insn (
+		    gen_slts_compare (tmp_reg, operands[1], operands[2]));
+		}
+	      else
+		{
+		  /* LEU, use slt instruction */
+		  emit_insn (
+		    gen_slt_compare  (tmp_reg, operands[1], operands[2]));
+		}
+	    }
+	  else
+	    {
+	      cmp_code = EQ;
+	      if (code == LE)
+		{
+		  /* LE, use slts instruction */
+		  emit_insn (
+		    gen_slts_compare (tmp_reg, operands[2], operands[1]));
+		}
+	      else
+		{
+		  /* LEU, use slt instruction */
+		  emit_insn (
+		    gen_slt_compare  (tmp_reg, operands[2], operands[1]));
+		}
+	    }
+
+	  PUT_CODE (operands[0], cmp_code);
+	  operands[1] = tmp_reg;
+	  operands[2] = const0_rtx;
+	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+
+	  return EXPAND_DONE;
+	}
+      else
+	{
+	  /* LE  reg_A, reg_B  =>  !(LT  reg_B, reg_A) */
+	  if (optimize_size || optimize == 0)
+	    tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
+	  else
+	    tmp_reg = gen_reg_rtx (SImode);
+
+	  if (code == LE)
+	    {
+	      /* LE, use slts instruction */
+	      emit_insn (gen_slts_compare (tmp_reg, operands[2], operands[1]));
+	    }
+	  else
+	    {
+	      /* LEU, use slt instruction */
+	      emit_insn (gen_slt_compare  (tmp_reg, operands[2], operands[1]));
+	    }
+
+	  PUT_CODE (operands[0], EQ);
+	  operands[1] = tmp_reg;
+	  operands[2] = const0_rtx;
+	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
+				     operands[2], operands[3]));
+
+	  return EXPAND_DONE;
 	}
-      break;

-    case LO_SUM:
-      operands[2] = XEXP (code, 1);
-      operands[0] = XEXP (code, 0);
-      snprintf (pattern, sizeof (pattern),
-		"s%ci\t%%1, [%%0 + lo12(%%2)]", size);
-      break;
+    case EQ:
+    case NE:
+      /* NDS32 ISA has various form for eq/ne behavior no matter
+	 what kind of the operand is.
+	 So just generate original template rtx.  */
+
+      /* Put operands[2] into register if operands[2] is a large
+	 const_int or ISAv2.  */
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (!satisfies_constraint_Is11 (operands[2])
+	      || TARGET_ISA_V2))
+	operands[2] = force_reg (SImode, operands[2]);
+
+      return EXPAND_CREATE_TEMPLATE;

     default:
-      abort ();
+      return EXPAND_FAIL;
     }
-
-  output_asm_insn (pattern, operands);
-  return "";
 }

-/* Output 32-bit load.  */
-const char *
-nds32_output_32bit_load (rtx *operands, int byte)
+enum nds32_expand_result_type
+nds32_expand_cstore (rtx *operands)
 {
-  char pattern[100];
-  unsigned char size;
-  rtx code;
-
-  code = XEXP (operands[1], 0);
+  rtx tmp_reg;
+  enum rtx_code code;

-  size = nds32_byte_to_size (byte);
+  code = GET_CODE (operands[1]);

-  switch (GET_CODE (code))
+  switch (code)
     {
-    case REG:
-      /* (mem (reg X))
-	 => access location by using register,
-	 use "lbi / lhi / lwi" */
-      snprintf (pattern, sizeof (pattern), "l%ci\t%%0, %%1", size);
-      break;
-
-    case SYMBOL_REF:
-    case CONST:
-      /* (mem (symbol_ref X))
-	 (mem (const (...)))
-	 => access global variables,
-	 use "lbi.gp / lhi.gp / lwi.gp" */
-      operands[1] = XEXP (operands[1], 0);
-      snprintf (pattern, sizeof (pattern), "l%ci.gp\t%%0, [ + %%1]", size);
-      break;
+    case EQ:
+    case NE:
+      if (GET_CODE (operands[3]) == CONST_INT)
+	{
+	  /* reg_R = (reg_A == const_int_B)
+	     --> xori reg_C, reg_A, const_int_B
+		 slti reg_R, reg_C, const_int_1
+	     reg_R = (reg_A != const_int_B)
+	     --> xori reg_C, reg_A, const_int_B
+		 slti reg_R, const_int0, reg_C */
+	  tmp_reg = gen_reg_rtx (SImode);
+
+	  /* If the integer value is not in the range of imm15s,
+	     we need to force register first because our addsi3 pattern
+	     only accept nds32_rimm15s_operand predicate.  */
+	  rtx new_imm = gen_int_mode (-INTVAL (operands[3]), SImode);
+	  if (satisfies_constraint_Is15 (new_imm))
+	    emit_insn (gen_addsi3 (tmp_reg, operands[2], new_imm));
+	  else
+	    {
+	      if (!(satisfies_constraint_Iu15 (operands[3])
+		    || (TARGET_EXT_PERF
+			&& satisfies_constraint_It15 (operands[3]))))
+		operands[3] = force_reg (SImode, operands[3]);
+	      emit_insn (gen_xorsi3 (tmp_reg, operands[2], operands[3]));
+	    }
+
+	  if (code == EQ)
+	    emit_insn (gen_slt_eq0 (operands[0], tmp_reg));
+	  else
+	    emit_insn (gen_slt_compare (operands[0], const0_rtx, tmp_reg));
+
+	  return EXPAND_DONE;
+	}
+      else
+	{
+	  /* reg_R = (reg_A == reg_B)
+	     --> xor  reg_C, reg_A, reg_B
+		 slti reg_R, reg_C, const_int_1
+	     reg_R = (reg_A != reg_B)
+	     --> xor  reg_C, reg_A, reg_B
+		 slti reg_R, const_int0, reg_C */
+	  tmp_reg = gen_reg_rtx (SImode);
+	  emit_insn (gen_xorsi3 (tmp_reg, operands[2], operands[3]));
+	  if (code == EQ)
+	    emit_insn (gen_slt_eq0 (operands[0], tmp_reg));
+	  else
+	    emit_insn (gen_slt_compare (operands[0], const0_rtx, tmp_reg));
+
+	  return EXPAND_DONE;
+	}
+    case GT:
+    case GTU:
+      /* reg_R = (reg_A > reg_B)       --> slt reg_R, reg_B, reg_A */
+      /* reg_R = (reg_A > const_int_B) --> slt reg_R, const_int_B, reg_A */
+      if (code == GT)
+	{
+	  /* GT, use slts instruction */
+	  emit_insn (gen_slts_compare (operands[0], operands[3], operands[2]));
+	}
+      else
+	{
+	  /* GTU, use slt instruction */
+	  emit_insn (gen_slt_compare  (operands[0], operands[3], operands[2]));
+	}

-    case POST_INC:
-      /* (mem (post_inc reg))
-	 => access location by using register which will be post increment,
-	 use "lbi.bi / lhi.bi / lwi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"l%ci.bi\t%%0, %%1, %d", size, byte);
-      break;
+      return EXPAND_DONE;

-    case POST_DEC:
-      /* (mem (post_dec reg))
-	 => access location by using register which will be post decrement,
-	 use "lbi.bi / lhi.bi / lwi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"l%ci.bi\t%%0, %%1, -%d", size, byte);
-      break;
+    case GE:
+    case GEU:
+      if (GET_CODE (operands[3]) == CONST_INT)
+	{
+	  /* reg_R = (reg_A >= const_int_B)
+	     --> movi reg_C, const_int_B - 1
+		 slt  reg_R, reg_C, reg_A */
+	  tmp_reg = gen_reg_rtx (SImode);
+
+	  emit_insn (gen_movsi (tmp_reg,
+				gen_int_mode (INTVAL (operands[3]) - 1,
+					      SImode)));
+	  if (code == GE)
+	    {
+	      /* GE, use slts instruction */
+	      emit_insn (gen_slts_compare (operands[0], tmp_reg, operands[2]));
+	    }
+	  else
+	    {
+	      /* GEU, use slt instruction */
+	      emit_insn (gen_slt_compare  (operands[0], tmp_reg, operands[2]));
+	    }
+
+	  return EXPAND_DONE;
+	}
+      else
+	{
+	  /* reg_R = (reg_A >= reg_B)
+	     --> slt  reg_R, reg_A, reg_B
+		 xori reg_R, reg_R, const_int_1 */
+	  if (code == GE)
+	    {
+	      /* GE, use slts instruction */
+	      emit_insn (gen_slts_compare (operands[0],
+					   operands[2], operands[3]));
+	    }
+	  else
+	    {
+	      /* GEU, use slt instruction */
+	      emit_insn (gen_slt_compare  (operands[0],
+					   operands[2], operands[3]));
+	    }
+
+	  /* perform 'not' behavior */
+	  emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+
+	  return EXPAND_DONE;
+	}

-    case POST_MODIFY:
-      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+    case LT:
+    case LTU:
+      /* reg_R = (reg_A < reg_B)       --> slt reg_R, reg_A, reg_B */
+      /* reg_R = (reg_A < const_int_B) --> slt reg_R, reg_A, const_int_B */
+      if (code == LT)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (post_modify (reg) (plus (reg) (reg))))
-	     => access location by using register which will be
-	     post modified with reg,
-	     use "lb.bi/ lh.bi / lw.bi" */
-	  snprintf (pattern, sizeof (pattern), "l%c.bi\t%%0, %%1", size);
-	  break;
-	case CONST_INT:
-	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
-	     => access location by using register which will be
-	     post modified with const_int,
-	     use "lbi.bi/ lhi.bi / lwi.bi" */
-	  snprintf (pattern, sizeof (pattern), "l%ci.bi\t%%0, %%1", size);
-	  break;
-	default:
-	  abort ();
+	  /* LT, use slts instruction */
+	  emit_insn (gen_slts_compare (operands[0], operands[2], operands[3]));
+	}
+      else
+	{
+	  /* LTU, use slt instruction */
+	  emit_insn (gen_slt_compare  (operands[0], operands[2], operands[3]));
 	}
-      break;

-    case PLUS:
-      switch (GET_CODE (XEXP (code, 1)))
+      return EXPAND_DONE;
+
+    case LE:
+    case LEU:
+      if (GET_CODE (operands[3]) == CONST_INT)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
-	     use "lb / lh / lw" */
-	  snprintf (pattern, sizeof (pattern), "l%c\t%%0, %%1", size);
-	  break;
-	case CONST_INT:
-	  /* (mem (plus reg const_int))
-	     => access location by adding one register with const_int,
-	     use "lbi / lhi / lwi" */
-	  snprintf (pattern, sizeof (pattern), "l%ci\t%%0, %%1", size);
-	  break;
-	default:
-	  abort ();
+	  /* reg_R = (reg_A <= const_int_B)
+	     --> movi reg_C, const_int_B + 1
+		 slt  reg_R, reg_A, reg_C */
+	  tmp_reg = gen_reg_rtx (SImode);
+
+	  emit_insn (gen_movsi (tmp_reg,
+				gen_int_mode (INTVAL (operands[3]) + 1,
+						      SImode)));
+	  if (code == LE)
+	    {
+	      /* LE, use slts instruction */
+	      emit_insn (gen_slts_compare (operands[0], operands[2], tmp_reg));
+	    }
+	  else
+	    {
+	      /* LEU, use slt instruction */
+	      emit_insn (gen_slt_compare  (operands[0], operands[2], tmp_reg));
+	    }
+
+	  return EXPAND_DONE;
+	}
+      else
+	{
+	  /* reg_R = (reg_A <= reg_B) --> slt  reg_R, reg_B, reg_A
+					  xori reg_R, reg_R, const_int_1 */
+	  if (code == LE)
+	    {
+	      /* LE, use slts instruction */
+	      emit_insn (gen_slts_compare (operands[0],
+					   operands[3], operands[2]));
+	    }
+	  else
+	    {
+	      /* LEU, use slt instruction */
+	      emit_insn (gen_slt_compare  (operands[0],
+					   operands[3], operands[2]));
+	    }
+
+	  /* perform 'not' behavior */
+	  emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+
+	  return EXPAND_DONE;
 	}
-      break;

-    case LO_SUM:
-      operands[2] = XEXP (code, 1);
-      operands[1] = XEXP (code, 0);
-      snprintf (pattern, sizeof (pattern),
-		"l%ci\t%%0, [%%1 + lo12(%%2)]", size);
-      break;

     default:
-      abort ();
+      gcc_unreachable ();
     }
-
-  output_asm_insn (pattern, operands);
-  return "";
 }

-/* Output 32-bit load with signed extension.  */
-const char *
-nds32_output_32bit_load_s (rtx *operands, int byte)
+void
+nds32_expand_float_cbranch (rtx *operands)
 {
-  char pattern[100];
-  unsigned char size;
-  rtx code;
+  enum rtx_code code = GET_CODE (operands[0]);
+  enum rtx_code new_code = code;
+  rtx cmp_op0 = operands[1];
+  rtx cmp_op1 = operands[2];
+  rtx tmp_reg;
+  rtx tmp;

-  code = XEXP (operands[1], 0);
+  int reverse = 0;

-  size = nds32_byte_to_size (byte);
+  /* Main Goal: Use compare instruction + branch instruction.

-  switch (GET_CODE (code))
+     For example:
+     GT, GE: swap condition and swap operands and generate
+     compare instruction(LT, LE) + branch not equal instruction.
+
+     UNORDERED, LT, LE, EQ: no need to change and generate
+     compare instruction(UNORDERED, LT, LE, EQ) + branch not equal instruction.
+
+     ORDERED, NE: reverse condition and generate
+     compare instruction(UNORDERED, EQ) + branch equal instruction. */
+
+  switch (code)
     {
-    case REG:
-      /* (mem (reg X))
-         => access location by using register,
-         use "lbsi / lhsi" */
-      snprintf (pattern, sizeof (pattern), "l%csi\t%%0, %%1", size);
+    case GT:
+    case GE:
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 = tmp;
+      new_code = swap_condition (new_code);
       break;
-
-    case SYMBOL_REF:
-    case CONST:
-      /* (mem (symbol_ref X))
-         (mem (const (...)))
-         => access global variables,
-         use "lbsi.gp / lhsi.gp" */
-      operands[1] = XEXP (operands[1], 0);
-      snprintf (pattern, sizeof (pattern), "l%csi.gp\t%%0, [ + %%1]", size);
+    case UNORDERED:
+    case LT:
+    case LE:
+    case EQ:
       break;
-
-    case POST_INC:
-      /* (mem (post_inc reg))
-         => access location by using register which will be post increment,
-         use "lbsi.bi / lhsi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"l%csi.bi\t%%0, %%1, %d", size, byte);
+    case ORDERED:
+    case NE:
+      new_code = reverse_condition (new_code);
+      reverse = 1;
+      break;
+    case UNGT:
+    case UNGE:
+      new_code = reverse_condition_maybe_unordered (new_code);
+      reverse = 1;
       break;
+    case UNLT:
+    case UNLE:
+      new_code = reverse_condition_maybe_unordered (new_code);
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 = tmp;
+      new_code = swap_condition (new_code);
+      reverse = 1;
+      break;
+    default:
+      return;
+    }

-    case POST_DEC:
-      /* (mem (post_dec reg))
-         => access location by using register which will be post decrement,
-         use "lbsi.bi / lhsi.bi" */
-      snprintf (pattern, sizeof (pattern),
-		"l%csi.bi\t%%0, %%1, -%d", size, byte);
+  tmp_reg = gen_reg_rtx (SImode);
+  emit_insn (gen_rtx_SET (tmp_reg,
+			  gen_rtx_fmt_ee (new_code, SImode,
+					  cmp_op0, cmp_op1)));
+
+  PUT_CODE (operands[0], reverse ? EQ : NE);
+  emit_insn (gen_cbranchsi4 (operands[0], tmp_reg,
+			     const0_rtx, operands[3]));
+}
+
+void
+nds32_expand_float_cstore (rtx *operands)
+{
+  enum rtx_code code = GET_CODE (operands[1]);
+  enum rtx_code new_code = code;
+  enum machine_mode mode = GET_MODE (operands[2]);
+
+  rtx cmp_op0 = operands[2];
+  rtx cmp_op1 = operands[3];
+  rtx tmp;
+
+  /* Main Goal: Use compare instruction to store value.
+
+     For example:
+     GT, GE: swap condition and swap operands.
+       reg_R = (reg_A >  reg_B) --> fcmplt reg_R, reg_B, reg_A
+       reg_R = (reg_A >= reg_B) --> fcmple reg_R, reg_B, reg_A
+
+     LT, LE, EQ: no need to change, it is already LT, LE, EQ.
+       reg_R = (reg_A <  reg_B) --> fcmplt reg_R, reg_A, reg_B
+       reg_R = (reg_A <= reg_B) --> fcmple reg_R, reg_A, reg_B
+       reg_R = (reg_A == reg_B) --> fcmpeq reg_R, reg_A, reg_B
+
+     ORDERED: reverse condition and using xor instruction to achieve 'ORDERED'.
+       reg_R = ordered (reg_A, reg_B) --> fcmpun reg_R, reg_A, reg_B
+					     xor reg_R, reg_R, const1_rtx
+
+     NE: reverse condition and using xor instruction to achieve 'NE'.
+       reg_R = (reg_A != reg_B) --> fcmpeq reg_R, reg_A, reg_B
+				       xor reg_R, reg_R, const1_rtx */
+  switch (code)
+    {
+    case GT:
+    case GE:
+      tmp = cmp_op0;
+      cmp_op0 = cmp_op1;
+      cmp_op1 =tmp;
+      new_code = swap_condition (new_code);
       break;
+    case UNORDERED:
+    case LT:
+    case LE:
+    case EQ:
+      break;
+    case ORDERED:
+      if (mode == SFmode)
+	emit_insn (gen_cmpsf_un (operands[0], cmp_op0, cmp_op1));
+      else
+	emit_insn (gen_cmpdf_un (operands[0], cmp_op0, cmp_op1));

-    case POST_MODIFY:
-      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+      return;
+    case NE:
+      if (mode == SFmode)
+	emit_insn (gen_cmpsf_eq (operands[0], cmp_op0, cmp_op1));
+      else
+	emit_insn (gen_cmpdf_eq (operands[0], cmp_op0, cmp_op1));
+
+      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+      return;
+    default:
+      return;
+    }
+
+  emit_insn (gen_rtx_SET (operands[0],
+			  gen_rtx_fmt_ee (new_code, SImode,
+					  cmp_op0, cmp_op1)));
+}
+
+enum nds32_expand_result_type
+nds32_expand_movcc (rtx *operands)
+{
+  enum rtx_code code = GET_CODE (operands[1]);
+  enum rtx_code new_code = code;
+  enum machine_mode cmp0_mode = GET_MODE (XEXP (operands[1], 0));
+  rtx cmp_op0 = XEXP (operands[1], 0);
+  rtx cmp_op1 = XEXP (operands[1], 1);
+  rtx tmp;
+
+  if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
+      && XEXP (operands[1], 1) == const0_rtx)
+    {
+      /* If the operands[1] rtx is already (eq X 0) or (ne X 0),
+	 we have gcc generate original template rtx.  */
+      return EXPAND_CREATE_TEMPLATE;
+    }
+  else if ((TARGET_FPU_SINGLE && cmp0_mode == SFmode)
+	   || (TARGET_FPU_DOUBLE && cmp0_mode == DFmode))
+    {
+      nds32_expand_float_movcc (operands);
+    }
+  else
+    {
+      /* Since there is only 'slt'(Set when Less Than) instruction for
+	 comparison in Andes ISA, the major strategy we use here is to
+	 convert conditional move into 'LT + EQ' or 'LT + NE' rtx combination.
+	 We design constraints properly so that the reload phase will assist
+	 to make one source operand to use same register as result operand.
+	 Then we can use cmovz/cmovn to catch the other source operand
+	 which has different register.  */
+      int reverse = 0;
+
+      /* Main Goal: Use 'LT + EQ' or 'LT + NE' to target "then" part
+	 Strategy : Reverse condition and swap comparison operands
+
+	 For example:
+
+	     a <= b ? P : Q   (LE or LEU)
+	 --> a >  b ? Q : P   (reverse condition)
+	 --> b <  a ? Q : P   (swap comparison operands to achieve 'LT/LTU')
+
+	     a >= b ? P : Q   (GE or GEU)
+	 --> a <  b ? Q : P   (reverse condition to achieve 'LT/LTU')
+
+	     a <  b ? P : Q   (LT or LTU)
+	 --> (NO NEED TO CHANGE, it is already 'LT/LTU')
+
+	     a >  b ? P : Q   (GT or GTU)
+	 --> b <  a ? P : Q   (swap comparison operands to achieve 'LT/LTU') */
+      switch (code)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (post_modify (reg) (plus (reg) (reg))))
-	     => access location by using register which will be
-	     post modified with reg,
-	     use "lbs.bi/ lhs.bi" */
-	  snprintf (pattern, sizeof (pattern), "l%cs.bi\t%%0, %%1", size);
+	case GE: case GEU: case LE: case LEU:
+	  new_code = reverse_condition (code);
+	  reverse = 1;
 	  break;
-	case CONST_INT:
-	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
-	     => access location by using register which will be
-	     post modified with const_int,
-	     use "lbsi.bi/ lhsi.bi" */
-	  snprintf (pattern, sizeof (pattern), "l%csi.bi\t%%0, %%1", size);
+	case EQ:
+	case NE:
+	  /* no need to reverse condition */
 	  break;
 	default:
-	  abort ();
+	  return EXPAND_FAIL;
 	}
-      break;

-    case PLUS:
-      switch (GET_CODE (XEXP (code, 1)))
+      /* For '>' comparison operator, we swap operands
+	 so that we can have 'LT/LTU' operator.  */
+      if (new_code == GT || new_code == GTU)
 	{
-	case REG:
-	case SUBREG:
-	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
-	     use "lbs / lhs" */
-	  snprintf (pattern, sizeof (pattern), "l%cs\t%%0, %%1", size);
+	  tmp     = cmp_op0;
+	  cmp_op0 = cmp_op1;
+	  cmp_op1 = tmp;
+
+	  new_code = swap_condition (new_code);
+	}
+
+      /* Use a temporary register to store slt/slts result.  */
+      tmp = gen_reg_rtx (SImode);
+
+      if (new_code == EQ || new_code == NE)
+	{
+	  emit_insn (gen_xorsi3 (tmp, cmp_op0, cmp_op1));
+	  /* tmp == 0 if cmp_op0 == cmp_op1.  */
+	  operands[1] = gen_rtx_fmt_ee (new_code, VOIDmode, tmp, const0_rtx);
+	}
+      else
+	{
+	  /* This emit_insn will create corresponding 'slt/slts'
+	      instruction.  */
+	  if (new_code == LT)
+	    emit_insn (gen_slts_compare (tmp, cmp_op0, cmp_op1));
+	  else if (new_code == LTU)
+	    emit_insn (gen_slt_compare (tmp, cmp_op0, cmp_op1));
+	  else
+	    gcc_unreachable ();
+
+	  /* Change comparison semantic into (eq X 0) or (ne X 0) behavior
+	     so that cmovz or cmovn will be matched later.
+
+	     For reverse condition cases, we want to create a semantic that:
+	       (eq X 0) --> pick up "else" part
+	     For normal cases, we want to create a semantic that:
+	       (ne X 0) --> pick up "then" part
+
+	     Later we will have cmovz/cmovn instruction pattern to
+	     match corresponding behavior and output instruction.  */
+	  operands[1] = gen_rtx_fmt_ee (reverse ? EQ : NE,
+					VOIDmode, tmp, const0_rtx);
+	}
+    }
+  return EXPAND_CREATE_TEMPLATE;
+}
+
+void
+nds32_expand_float_movcc (rtx *operands)
+{
+  if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
+      && GET_MODE (XEXP (operands[1], 0)) == SImode
+      && XEXP (operands[1], 1) == const0_rtx)
+    {
+      /* If the operands[1] rtx is already (eq X 0) or (ne X 0),
+	 we have gcc generate original template rtx.  */
+      return;
+    }
+  else
+    {
+      enum rtx_code code = GET_CODE (operands[1]);
+      enum rtx_code new_code = code;
+      enum machine_mode cmp0_mode = GET_MODE (XEXP (operands[1], 0));
+      enum machine_mode cmp1_mode = GET_MODE (XEXP (operands[1], 1));
+      rtx cmp_op0 = XEXP (operands[1], 0);
+      rtx cmp_op1 = XEXP (operands[1], 1);
+      rtx tmp;
+
+      /* The compare instruction computes (cmp_op0 condition cmp_op1) ? 1 : 0.
+	 'reverse' is set to 1 when the move must test (eq tmp 0) instead.  */
+      int reverse = 0;
+
+      /* Main Goal: Use compare instruction + conditional move instruction.
+	 Strategy : swap condition and swap comparison operands.
+
+	 For example:
+	     a > b ? P : Q   (GT)
+	 --> swap comparison operands and swap the condition
+	 --> b < a ? P : Q   (now it is 'LT')
+
+	     a >= b ? P : Q  (GE)
+	 --> swap comparison operands and swap the condition
+	 --> b <= a ? P : Q  (now it is 'LE')
+
+	     a <  b ? P : Q  (LT)
+	 --> (NO NEED TO CHANGE, it is already 'LT')
+
+	     a <= b ? P : Q  (LE)
+	 --> (NO NEED TO CHANGE, it is already 'LE')
+
+	     a == b ? P : Q  (EQ)
+	 --> (NO NEED TO CHANGE, it is already 'EQ') */
+
+      switch (code)
+	{
+	case GT:
+	case GE:
+	  tmp = cmp_op0;
+	  cmp_op0 = cmp_op1;
+	  cmp_op1 =tmp;
+	  new_code = swap_condition (new_code);
 	  break;
-	case CONST_INT:
-	  /* (mem (plus reg const_int))
-	     => access location by adding one register with const_int,
-	     use "lbsi / lhsi" */
-	  snprintf (pattern, sizeof (pattern), "l%csi\t%%0, %%1", size);
+	case UNORDERED:
+	case LT:
+	case LE:
+	case EQ:
+	  break;
+	case ORDERED:
+	case NE:
+	  reverse = 1;
+	  new_code = reverse_condition (new_code);
+	  break;
+	case UNGT:
+	case UNGE:
+	  new_code = reverse_condition_maybe_unordered (new_code);
+	  reverse = 1;
+	  break;
+	case UNLT:
+	case UNLE:
+	  new_code = reverse_condition_maybe_unordered (new_code);
+	  tmp = cmp_op0;
+	  cmp_op0 = cmp_op1;
+	  cmp_op1 = tmp;
+	  new_code = swap_condition (new_code);
+	  reverse = 1;
 	  break;
 	default:
-	  abort ();
+	  return;
 	}
-      break;

-    case LO_SUM:
-      operands[2] = XEXP (code, 1);
-      operands[1] = XEXP (code, 0);
-      snprintf (pattern, sizeof (pattern),
-		"l%csi\t%%0, [%%1 + lo12(%%2)]", size);
-      break;
+      /* Use a temporary register to store fcmpxxs result.  */
+      tmp = gen_reg_rtx (SImode);
+
+      /* Emit a float compare directly for SFmode and DFmode operands;
+	 other modes go through cstoresi4 to create the compare.  */
+      if ((cmp0_mode == DFmode || cmp0_mode == SFmode)
+	  && (cmp1_mode == DFmode || cmp1_mode == SFmode))
+	{
+	  /* This emit_insn creates the float compare instruction.  */
+	  emit_insn (gen_rtx_SET (tmp,
+				  gen_rtx_fmt_ee (new_code, SImode,
+						  cmp_op0, cmp_op1)));
+	}
+      else
+	{
+	  /* Use cstoresi4 to create the corresponding
+	     compare instruction.  */
+	  PUT_CODE (operands[1], new_code);
+	  emit_insn (gen_cstoresi4 (tmp, operands[1],
+				    cmp_op0, cmp_op1));
+	}
+      /* Rewrite operands[1] as (eq tmp 0) or (ne tmp 0) for the
+	 fcmovzs and fcmovns conditional move instructions.  */
+      operands[1] = gen_rtx_fmt_ee (reverse ? EQ : NE,
+				    VOIDmode, tmp, const0_rtx);
+    }
+}
+
+void
+nds32_emit_push_fpr_callee_saved (int base_offset)
+{
+  rtx fpu_insn;
+  rtx reg, mem;
+  unsigned int regno = cfun->machine->callee_saved_first_fpr_regno;
+  unsigned int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+
+  while (regno <= last_fpr)
+    {
+      /* Each DFmode store handles two registers, using fsdi instruction.  */
+      reg = gen_rtx_REG (DFmode, regno);
+      mem = gen_frame_mem (DFmode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  base_offset));
+      base_offset += 8;
+      regno += 2;
+      fpu_insn = emit_move_insn (mem, reg);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+    }
+}
+
+void
+nds32_emit_pop_fpr_callee_saved (int gpr_padding_size)
+{
+  rtx fpu_insn;
+  rtx reg, mem, addr;
+  rtx dwarf, adjust_sp_rtx;
+  unsigned int regno = cfun->machine->callee_saved_first_fpr_regno;
+  unsigned int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+  int padding = 0;
+
+  while (regno <= last_fpr)
+    {
+      /* Pop two registers with fldi.bi; the last pop also skips padding.  */
+      if ((regno + 1) >= last_fpr)
+	padding = gpr_padding_size;
+
+      reg = gen_rtx_REG (DFmode, (regno));
+      addr = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx,
+				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						GEN_INT (8 + padding)));
+      mem = gen_frame_mem (DFmode, addr);
+      regno += 2;
+      fpu_insn = emit_move_insn (reg, mem);
+
+      adjust_sp_rtx =
+	gen_rtx_SET (stack_pointer_rtx,
+		     plus_constant (Pmode, stack_pointer_rtx,
+				    8 + padding));
+
+      dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, NULL_RTX);
+      /* Tell gcc we adjust SP in this insn.  */
+      dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, copy_rtx (adjust_sp_rtx),
+			      dwarf);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+      REG_NOTES (fpu_insn) = dwarf;
+    }
+}
+
+void
+nds32_emit_v3pop_fpr_callee_saved (int base)
+{
+  int fpu_base_addr = base;
+  int regno;
+  rtx fpu_insn;
+  rtx reg, mem;
+  rtx dwarf;
+
+  regno = cfun->machine->callee_saved_first_fpr_regno;
+  while (regno <= cfun->machine->callee_saved_last_fpr_regno)
+    {
+      /* Load two registers from SP + fpu_base_addr, using fldi instruction.  */
+      reg = gen_rtx_REG (DFmode, regno);
+      mem = gen_frame_mem (DFmode, plus_constant (Pmode,
+						  stack_pointer_rtx,
+						  fpu_base_addr));
+      fpu_base_addr += 8;
+      regno += 2;
+      fpu_insn = emit_move_insn (reg, mem);
+      dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, NULL_RTX);
+      RTX_FRAME_RELATED_P (fpu_insn) = 1;
+      REG_NOTES (fpu_insn) = dwarf;
+    }
+}
+
+enum nds32_expand_result_type
+nds32_expand_extv (rtx *operands)
+{
+  gcc_assert (CONST_INT_P (operands[2]) && CONST_INT_P (operands[3]));
+  HOST_WIDE_INT width = INTVAL (operands[2]);
+  HOST_WIDE_INT bitpos = INTVAL (operands[3]);
+  rtx dst = operands[0];
+  rtx src = operands[1];
+  /* Only a byte-aligned 32-bit field of a MEM source is expanded here.  */
+  if (MEM_P (src)
+      && width == 32
+      && (bitpos % BITS_PER_UNIT)  == 0
+      && GET_MODE_BITSIZE (GET_MODE (dst)) == width)
+    {
+      rtx newmem = adjust_address (src, GET_MODE (dst),
+				   bitpos / BITS_PER_UNIT);
+
+      rtx base_addr = force_reg (Pmode, XEXP (newmem, 0));
+
+      emit_insn (gen_unaligned_loadsi (dst, base_addr));
+
+      return EXPAND_DONE;
+    }
+  return EXPAND_FAIL;
+}
+
+enum nds32_expand_result_type
+nds32_expand_insv (rtx *operands)
+{
+  gcc_assert (CONST_INT_P (operands[1]) && CONST_INT_P (operands[2]));
+  HOST_WIDE_INT width = INTVAL (operands[1]);
+  HOST_WIDE_INT bitpos = INTVAL (operands[2]);
+  rtx dst = operands[0];
+  rtx src = operands[3];
+  /* Only a byte-aligned 32-bit field of a MEM destination is expanded.  */
+  if (MEM_P (dst)
+      && width == 32
+      && (bitpos % BITS_PER_UNIT)  == 0
+      && GET_MODE_BITSIZE (GET_MODE (src)) == width)
+    {
+      rtx newmem = adjust_address (dst, GET_MODE (src),
+				      bitpos / BITS_PER_UNIT);
+
+      rtx base_addr = force_reg (Pmode, XEXP (newmem, 0));
+
+      emit_insn (gen_unaligned_storesi (base_addr, src));
+
+      return EXPAND_DONE;
+    }
+  return EXPAND_FAIL;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 3: Auxiliary function for output asm template. */
+
+/* Function to generate PC relative jump table.
+   Refer to nds32.md for more details.
+
+   The following is the sample for the case that diff value
+   can be represented in '.short' size.
+
+     addi    $r1, $r1, -(case_lower_bound)
+     slti    $ta, $r1, (case_number)
+     beqz    $ta, .L_skip_label
+
+     la      $ta, .L35             ! get jump table address
+     lh      $r1, [$ta + $r1 << 1] ! load symbol diff from jump table entry
+     add     $ta, $r1, $ta
+     jr5     $ta
+
+     ! jump table entry
+   L35:
+     .short  .L25-.L35
+     .short  .L26-.L35
+     .short  .L27-.L35
+     .short  .L28-.L35
+     .short  .L29-.L35
+     .short  .L30-.L35
+     .short  .L31-.L35
+     .short  .L32-.L35
+     .short  .L33-.L35
+     .short  .L34-.L35 */
+const char *
+nds32_output_casesi_pc_relative (rtx *operands)
+{
+  enum machine_mode mode;
+  rtx diff_vec;
+
+  diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[1])));
+
+  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
+
+  /* Step C: "t <-- operands[1]".  */
+  if (flag_pic)
+    {
+      output_asm_insn ("sethi\t$ta, hi20(%l1@GOTOFF)", operands);
+      output_asm_insn ("ori\t$ta, $ta, lo12(%l1@GOTOFF)", operands);
+      output_asm_insn ("add\t$ta, $ta, $gp", operands);
+    }
+  else
+    output_asm_insn ("la\t$ta, %l1", operands);
+
+  /* Get the mode of each element in the difference vector.  */
+  mode = GET_MODE (diff_vec);

+  /* Step D: "z <-- (mem (plus (operands[0] << m) t))",
+     where m is 0, 1, or 2 to load address-diff value from table.  */
+  switch (mode)
+    {
+    case QImode:
+      output_asm_insn ("lb\t%2, [$ta + %0 << 0]", operands);
+      break;
+    case HImode:
+      output_asm_insn ("lh\t%2, [$ta + %0 << 1]", operands);
+      break;
+    case SImode:
+      output_asm_insn ("lw\t%2, [$ta + %0 << 2]", operands);
+      break;
     default:
-      abort ();
+      gcc_unreachable ();
     }

-  output_asm_insn (pattern, operands);
-  return "";
+  /* Step E: "t <-- z + t".
+     Add table label_ref with address-diff value to
+     obtain target case address.  */
+  output_asm_insn ("add\t$ta, %2, $ta", operands);
+
+  /* Step F: jump to target with register t.  */
+  if (TARGET_16_BIT)
+    return "jr5\t$ta";
+  else
+    return "jr\t$ta";
 }

-/* Function to output stack push operation.
-   We need to deal with normal stack push multiple or stack v3push.  */
+/* Function to generate normal jump table (entries are target addresses).  */
 const char *
-nds32_output_stack_push (rtx par_rtx)
+nds32_output_casesi (rtx *operands)
 {
-  /* A string pattern for output_asm_insn().  */
-  char pattern[100];
-  /* The operands array which will be used in output_asm_insn().  */
-  rtx operands[3];
-  /* Pick up varargs first regno and last regno for further use.  */
-  int rb_va_args = cfun->machine->va_args_first_regno;
-  int re_va_args = cfun->machine->va_args_last_regno;
-  int last_argument_regno = NDS32_FIRST_GPR_REGNUM
-			    + NDS32_MAX_GPR_REGS_FOR_ARGS
-			    - 1;
-  /* Pick up callee-saved first regno and last regno for further use.  */
-  int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno;
-  int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno;
+  /* Step C: "t <-- operands[1]".  */
+  if (flag_pic)
+    {
+      output_asm_insn ("sethi\t$ta, hi20(%l1@GOTOFF)", operands);
+      output_asm_insn ("ori\t$ta, $ta, lo12(%l1@GOTOFF)", operands);
+      output_asm_insn ("add\t$ta, $ta, $gp", operands);
+    }
+  else
+    output_asm_insn ("la\t$ta, %l1", operands);

-  /* First we need to check if we are pushing argument registers not used
-     for the named arguments.  If so, we have to create 'smw.adm' (push.s)
-     instruction.  */
-  if (reg_mentioned_p (gen_rtx_REG (SImode, last_argument_regno), par_rtx))
+  /* Step D: "z <-- (mem (plus (operands[0] << 2) t))".  */
+  output_asm_insn ("lw\t%2, [$ta + %0 << 2]", operands);
+
+  /* No need to perform Step E, which is only used for
+     pc relative jump table.  */
+
+  /* Step F: jump to target with register z.  */
+  if (TARGET_16_BIT)
+    return "jr5\t%2";
+  else
+    return "jr\t%2";
+}
+
+
+/* Function to return the 16-bit address format of memory operand OP.  */
+enum nds32_16bit_address_type
+nds32_mem_format (rtx op)
+{
+  enum machine_mode mode_test;
+  int val;
+  int regno;
+
+  if (!TARGET_16_BIT)
+    return ADDRESS_NOT_16BIT_FORMAT;
+
+  mode_test = GET_MODE (op);
+
+  op = XEXP (op, 0);
+
+  /* 45 format.  */
+  if (GET_CODE (op) == REG
+      && ((mode_test == SImode) || (mode_test == SFmode)))
+    return ADDRESS_REG;
+
+  /* 333 format for QI/HImode.  */
+  if (GET_CODE (op) == REG && (REGNO (op) < R8_REGNUM))
+    return ADDRESS_LO_REG_IMM3U;
+
+  /* post_inc 333 format.  */
+  if ((GET_CODE (op) == POST_INC)
+      && ((mode_test == SImode) || (mode_test == SFmode)))
     {
-      /* Set operands[0] and operands[1].  */
-      operands[0] = gen_rtx_REG (SImode, rb_va_args);
-      operands[1] = gen_rtx_REG (SImode, re_va_args);
-      /* Create assembly code pattern: "Rb, Re, { }".  */
-      snprintf (pattern, sizeof (pattern), "push.s\t%s", "%0, %1, { }");
-      /* We use output_asm_insn() to output assembly code by ourself.  */
-      output_asm_insn (pattern, operands);
-      return "";
+      regno = REGNO(XEXP (op, 0));
+
+      if (regno < 8)
+	return ADDRESS_POST_INC_LO_REG_IMM3U;
+    }
+
+  /* post_modify 333 format.  */
+  if ((GET_CODE (op) == POST_MODIFY)
+      && ((mode_test == SImode) || (mode_test == SFmode))
+      && (REG_P (XEXP (XEXP (op, 1), 0)))
+      && (CONST_INT_P (XEXP (XEXP (op, 1), 1))))
+    {
+      regno = REGNO (XEXP (XEXP (op, 1), 0));
+      val = INTVAL (XEXP (XEXP (op, 1), 1));
+      if (regno < 8 && val > 0 && val < 32)
+	return ADDRESS_POST_MODIFY_LO_REG_IMM3U;
     }

-  /* If we step here, we are going to do v3push or multiple push operation.  */
+  if ((GET_CODE (op) == PLUS)
+      && (GET_CODE (XEXP (op, 0)) == REG)
+      && (GET_CODE (XEXP (op, 1)) == CONST_INT))
+    {
+      val = INTVAL (XEXP (op, 1));
+
+      regno = REGNO(XEXP (op, 0));
+
+      if (regno > 8
+	  && regno != SP_REGNUM
+	  && regno != FP_REGNUM)
+	return ADDRESS_NOT_16BIT_FORMAT;
+
+      switch (mode_test)
+	{
+	case QImode:
+	  /* 333 format.  */
+	  if (val >= 0 && val < 8 && regno < 8)
+	    return ADDRESS_LO_REG_IMM3U;
+	  break;
+
+	case HImode:
+	  /* 333 format.  */
+	  if (val >= 0 && val < 16 && (val % 2 == 0) && regno < 8)
+	    return ADDRESS_LO_REG_IMM3U;
+	  break;
+
+	case SImode:
+	case SFmode:
+	case DFmode:
+	  /* r8 imply fe format.  */
+	  if ((regno == 8) &&
+	      (val >= -128 && val <= -4 && (val % 4 == 0)))
+	    return ADDRESS_R8_IMM7U;
+	  /* fp imply 37 format.  */
+	  if ((regno == FP_REGNUM) &&
+	      (val >= 0 && val < 512 && (val % 4 == 0)))
+	    return ADDRESS_FP_IMM7U;
+	  /* sp imply 37 format.  */
+	  else if ((regno == SP_REGNUM) &&
+		   (val >= 0 && val < 512 && (val % 4 == 0)))
+	    return ADDRESS_SP_IMM7U;
+	  /* 333 format.  */
+	  else if (val >= 0 && val < 32 && (val % 4 == 0) && regno < 8)
+	    return ADDRESS_LO_REG_IMM3U;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+
+  return ADDRESS_NOT_16BIT_FORMAT;
+}
+
+/* Output 16-bit store.  */
+const char *
+nds32_output_16bit_store (rtx *operands, int byte)
+{
+  char pattern[100];
+  char size;
+  rtx code = XEXP (operands[0], 0);
+
+  size = nds32_byte_to_size (byte);
+
+  switch (nds32_mem_format (operands[0]))
+    {
+    case ADDRESS_REG:
+      operands[0] = code;
+      output_asm_insn ("swi450\t%1, [%0]", operands);
+      break;
+    case ADDRESS_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "s%ci333\t%%1, %%0", size);
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_POST_INC_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "swi333.bi\t%%1, %%0, 4");
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_POST_MODIFY_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "swi333.bi\t%%1, %%0");
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_FP_IMM7U:
+      output_asm_insn ("swi37\t%1, %0", operands);
+      break;
+    case ADDRESS_SP_IMM7U:
+      /* Get immediate value and set back to operands[0].  */
+      operands[0] = XEXP (code, 1);
+      output_asm_insn ("swi37.sp\t%1, [ + (%0)]", operands);
+      break;
+    default:
+      break;
+    }
+
+  return "";
+}
+
+/* Output 16-bit load.  */
+const char *
+nds32_output_16bit_load (rtx *operands, int byte)
+{
+  char pattern[100];
+  unsigned char size;
+  rtx code = XEXP (operands[1], 0);
+
+  size = nds32_byte_to_size (byte);
+
+  switch (nds32_mem_format (operands[1]))
+    {
+    case ADDRESS_REG:
+      operands[1] = code;
+      output_asm_insn ("lwi450\t%0, [%1]", operands);
+      break;
+    case ADDRESS_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "l%ci333\t%%0, %%1", size);
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_POST_INC_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "lwi333.bi\t%%0, %%1, 4");
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_POST_MODIFY_LO_REG_IMM3U:
+      snprintf (pattern, sizeof (pattern), "lwi333.bi\t%%0, %%1");
+      output_asm_insn (pattern, operands);
+      break;
+    case ADDRESS_R8_IMM7U:
+      output_asm_insn ("lwi45.fe\t%0, %e1", operands);
+      break;
+    case ADDRESS_FP_IMM7U:
+      output_asm_insn ("lwi37\t%0, %1", operands);
+      break;
+    case ADDRESS_SP_IMM7U:
+      /* Get immediate value and set back to operands[1].  */
+      operands[1] = XEXP (code, 1);
+      output_asm_insn ("lwi37.sp\t%0, [ + (%1)]", operands);
+      break;
+    default:
+      break;
+    }
+
+  return "";
+}
+
+/* Output 32-bit store; BYTE selects the b/h/w mnemonic size letter.  */
+const char *
+nds32_output_32bit_store (rtx *operands, int byte)
+{
+  char pattern[100];
+  unsigned char size;
+  rtx code = XEXP (operands[0], 0);
+
+  size = nds32_byte_to_size (byte);
+
+  switch (GET_CODE (code))
+    {
+    case REG:
+      /* (mem (reg X))
+	 => access location by using register,
+	 use "sbi / shi / swi" */
+      snprintf (pattern, sizeof (pattern), "s%ci\t%%1, %%0", size);
+      break;
+
+    case SYMBOL_REF:
+    case CONST:
+      /* (mem (symbol_ref X))
+	 (mem (const (...)))
+	 => access global variables,
+	 use "sbi.gp / shi.gp / swi.gp" */
+      operands[0] = XEXP (operands[0], 0);
+      snprintf (pattern, sizeof (pattern), "s%ci.gp\t%%1, [ + %%0]", size);
+      break;
+
+    case POST_INC:
+      /* (mem (post_inc reg))
+	 => access location by using register which will be post increment,
+	 use "sbi.bi / shi.bi / swi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"s%ci.bi\t%%1, %%0, %d", size, byte);
+      break;
+
+    case POST_DEC:
+      /* (mem (post_dec reg))
+	 => access location by using register which will be post decrement,
+	 use "sbi.bi / shi.bi / swi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"s%ci.bi\t%%1, %%0, -%d", size, byte);
+      break;
+
+    case POST_MODIFY:
+      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (post_modify (reg) (plus (reg) (reg))))
+	     => access location by using register which will be
+	     post modified with reg,
+	     use "sb.bi/ sh.bi / sw.bi" */
+	  snprintf (pattern, sizeof (pattern), "s%c.bi\t%%1, %%0", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
+	     => access location by using register which will be
+	     post modified with const_int,
+	     use "sbi.bi/ shi.bi / swi.bi" */
+	  snprintf (pattern, sizeof (pattern), "s%ci.bi\t%%1, %%0", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case PLUS:
+      switch (GET_CODE (XEXP (code, 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
+	     => access location by adding two registers,
+	     use "sb / sh / sw" */
+	  snprintf (pattern, sizeof (pattern), "s%c\t%%1, %%0", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (plus reg const_int))
+	     => access location by adding one register with const_int,
+	     use "sbi / shi / swi" */
+	  snprintf (pattern, sizeof (pattern), "s%ci\t%%1, %%0", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case LO_SUM:
+      operands[2] = XEXP (code, 1);
+      operands[0] = XEXP (code, 0);
+      snprintf (pattern, sizeof (pattern),
+		"s%ci\t%%1, [%%0 + lo12(%%2)]", size);
+      break;
+
+    default:
+      abort ();
+    }
+
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+/* Output 32-bit load; BYTE selects the b/h/w mnemonic size letter.  */
+const char *
+nds32_output_32bit_load (rtx *operands, int byte)
+{
+  char pattern[100];
+  unsigned char size;
+  rtx code;
+
+  code = XEXP (operands[1], 0);
+
+  size = nds32_byte_to_size (byte);
+
+  switch (GET_CODE (code))
+    {
+    case REG:
+      /* (mem (reg X))
+	 => access location by using register,
+	 use "lbi / lhi / lwi" */
+      snprintf (pattern, sizeof (pattern), "l%ci\t%%0, %%1", size);
+      break;
+
+    case SYMBOL_REF:
+    case CONST:
+      /* (mem (symbol_ref X))
+	 (mem (const (...)))
+	 => access global variables,
+	 use "lbi.gp / lhi.gp / lwi.gp" */
+      operands[1] = XEXP (operands[1], 0);
+      snprintf (pattern, sizeof (pattern), "l%ci.gp\t%%0, [ + %%1]", size);
+      break;
+
+    case POST_INC:
+      /* (mem (post_inc reg))
+	 => access location by using register which will be post increment,
+	 use "lbi.bi / lhi.bi / lwi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"l%ci.bi\t%%0, %%1, %d", size, byte);
+      break;
+
+    case POST_DEC:
+      /* (mem (post_dec reg))
+	 => access location by using register which will be post decrement,
+	 use "lbi.bi / lhi.bi / lwi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"l%ci.bi\t%%0, %%1, -%d", size, byte);
+      break;
+
+    case POST_MODIFY:
+      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (post_modify (reg) (plus (reg) (reg))))
+	     => access location by using register which will be
+	     post modified with reg,
+	     use "lb.bi/ lh.bi / lw.bi" */
+	  snprintf (pattern, sizeof (pattern), "l%c.bi\t%%0, %%1", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
+	     => access location by using register which will be
+	     post modified with const_int,
+	     use "lbi.bi/ lhi.bi / lwi.bi" */
+	  snprintf (pattern, sizeof (pattern), "l%ci.bi\t%%0, %%1", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case PLUS:
+      switch (GET_CODE (XEXP (code, 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
+	     => add two registers, use "lb / lh / lw" */
+	  snprintf (pattern, sizeof (pattern), "l%c\t%%0, %%1", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (plus reg const_int))
+	     => access location by adding one register with const_int,
+	     use "lbi / lhi / lwi" */
+	  snprintf (pattern, sizeof (pattern), "l%ci\t%%0, %%1", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case LO_SUM:
+      operands[2] = XEXP (code, 1);
+      operands[1] = XEXP (code, 0);
+      snprintf (pattern, sizeof (pattern),
+		"l%ci\t%%0, [%%1 + lo12(%%2)]", size);
+      break;
+
+    default:
+      abort ();
+    }
+
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+/* Output 32-bit load with sign extension; BYTE selects the b/h letter.  */
+const char *
+nds32_output_32bit_load_se (rtx *operands, int byte)
+{
+  char pattern[100];
+  unsigned char size;
+  rtx code;
+
+  code = XEXP (operands[1], 0);
+
+  size = nds32_byte_to_size (byte);
+
+  switch (GET_CODE (code))
+    {
+    case REG:
+      /* (mem (reg X))
+	 => access location by using register,
+	 use "lbsi / lhsi" */
+      snprintf (pattern, sizeof (pattern), "l%csi\t%%0, %%1", size);
+      break;
+
+    case SYMBOL_REF:
+    case CONST:
+      /* (mem (symbol_ref X))
+	 (mem (const (...)))
+	 => access global variables,
+	 use "lbsi.gp / lhsi.gp" */
+      operands[1] = XEXP (operands[1], 0);
+      snprintf (pattern, sizeof (pattern), "l%csi.gp\t%%0, [ + %%1]", size);
+      break;
+
+    case POST_INC:
+      /* (mem (post_inc reg))
+	 => access location by using register which will be post increment,
+	 use "lbsi.bi / lhsi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"l%csi.bi\t%%0, %%1, %d", size, byte);
+      break;
+
+    case POST_DEC:
+      /* (mem (post_dec reg))
+	 => access location by using register which will be post decrement,
+	 use "lbsi.bi / lhsi.bi" */
+      snprintf (pattern, sizeof (pattern),
+		"l%csi.bi\t%%0, %%1, -%d", size, byte);
+      break;
+
+    case POST_MODIFY:
+      switch (GET_CODE (XEXP (XEXP (code, 1), 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (post_modify (reg) (plus (reg) (reg))))
+	     => access location by using register which will be
+	     post modified with reg,
+	     use "lbs.bi/ lhs.bi" */
+	  snprintf (pattern, sizeof (pattern), "l%cs.bi\t%%0, %%1", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (post_modify (reg) (plus (reg) (const_int))))
+	     => access location by using register which will be
+	     post modified with const_int,
+	     use "lbsi.bi/ lhsi.bi" */
+	  snprintf (pattern, sizeof (pattern), "l%csi.bi\t%%0, %%1", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case PLUS:
+      switch (GET_CODE (XEXP (code, 1)))
+	{
+	case REG:
+	case SUBREG:
+	  /* (mem (plus reg reg)) or (mem (plus (mult reg const_int) reg))
+	     => add two registers, use "lbs / lhs" */
+	  snprintf (pattern, sizeof (pattern), "l%cs\t%%0, %%1", size);
+	  break;
+	case CONST_INT:
+	  /* (mem (plus reg const_int))
+	     => access location by adding one register with const_int,
+	     use "lbsi / lhsi" */
+	  snprintf (pattern, sizeof (pattern), "l%csi\t%%0, %%1", size);
+	  break;
+	default:
+	  abort ();
+	}
+      break;
+
+    case LO_SUM:
+      operands[2] = XEXP (code, 1);
+      operands[1] = XEXP (code, 0);
+      snprintf (pattern, sizeof (pattern),
+		"l%csi\t%%0, [%%1 + lo12(%%2)]", size);
+      break;
+
+    default:
+      abort ();
+    }
+
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+/* Function to output stack push operation.
+   We need to deal with normal stack push multiple or stack v3push.  */
+const char *
+nds32_output_stack_push (rtx par_rtx)
+{
+  /* A string pattern for output_asm_insn().  */
+  char pattern[100];
+  /* The operands array which will be used in output_asm_insn().  */
+  rtx operands[3];
+  /* Pick up varargs first regno and last regno for further use.  */
+  int rb_va_args = cfun->machine->va_args_first_regno;
+  int re_va_args = cfun->machine->va_args_last_regno;
+  int last_argument_regno = NDS32_FIRST_GPR_REGNUM
+			    + NDS32_MAX_GPR_REGS_FOR_ARGS
+			    - 1;
+  /* Pick up first and last eh data regno for further use.  */
+  int rb_eh_data = cfun->machine->eh_return_data_first_regno;
+  int re_eh_data = cfun->machine->eh_return_data_last_regno;
+  int first_eh_data_regno = EH_RETURN_DATA_REGNO (0);
+  /* Pick up callee-saved first regno and last regno for further use.  */
+  int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno;
+  int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno;
+
+  /* First we need to check if we are pushing argument registers not used
+     for the named arguments.  If so, we have to create 'smw.adm' (push.s)
+     instruction.  */
+  if (reg_mentioned_p (gen_rtx_REG (SImode, last_argument_regno), par_rtx))
+    {
+      /* Set operands[0] and operands[1].  */
+      operands[0] = gen_rtx_REG (SImode, rb_va_args);
+      operands[1] = gen_rtx_REG (SImode, re_va_args);
+      /* Create assembly code pattern: "Rb, Re, { }".  */
+      snprintf (pattern, sizeof (pattern), "push.s\t%s", "%0, %1, { }");
+      /* We use output_asm_insn() to output assembly code by ourself.  */
+      output_asm_insn (pattern, operands);
+      return "";
+    }
+
+  /* If last_argument_regno is not mentioned in par_rtx, we can confirm that
+     we do not need to push argument registers for variadic function.
+     But we still need to check if we need to push exception handling
+     data registers.  */
+  if (reg_mentioned_p (gen_rtx_REG (SImode, first_eh_data_regno), par_rtx))
+    {
+      /* Set operands[0] and operands[1].  */
+      operands[0] = gen_rtx_REG (SImode, rb_eh_data);
+      operands[1] = gen_rtx_REG (SImode, re_eh_data);
+      /* Create assembly code pattern: "Rb, Re, { }".  */
+      snprintf (pattern, sizeof (pattern), "push.s\t%s", "%0, %1, { }");
+      /* We use output_asm_insn() to output assembly code by ourself.  */
+      output_asm_insn (pattern, operands);
+      return "";
+    }
+
+  /* If we step here, we are going to do v3push or multiple push operation.  */
+
+  /* Refer to nds32.h, where we comment when push25/pop25 are available.  */
+  if (NDS32_V3PUSH_AVAILABLE_P)
+    {
+      /* For stack v3push:
+	   operands[0]: Re
+	   operands[1]: imm8u */
+
+      /* This variable is to check if 'push25 Re,imm8u' is available.  */
+      int sp_adjust;
+
+      /* Set operands[0].  */
+      operands[0] = gen_rtx_REG (SImode, re_callee_saved);
+
+      /* Check if we can generate 'push25 Re,imm8u'; otherwise fall back
+	 to 'push25 Re,0' or to an imm8u for padding and fpr space only.  */
+      sp_adjust = cfun->machine->local_size
+		  + cfun->machine->out_args_size
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
+      if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
+	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust))
+	operands[1] = GEN_INT (sp_adjust);
+      else
+	{
+	  /* Allocate callee saved fpr space.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      sp_adjust = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	      operands[1] = GEN_INT (sp_adjust);
+	    }
+	  else
+	    {
+	      operands[1] = GEN_INT (0);
+	    }
+	}
+
+      /* Create assembly code pattern.  */
+      snprintf (pattern, sizeof (pattern), "push25\t%%0, %%1");
+    }
+  else
+    {
+      /* For normal stack push multiple:
+	 operands[0]: Rb
+	 operands[1]: Re
+	 operands[2]: En4 */
+
+      /* This variable is used to check if we only need to generate En4 field.
+	 As long as Rb==Re==SP_REGNUM, we set this variable to 1.  */
+      int push_en4_only_p = 0;
+
+      /* Set operands[0] and operands[1].  */
+      operands[0] = gen_rtx_REG (SImode, rb_callee_saved);
+      operands[1] = gen_rtx_REG (SImode, re_callee_saved);
+
+      /* 'smw.adm $sp,[$sp],$sp,0' means push nothing.  */
+      if (!cfun->machine->fp_size
+	  && !cfun->machine->gp_size
+	  && !cfun->machine->lp_size
+	  && REGNO (operands[0]) == SP_REGNUM
+	  && REGNO (operands[1]) == SP_REGNUM)
+	{
+	  /* No need to generate instruction.  */
+	  return "";
+	}
+      else
+	{
+	  /* If Rb==Re==SP_REGNUM, we only need to generate En4 field.  */
+	  if (REGNO (operands[0]) == SP_REGNUM
+	      && REGNO (operands[1]) == SP_REGNUM)
+	    push_en4_only_p = 1;
+
+	  /* Create assembly code pattern.
+	     We need to handle the form: "Rb, Re, { $fp $gp $lp }".  */
+	  snprintf (pattern, sizeof (pattern),
+		    "push.s\t%s{%s%s%s }",
+		    push_en4_only_p ? "" : "%0, %1, ",
+		    cfun->machine->fp_size ? " $fp" : "",
+		    cfun->machine->gp_size ? " $gp" : "",
+		    cfun->machine->lp_size ? " $lp" : "");
+	}
+    }
+
+  /* We use output_asm_insn() to output assembly code by ourself.  */
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+/* Output the stack pop for PAR_RTX: an eh-data pop.s, a v3 pop25, or a
+   pop.s multiple.  Text goes through output_asm_insn(); "" is returned.  */
+const char *
+nds32_output_stack_pop (rtx par_rtx ATTRIBUTE_UNUSED)
+{
+  /* A string pattern for output_asm_insn().  */
+  char pattern[100];
+  /* The operands array which will be used in output_asm_insn().  */
+  rtx operands[3];
+  /* Pick up first and last eh data regno for further use.  */
+  int rb_eh_data = cfun->machine->eh_return_data_first_regno;
+  int re_eh_data = cfun->machine->eh_return_data_last_regno;
+  int first_eh_data_regno = EH_RETURN_DATA_REGNO (0);
+  /* Pick up callee-saved first regno and last regno for further use.  */
+  int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno;
+  int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno;
+
+  /* We need to check if we need to pop exception handling
+     data registers.  */
+  if (reg_mentioned_p (gen_rtx_REG (SImode, first_eh_data_regno), par_rtx))
+    {
+      /* Set operands[0] and operands[1].  */
+      operands[0] = gen_rtx_REG (SImode, rb_eh_data);
+      operands[1] = gen_rtx_REG (SImode, re_eh_data);
+      /* Create assembly code pattern: "Rb, Re, { }".  */
+      snprintf (pattern, sizeof (pattern), "pop.s\t%s", "%0, %1, { }");
+      /* We use output_asm_insn() to output assembly code by ourself.  */
+      output_asm_insn (pattern, operands);
+      return "";
+    }
+
+  /* If we step here, we are going to do v3pop or multiple pop operation.  */
+
+  /* Refer to nds32.h, where we comment when push25/pop25 are available.  */
+  if (NDS32_V3PUSH_AVAILABLE_P)
+    {
+      /* For stack v3pop:
+	   operands[0]: Re
+	   operands[1]: imm8u */
+
+      /* This variable is to check if 'pop25 Re,imm8u' is available.  */
+      int sp_adjust;
+
+      /* Set operands[0].  */
+      operands[0] = gen_rtx_REG (SImode, re_callee_saved);
+
+      /* Check if we can generate 'pop25 Re,imm8u',
+	 otherwise, generate 'pop25 Re,0'.
+	 We have to consider alloca issue as well.
+	 If the function does call alloca(), the stack pointer is not fixed.
+	 In that case, we cannot use 'pop25 Re,imm8u' directly.
+	 We have to calculate stack pointer from frame pointer
+	 and then use 'pop25 Re,0'.  */
+      sp_adjust = cfun->machine->local_size
+		  + cfun->machine->out_args_size
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
+      if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
+	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
+	  && !cfun->calls_alloca)
+	operands[1] = GEN_INT (sp_adjust);
+      else
+	{
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* If there are FPRs to restore, $sp sits at the callee-saved
+		 FPR area, so account for the GPR padding bytes and the
+		 callee-saved FPR size.  */
+	      sp_adjust = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	      operands[1] = GEN_INT (sp_adjust);
+	    }
+	  else
+	    {
+	      operands[1] = GEN_INT (0);
+	    }
+	}
+
+      /* Create assembly code pattern.  */
+      snprintf (pattern, sizeof (pattern), "pop25\t%%0, %%1");
+    }
+  else
+    {
+      /* For normal stack pop multiple:
+	 operands[0]: Rb
+	 operands[1]: Re
+	 operands[2]: En4 */
+
+      /* This variable is used to check if we only need to generate En4 field.
+	 As long as Rb==Re=SP_REGNUM, we set this variable to 1.  */
+      int pop_en4_only_p = 0;
+
+      /* Set operands[0] and operands[1].  */
+      operands[0] = gen_rtx_REG (SImode, rb_callee_saved);
+      operands[1] = gen_rtx_REG (SImode, re_callee_saved);
+
+      /* 'lmw.bim $sp,[$sp],$sp,0' means pop nothing.  */
+      if (!cfun->machine->fp_size
+	  && !cfun->machine->gp_size
+	  && !cfun->machine->lp_size
+	  && REGNO (operands[0]) == SP_REGNUM
+	  && REGNO (operands[1]) == SP_REGNUM)
+	{
+	  /* No need to generate instruction.  */
+	  return "";
+	}
+      else
+	{
+	  /* If Rb==Re=SP_REGNUM, we only need to generate En4 field.  */
+	  if (REGNO (operands[0]) == SP_REGNUM
+	      && REGNO (operands[1]) == SP_REGNUM)
+	    pop_en4_only_p = 1;
+
+	  /* Create assembly code pattern.
+	     We need to handle the form: "Rb, Re, { $fp $gp $lp }".  */
+	  snprintf (pattern, sizeof (pattern),
+		    "pop.s\t%s{%s%s%s }",
+		    pop_en4_only_p ? "" : "%0, %1, ",
+		    cfun->machine->fp_size ? " $fp" : "",
+		    cfun->machine->gp_size ? " $gp" : "",
+		    cfun->machine->lp_size ? " $lp" : "");
+	}
+    }
+
+  /* We use output_asm_insn() to output assembly code by ourself.  */
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+/* Output the return as a 'pop25 Re, imm8u' insn; always returns "".  */
+const char *
+nds32_output_return (void)
+{
+  /* A string pattern for output_asm_insn().  */
+  char pattern[100];
+  /* The operands array which will be used in output_asm_insn().  */
+  rtx operands[2];
+  /* For stack v3pop:
+     operands[0]: Re
+     operands[1]: imm8u */
+  int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno;
+  int sp_adjust;
+
+  /* Set operands[0].  */
+  operands[0] = gen_rtx_REG (SImode, re_callee_saved);
+
+  /* Check if we can generate 'pop25 Re,imm8u',
+     otherwise, generate 'pop25 Re,0'.
+     We have to consider alloca issue as well.
+     If the function does call alloca(), the stack pointer is not fixed.
+     In that case, we cannot use 'pop25 Re,imm8u' directly.
+     We have to calculate stack pointer from frame pointer
+     and then use 'pop25 Re,0'.  */
+  sp_adjust = cfun->machine->local_size
+    + cfun->machine->out_args_size
+    + cfun->machine->callee_saved_area_gpr_padding_bytes
+    + cfun->machine->callee_saved_fpr_regs_size;
+  if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
+      && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
+      && !cfun->calls_alloca)
+    operands[1] = GEN_INT (sp_adjust);
+  else
+    operands[1] = GEN_INT (0);  /* NOTE(review): no FPR-area case here, unlike nds32_output_stack_pop -- confirm.  */
+
+  /* Create assembly code pattern.  */
+  snprintf (pattern, sizeof (pattern), "pop25\t%%0, %%1");
+  /* We use output_asm_insn() to output assembly code by ourself.  */
+  output_asm_insn (pattern, operands);
+  return "";
+}
+
+
+/* Output a float load of the MEM in operands[1] into operands[0]; returns "".  */
+const char *
+nds32_output_float_load (rtx *operands)
+{
+  char buff[100];  /* Final template after the %c substitution below.  */
+  const char *pattern;
+  rtx addr, addr_op0, addr_op1;
+  int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;  /* Nonzero for an 8-byte mode.  */
+  addr = XEXP (operands[1], 0);
+  switch (GET_CODE (addr))  /* The address form selects the mnemonic.  */
+    {
+    case REG:
+      pattern = "fl%ci\t%%0, %%1";
+      break;
+
+    case PLUS:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && REG_P (addr_op1))
+	pattern = "fl%c\t%%0, %%1";
+      else if (REG_P (addr_op0) && CONST_INT_P (addr_op1))
+	pattern = "fl%ci\t%%0, %%1";
+      else if (GET_CODE (addr_op0) == MULT && REG_P (addr_op1)
+	       && REG_P (XEXP (addr_op0, 0))
+	       && CONST_INT_P (XEXP (addr_op0, 1)))
+	pattern = "fl%c\t%%0, %%1";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_MODIFY:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	  && REG_P (XEXP (addr_op1, 1)))
+	pattern = "fl%c.bi\t%%0, %%1";
+      else if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	       && CONST_INT_P (XEXP (addr_op1, 1)))
+	pattern = "fl%ci.bi\t%%0, %%1";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_INC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)  /* The step matches the access size: 8 or 4.  */
+	    pattern = "fl%ci.bi\t%%0, %%1, 8";
+	  else
+	    pattern = "fl%ci.bi\t%%0, %%1, 4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_DEC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)  /* Same as POST_INC with a negative step.  */
+	    pattern = "fl%ci.bi\t%%0, %%1, -8";
+	  else
+	    pattern = "fl%ci.bi\t%%0, %%1, -4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  sprintf (buff, pattern, dp ? 'd' : 's');  /* %c becomes 'd' or 's'.  */
+  output_asm_insn (buff, operands);
+  return "";
+}
+
+/* Output a float store of operands[1] to the MEM in operands[0]; returns "".  */
+const char *
+nds32_output_float_store (rtx *operands)
+{
+  char buff[100];  /* Final template after the %c substitution below.  */
+  const char *pattern;
+  rtx addr, addr_op0, addr_op1;
+  int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8;  /* Nonzero for an 8-byte mode.  */
+  addr = XEXP (operands[0], 0);
+  switch (GET_CODE (addr))  /* The address form selects the mnemonic.  */
+    {
+    case REG:
+      pattern = "fs%ci\t%%1, %%0";
+      break;
+
+    case PLUS:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && REG_P (addr_op1))
+	pattern = "fs%c\t%%1, %%0";
+      else if (REG_P (addr_op0) && CONST_INT_P (addr_op1))
+	pattern = "fs%ci\t%%1, %%0";
+      else if (GET_CODE (addr_op0) == MULT && REG_P (addr_op1)
+	       && REG_P (XEXP (addr_op0, 0))
+	       && CONST_INT_P (XEXP (addr_op0, 1)))
+	pattern = "fs%c\t%%1, %%0";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_MODIFY:
+      addr_op0 = XEXP (addr, 0);
+      addr_op1 = XEXP (addr, 1);
+
+      if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	  && REG_P (XEXP (addr_op1, 1)))
+	pattern = "fs%c.bi\t%%1, %%0";
+      else if (REG_P (addr_op0) && GET_CODE (addr_op1) == PLUS
+	       && CONST_INT_P (XEXP (addr_op1, 1)))
+	pattern = "fs%ci.bi\t%%1, %%0";
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_INC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)  /* The step matches the access size: 8 or 4.  */
+	    pattern = "fs%ci.bi\t%%1, %%0, 8";
+	  else
+	    pattern = "fs%ci.bi\t%%1, %%0, 4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case POST_DEC:
+      if (REG_P (XEXP (addr, 0)))
+	{
+	  if (dp)  /* Same as POST_INC with a negative step.  */
+	    pattern = "fs%ci.bi\t%%1, %%0, -8";
+	  else
+	    pattern = "fs%ci.bi\t%%1, %%0, -4";
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  sprintf (buff, pattern, dp ? 'd' : 's');  /* %c becomes 'd' or 's'.  */
+  output_asm_insn (buff, operands);
+  return "";
+}
+
+const char *
+nds32_output_smw_single_word (rtx *operands)
+{
+  char buff[100];  /* Template for one "smw.bi"/"smw.bim" insn storing operands[1].  */
+  unsigned regno;
+  int enable4;
+  bool update_base_p;
+  rtx base_addr = operands[0];  /* The MEM being stored to.  */
+  rtx base_reg;
+  rtx otherops[2];  /* [0]: base register, [1]: stored register.  */
+
+  if (REG_P (XEXP (base_addr, 0)))
+    {
+      update_base_p = false;
+      base_reg = XEXP (base_addr, 0);
+    }
+  else
+    {
+      /* Any non-REG address is treated as a base-updating form whose
+         first operand is the base register.  */
+      update_base_p = true;
+      base_reg = XEXP (XEXP (base_addr, 0), 0);
+    }
+
+  const char *update_base = update_base_p ? "m" : "";  /* "m" suffix updates the base.  */
+
+  regno = REGNO (operands[1]);
+
+  otherops[0] = base_reg;
+  otherops[1] = operands[1];
+
+  if (regno >= 28)
+    {
+      /* Registers 28 and up are named through the enable4 mask, with
+         $sp as both range ends.  */
+      enable4 = nds32_regno_to_enable4 (regno);
+      sprintf (buff, "smw.bi%s\t$sp, [%%0], $sp, %x", update_base, enable4);
+    }
+  else
+    {
+      sprintf (buff, "smw.bi%s\t%%1, [%%0], %%1", update_base);  /* Range of one register.  */
+    }
+  output_asm_insn (buff, otherops);
+  return "";  /* The text was already emitted above.  */
+}
+
+const char *
+nds32_output_smw_double_word (rtx *operands)
+{
+  char buff[100];  /* Template for one "smw.bi"/"smw.bim" insn storing a register pair.  */
+  unsigned regno;
+  int enable4;
+  bool update_base_p;
+  rtx base_addr = operands[0];  /* The MEM being stored to.  */
+  rtx base_reg;
+  rtx otherops[3];  /* [0]: base register, [1]: first register, [2]: next register.  */
+
+  if (REG_P (XEXP (base_addr, 0)))
+    {
+      update_base_p = false;
+      base_reg = XEXP (base_addr, 0);
+    }
+  else
+    {
+      /* Any non-REG address is treated as a base-updating form whose
+         first operand is the base register.  */
+      update_base_p = true;
+      base_reg = XEXP (XEXP (base_addr, 0), 0);
+    }
+
+  const char *update_base = update_base_p ? "m" : "";  /* "m" suffix updates the base.  */
+
+  regno = REGNO (operands[1]);
+
+  otherops[0] = base_reg;
+  otherops[1] = operands[1];
+  otherops[2] = gen_rtx_REG (SImode, REGNO (operands[1]) + 1);;
+
+  if (regno >= 28)
+    {
+      /* Both registers are named through the enable4 mask.  */
+      enable4 = nds32_regno_to_enable4 (regno)
+		| nds32_regno_to_enable4 (regno + 1);
+      sprintf (buff, "smw.bi%s\t$sp, [%%0], $sp, %x", update_base, enable4);
+    }
+  else if (regno == 27)
+    {
+      /* Register 27 is the range; register 28 goes in the enable4 mask.  */
+      enable4 = nds32_regno_to_enable4 (regno + 1);
+      sprintf (buff, "smw.bi%s\t%%1, [%%0], %%1, %x", update_base, enable4);
+    }
+  else
+    {
+      sprintf (buff, "smw.bi%s\t%%1, [%%0], %%2", update_base);  /* Plain two-register range.  */
+    }
+  output_asm_insn (buff, otherops);
+  return "";  /* The text was already emitted above.  */
+}
+
+
+const char *
+nds32_output_lmw_single_word (rtx *operands)
+{
+  char buff[100];  /* Template for one "lmw.bi"/"lmw.bim" insn loading operands[0].  */
+  unsigned regno;
+  bool update_base_p;
+  int enable4;
+  rtx base_addr = operands[1];  /* The MEM being loaded from.  */
+  rtx base_reg;
+  rtx otherops[2];  /* [0]: loaded register, [1]: base register.  */
+
+  if (REG_P (XEXP (base_addr, 0)))
+    {
+      update_base_p = false;
+      base_reg = XEXP (base_addr, 0);
+    }
+  else
+    {
+      /* Any non-REG address is treated as a base-updating form whose
+         first operand is the base register.  */
+      update_base_p = true;
+      base_reg = XEXP (XEXP (base_addr, 0), 0);
+    }
+
+  const char *update_base = update_base_p ? "m" : "";  /* "m" suffix updates the base.  */
+
+  regno = REGNO (operands[0]);
+
+  otherops[0] = operands[0];
+  otherops[1] = base_reg;
+
+  if (regno >= 28)
+    {
+      /* Registers 28 and up are named through the enable4 mask, with
+         $sp as both range ends.  */
+      enable4 = nds32_regno_to_enable4 (regno);
+      sprintf (buff, "lmw.bi%s\t$sp, [%%1], $sp, %x", update_base, enable4);
+    }
+  else
+    {
+      sprintf (buff, "lmw.bi%s\t%%0, [%%1], %%0", update_base);  /* Range of one register.  */
+    }
+  output_asm_insn (buff, otherops);
+  return "";  /* The text was already emitted above.  */
+}
+
+void
+nds32_expand_unaligned_load (rtx *operands, enum machine_mode mode)
+{
+  /* Initial memory offset.  */
+  int offset = WORDS_BIG_ENDIAN ? GET_MODE_SIZE (mode) - 1 : 0;
+  int offset_adj = WORDS_BIG_ENDIAN ? -1 : 1;  /* Direction of the byte walk.  */
+  /* Initial register shift byte.  */
+  int shift = 0;
+  /* Bytes left after the first load, which is emitted separately.  */
+  int width = GET_MODE_SIZE (mode) - 1;
+  rtx mem[2];
+  rtx reg[2];
+  rtx sub_reg;
+  rtx temp_reg, temp_sub_reg;
+  int num_reg;
+
+  /* Generating a series of load byte instructions.
+     The first load byte instructions and other
+     load byte instructions are not the same. like:
+     First:
+       lbi reg0, [mem]
+       zeh reg0, reg0
+     Second:
+       lbi temp_reg, [mem + offset]
+       sll temp_reg, (8 * shift)
+       ior reg0, temp_reg
+
+       lbi temp_reg, [mem + (offset + 1)]
+       sll temp_reg, (8 * (shift + 1))
+       ior reg0, temp_reg  */
+
+  temp_reg = gen_reg_rtx (SImode);
+  temp_sub_reg = gen_lowpart (QImode, temp_reg);
+
+  if (mode == DImode)
+    {
+      /* Load doubleword, we need two registers to access.  */
+      reg[0] = nds32_di_low_part_subreg (operands[0]);
+      reg[1] = nds32_di_high_part_subreg (operands[0]);
+      /* A register only stores 4 bytes.  */
+      width = GET_MODE_SIZE (SImode) - 1;
+    }
+  else
+    {
+      if (VECTOR_MODE_P (mode))
+	reg[0] = gen_reg_rtx (SImode);  /* Build in a scratch; converted at the end.  */
+      else
+	reg[0] = operands[0];
+    }
+
+  for (num_reg = (mode == DImode) ? 2 : 1; num_reg > 0; num_reg--)
+    {
+      sub_reg = gen_lowpart (QImode, reg[0]);
+      mem[0] = gen_rtx_MEM (QImode, plus_constant (Pmode, operands[1], offset));
+
+      /* Generating the first part instructions.
+	   lbi reg0, [mem]
+	   zeh reg0, reg0 */
+      emit_move_insn (sub_reg, mem[0]);
+      emit_insn (gen_zero_extendqisi2 (reg[0], sub_reg));
+
+      while (width > 0)
+	{
+	  offset = offset + offset_adj;
+	  shift++;
+	  width--;
+
+	  mem[1] = gen_rtx_MEM (QImode, plus_constant (Pmode,
+						       operands[1],
+						       offset));
+	  /* Generating the second part instructions.
+	       lbi temp_reg, [mem + offset]
+	       sll temp_reg, (8 * shift)
+	       ior reg0, temp_reg  */
+	  emit_move_insn (temp_sub_reg, mem[1]);
+	  emit_insn (gen_ashlsi3 (temp_reg, temp_reg,
+				  GEN_INT (shift * 8)));
+	  emit_insn (gen_iorsi3 (reg[0], reg[0], temp_reg));
+	}
+
+      if (mode == DImode)
+	{
+	  /* Switch to the second register for the remaining four bytes.  */
+	  reg[0] = reg[1];
+	  shift = 0;
+	  width = GET_MODE_SIZE (SImode) - 1;
+	  offset = offset + offset_adj;
+	}
+    }
+    if (VECTOR_MODE_P (mode))
+      convert_move (operands[0], reg[0], false);  /* Copy the scratch into the vector destination.  */
+}
+
+void
+nds32_expand_unaligned_store (rtx *operands, enum machine_mode mode)
+{
+  /* Initial memory offset.  */
+  int offset = WORDS_BIG_ENDIAN ? GET_MODE_SIZE (mode) - 1 : 0;
+  int offset_adj = WORDS_BIG_ENDIAN ? -1 : 1;  /* Direction of the byte walk.  */
+  /* Initial register shift byte.  */
+  int shift = 0;
+  /* Bytes left after the first store, which is emitted separately.  */
+  int width = GET_MODE_SIZE (mode) - 1;
+  rtx mem[2];
+  rtx reg[2];
+  rtx sub_reg;
+  rtx temp_reg, temp_sub_reg;
+  int num_reg;
+
+  /* Generating a series of store byte instructions.
+     The first store byte instructions and other
+     store byte instructions are not the same, like:
+     First:
+	sbi  reg0, [mem + 0]
+     Second:
+	srli    temp_reg, reg0, (8 * shift)
+	sbi	temp_reg, [mem + offset]  */
+
+  temp_reg = gen_reg_rtx (SImode);
+  temp_sub_reg = gen_lowpart (QImode, temp_reg);
+
+  if (mode == DImode)
+    {
+      /* Store doubleword, we need two registers to access.  */
+      reg[0] = nds32_di_low_part_subreg (operands[1]);
+      reg[1] = nds32_di_high_part_subreg (operands[1]);
+      /* A register only stores 4 bytes.  */
+      width = GET_MODE_SIZE (SImode) - 1;
+    }
+  else
+    {
+      if (VECTOR_MODE_P (mode))
+	{
+	  reg[0] = gen_reg_rtx (SImode);  /* Scratch holding the converted vector value.  */
+	  convert_move (reg[0], operands[1], false);
+	}
+      else
+	reg[0] = operands[1];
+    }
+
+  for (num_reg = (mode == DImode) ? 2 : 1; num_reg > 0; num_reg--)
+    {
+      sub_reg = gen_lowpart (QImode, reg[0]);
+      mem[0] = gen_rtx_MEM (QImode, plus_constant (Pmode, operands[0], offset));
+
+      /* Generating the first part instructions.
+	   sbi reg0, [mem + 0] */
+      emit_move_insn (mem[0], sub_reg);
+
+      while (width > 0)
+	{
+	  offset = offset + offset_adj;
+	  shift++;
+	  width--;
+
+	  mem[1] = gen_rtx_MEM (QImode, plus_constant (Pmode,
+						       operands[0],
+						       offset));
+	  /* Generating the second part instructions.
+	       srli  temp_reg, reg0, (8 * shift)
+	       sbi   temp_reg, [mem + offset]  */
+	  emit_insn (gen_lshrsi3 (temp_reg, reg[0],
+				  GEN_INT (shift * 8)));
+	  emit_move_insn (mem[1], temp_sub_reg);
+	}
+
+      if (mode == DImode)
+	{
+	  /* Switch to the second register for the remaining four bytes.  */
+	  reg[0] = reg[1];
+	  shift = 0;
+	  width = GET_MODE_SIZE (SImode) - 1;
+	  offset = offset + offset_adj;
+	}
+    }
+}
+
+/* Output a doubleword load (LOAD_P) or store as one lmw/smw over a register pair.  */
+const char *
+nds32_output_double (rtx *operands, bool load_p)
+{
+  char pattern[100];
+  int reg = load_p ? 0 : 1;  /* Index of the register operand.  */
+  int mem = load_p ? 1 : 0;  /* Index of the memory operand.  */
+  rtx otherops[3];  /* [0]: first register, [1]: next register, [2]: address.  */
+  rtx addr = XEXP (operands[mem], 0);
+
+  otherops[0] = gen_rtx_REG (SImode, REGNO (operands[reg]));
+  otherops[1] = gen_rtx_REG (SImode, REGNO (operands[reg]) + 1);
+
+  if (GET_CODE (addr)  == POST_INC)
+    {
+      /* (mem (post_inc (reg))) */
+      otherops[2] = XEXP (addr, 0);
+      snprintf (pattern, sizeof (pattern),
+		"%cmw.bim\t%%0, [%%2], %%1, 0", load_p ? 'l' : 's');
+    }
+  else
+    {
+      /* (mem (reg)) */
+      otherops[2] = addr;
+      snprintf (pattern, sizeof (pattern),
+		"%cmw.bi\t%%0, [%%2], %%1, 0", load_p ? 'l' : 's');
+
+    }
+
+  output_asm_insn (pattern, otherops);
+  return "";  /* The text was already emitted above.  */
+}
+
+const char *
+nds32_output_cbranchsi4_equality_zero (rtx_insn *insn, rtx *operands)
+{
+  enum rtx_code code;
+  bool long_jump_p = false;  /* Set for the length-8 and length-10 forms.  */
+
+  code = GET_CODE (operands[0]);  /* rtx code of the comparison in operands[0].  */
+
+  /* This zero-comparison conditional branch has two forms:
+       32-bit instruction =>          beqz/bnez           imm16s << 1
+       16-bit instruction => beqzs8/bnezs8/beqz38/bnez38  imm8s << 1
+
+     For 32-bit case,
+     we assume it is always reachable. (but check range -65500 ~ 65500)
+
+     For 16-bit case,
+     it must satisfy { 255 >= (label - pc) >= -256 } condition.
+     However, since the $pc for nds32 is at the beginning of the instruction,
+     we should leave some length space for current insn.
+     So we use range -250 ~ 250.  */
+
+  switch (get_attr_length (insn))  /* The insn length picks the output form.  */
+    {
+    case 8:
+      long_jump_p = true;
+      /* fall through  */
+    case 2:
+      if (which_alternative == 0)
+	{
+	  /* constraint: t */
+	  /*    b<cond>zs8  .L0
+	      or
+		b<inverse_cond>zs8  .LCB0
+		j  .L0
+	      .LCB0:
+	   */
+	  output_cond_branch_compare_zero (code, "s8", long_jump_p,
+					   operands, true);
+	  return "";
+	}
+      else if (which_alternative == 1)
+	{
+	  /* constraint: l */
+	  /*    b<cond>z38  $r0, .L0
+	      or
+		b<inverse_cond>z38  $r0, .LCB0
+		j  .L0
+	      .LCB0:
+	   */
+	  output_cond_branch_compare_zero (code, "38", long_jump_p,
+					   operands, false);
+	  return "";
+	}
+      else
+	{
+	  /* constraint: r */
+	  /* For which_alternative==2, it should not be here.  */
+	  gcc_unreachable ();
+	}
+    case 10:
+      /* including constraints: t, l, and r */
+      long_jump_p = true;
+      /* fall through  */
+    case 4:
+      /* including constraints: t, l, and r */
+      output_cond_branch_compare_zero (code, "", long_jump_p, operands, false);
+      return "";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+const char *
+nds32_output_cbranchsi4_equality_reg (rtx_insn *insn, rtx *operands)
+{
+  enum rtx_code code;
+  bool long_jump_p, r5_p;
+  int insn_length;
+
+  insn_length = get_attr_length (insn);
+
+  long_jump_p = (insn_length == 10 || insn_length == 8) ? true : false;  /* Lengths 8 and 10 are the long-jump forms.  */
+  r5_p = (insn_length == 2 || insn_length == 8) ? true : false;  /* NOTE(review): presumably the 16-bit $r5 compare form -- confirm in output_cond_branch.  */
+
+  code = GET_CODE (operands[0]);  /* rtx code of the comparison in operands[0].  */
+
+  /* This register-comparison conditional branch has one form:
+       32-bit instruction =>          beq/bne           imm14s << 1
+
+     For 32-bit case,
+     we assume it is always reachable. (but check range -16350 ~ 16350).  */
+
+  switch (code)
+    {
+    case EQ:
+    case NE:
+      output_cond_branch (code, "", r5_p, long_jump_p, operands);
+      return "";
+
+    default:
+      gcc_unreachable ();  /* Only EQ and NE are handled.  */
+    }
+}
+
+const char *
+nds32_output_cbranchsi4_equality_reg_or_const_int (rtx_insn *insn,
+						   rtx *operands)
+{
+  enum rtx_code code;
+  bool long_jump_p, r5_p;
+  int insn_length;
+
+  insn_length = get_attr_length (insn);
+
+  long_jump_p = (insn_length == 10 || insn_length == 8) ? true : false;  /* Lengths 8 and 10 are the long-jump forms.  */
+  r5_p = (insn_length == 2 || insn_length == 8) ? true : false;  /* NOTE(review): presumably the 16-bit $r5 compare form -- confirm in output_cond_branch.  */
+
+  code = GET_CODE (operands[0]);  /* rtx code of the comparison in operands[0].  */
+
+  /* This register-comparison conditional branch has one form:
+       32-bit instruction =>          beq/bne           imm14s << 1
+       32-bit instruction =>         beqc/bnec          imm8s << 1
+
+     For 32-bit case, we assume it is always reachable.
+     (but check range -16350 ~ 16350 and -250 ~ 250).  */
+
+  switch (code)
+    {
+    case EQ:
+    case NE:
+      if (which_alternative == 2)
+	{
+	  /* r, Is11 */
+	  /* b<cond>c */
+	  output_cond_branch (code, "c", r5_p, long_jump_p, operands);
+	}
+      else
+	{
+	  /* r, r */
+	  /* v, r */
+	  output_cond_branch (code, "", r5_p, long_jump_p, operands);
+	}
+      return "";
+    default:
+      gcc_unreachable ();  /* Only EQ and NE are handled.  */
+    }
+}
+
+const char *
+nds32_output_cbranchsi4_greater_less_zero (rtx_insn *insn, rtx *operands)
+{
+  enum rtx_code code;
+  bool long_jump_p;
+  int insn_length;
+
+  insn_length = get_attr_length (insn);
+
+  gcc_assert (insn_length == 4 || insn_length == 10);  /* No other length is expected here.  */
+
+  long_jump_p = (insn_length == 10) ? true : false;
+
+  code = GET_CODE (operands[0]);  /* rtx code of the comparison in operands[0].  */
+
+  /* This zero-greater-less-comparison conditional branch has one form:
+       32-bit instruction =>      bgtz/bgez/bltz/blez     imm16s << 1
+
+     For 32-bit case, we assume it is always reachable.
+     (but check range -65500 ~ 65500).  */
+
+  switch (code)
+    {
+    case GT:
+    case GE:
+    case LT:
+    case LE:
+      output_cond_branch_compare_zero (code, "", long_jump_p, operands, false);
+      break;
+    default:
+      gcc_unreachable ();  /* Only GT, GE, LT and LE are handled.  */
+    }
+  return "";
+}
+
+const char *
+nds32_output_unpkd8 (rtx output, rtx input,
+		     rtx high_idx_rtx, rtx low_idx_rtx,
+		     bool signed_p)
+{
+  char pattern[100];  /* Template for one "[sz]unpkd8<high><low>" insn.  */
+  rtx output_operands[2];
+  HOST_WIDE_INT high_idx, low_idx;
+  high_idx = INTVAL (high_idx_rtx);
+  low_idx = INTVAL (low_idx_rtx);
+
+  gcc_assert (high_idx >= 0 && high_idx <= 3);
+  gcc_assert (low_idx >= 0 && low_idx <= 3);
+
+  /* We only have 10, 20, 30 and 31; any other pair returns "#".  */
+  if ((low_idx != 0 || high_idx == 0) &&
+      !(low_idx == 1 && high_idx == 3))
+    return "#";
+
+  char sign_char = signed_p ? 's' : 'z';  /* 's' when SIGNED_P, otherwise 'z'.  */
+
+  sprintf (pattern,
+	   "%cunpkd8" HOST_WIDE_INT_PRINT_DEC HOST_WIDE_INT_PRINT_DEC "\t%%0, %%1",
+	   sign_char, high_idx, low_idx);
+  output_operands[0] = output;
+  output_operands[1] = input;
+  output_asm_insn (pattern, output_operands);
+  return "";  /* The text was already emitted above.  */
+}
+
+/* Return true if SYMBOL_REF X binds locally.  With a decl, ask the target
+   hook binds_local_p; without one, use the SYMBOL_REF_LOCAL_P flag.  */
+static bool
+nds32_symbol_binds_local_p (const_rtx x)
+{
+  return (SYMBOL_REF_DECL (x)
+	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
+	  : SYMBOL_REF_LOCAL_P (x));
+}
+
+const char *
+nds32_output_call (rtx insn, rtx *operands, rtx symbol, const char *long_call,
+		   const char *call, bool align_p)
+{
+  char pattern[100];  /* NOTE(review): filled by unbounded strcpy/strcat; assumes short templates -- confirm.  */
+  bool noreturn_p;
+
+  if (nds32_long_call_p (symbol))  /* Start from LONG_CALL or CALL.  */
+    strcpy (pattern, long_call);
+  else
+    strcpy (pattern, call);
+
+  if (flag_pic && CONSTANT_P (symbol)  /* PIC and not locally bound: add "@PLT".  */
+      && !nds32_symbol_binds_local_p (symbol))
+    strcat (pattern, "@PLT");
+
+  if (align_p)
+    strcat (pattern, "\n\t.align 2");
+
+  noreturn_p = find_reg_note (insn, REG_NORETURN, NULL_RTX) != NULL_RTX;
+
+  if (noreturn_p)  /* A call marked REG_NORETURN gets a trailing nop.  */
+    {
+      if (TARGET_16_BIT)
+	strcat (pattern, "\n\tnop16");
+      else
+	strcat (pattern, "\n\tnop");
+    }
+
+  output_asm_insn (pattern, operands);
+  return "";  /* The text was already emitted above.  */
+}
+
+bool
+nds32_need_split_sms_p (rtx in0_idx0, rtx in1_idx0,
+			rtx in0_idx1, rtx in1_idx1)
+{
+  /* smds or smdrs: both inputs use the same index in each product.  */
+  if (INTVAL (in0_idx0) == INTVAL (in1_idx0)
+      && INTVAL (in0_idx1) == INTVAL (in1_idx1)
+      && INTVAL (in0_idx0) != INTVAL (in0_idx1))
+    return false;
+
+  /* smxds: each input uses a different index in the two products.  */
+  if (INTVAL (in0_idx0) != INTVAL (in0_idx1)
+      && INTVAL (in1_idx0) != INTVAL (in1_idx1))
+    return false;
+
+  return true;  /* Neither index pattern above matched.  */
+}
+
+const char *
+nds32_output_sms (rtx in0_idx0, rtx in1_idx0,
+		  rtx in0_idx1, rtx in1_idx1)
+{
+  if (nds32_need_split_sms_p (in0_idx0, in1_idx0,
+			      in0_idx1, in1_idx1))
+    return "#";  /* Matches neither index pattern handled below.  */
+  /* out = in0[in0_idx0] * in1[in1_idx0] - in0[in0_idx1] * in1[in1_idx1] */
+
+  /* smds or smdrs.  */
+  if (INTVAL (in0_idx0) == INTVAL (in1_idx0)
+      && INTVAL (in0_idx1) == INTVAL (in1_idx1)
+      && INTVAL (in0_idx0) != INTVAL (in0_idx1))
+    {
+      if (INTVAL (in0_idx0) == 0)  /* The mnemonic choice flips with endianness.  */
+	{
+	  if (TARGET_BIG_ENDIAN)
+	    return "smds\t%0, %1, %2";
+	  else
+	    return "smdrs\t%0, %1, %2";
+	}
+      else
+	{
+	  if (TARGET_BIG_ENDIAN)
+	    return "smdrs\t%0, %1, %2";
+	  else
+	    return "smds\t%0, %1, %2";
+	}
+    }
+
+  if (INTVAL (in0_idx0) != INTVAL (in0_idx1)  /* smxds.  */
+      && INTVAL (in1_idx0) != INTVAL (in1_idx1))
+    {
+      if (INTVAL (in0_idx0) == 1)  /* The operand order flips with endianness.  */
+	{
+	  if (TARGET_BIG_ENDIAN)
+	    return "smxds\t%0, %2, %1";
+	  else
+	    return "smxds\t%0, %1, %2";
+	}
+      else
+	{
+	  if (TARGET_BIG_ENDIAN)
+	    return "smxds\t%0, %1, %2";
+	  else
+	    return "smxds\t%0, %2, %1";
+	}
+    }
+
+  gcc_unreachable ();
+  return "";
+}
+
+void
+nds32_split_sms (rtx out, rtx in0, rtx in1,
+		 rtx in0_idx0, rtx in1_idx0,
+		 rtx in0_idx1, rtx in1_idx1)
+{
+  rtx result0 = gen_reg_rtx (SImode);  /* Temporary for the first mulhisi3v result.  */
+  rtx result1 = gen_reg_rtx (SImode);  /* Temporary for the second mulhisi3v result.  */
+  emit_insn (gen_mulhisi3v (result0, in0, in1,
+			    in0_idx0, in1_idx0));
+  emit_insn (gen_mulhisi3v (result1, in0, in1,
+			    in0_idx1, in1_idx1));
+  emit_insn (gen_subsi3 (out, result0, result1));  /* out = result0 - result1.  */
+}
+
+/* Split a doubleword load/store into two single-word ones, left in operands[2..5].  */
+void
+nds32_spilt_doubleword (rtx *operands, bool load_p)
+{
+  int reg = load_p ? 0 : 1;  /* Index of the register operand.  */
+  int mem = load_p ? 1 : 0;  /* Index of the memory operand.  */
+  rtx reg_rtx = load_p ? operands[0] : operands[1];
+  rtx mem_rtx = load_p ? operands[1] : operands[0];
+  rtx low_part[2], high_part[2];
+  rtx sub_mem = XEXP (mem_rtx, 0);
+
+  /* Generate low_part and high_part register pattern.
+     i.e. register pattern like:
+     (reg:DI) -> (subreg:SI (reg:DI))
+		 (subreg:SI (reg:DI)) */
+  low_part[reg] = simplify_gen_subreg (SImode, reg_rtx, GET_MODE (reg_rtx), 0);
+  high_part[reg] = simplify_gen_subreg (SImode, reg_rtx, GET_MODE (reg_rtx), 4);
+
+  /* Generate low_part and high_part memory pattern.
+     A (post_dec) memory format will generate:
+       low_part:  lwi.bi reg, [mem], 4
+       high_part: lwi.bi reg, [mem], -12 */
+  if (GET_CODE (sub_mem) == POST_DEC)
+    {
+      /* memory format is (post_dec (reg)),
+	 so that extract (reg) from the (post_dec (reg)) pattern.  */
+      sub_mem = XEXP (sub_mem, 0);
+
+      /* generate low_part and high_part memory format:
+	   low_part:  (post_modify ((reg) (plus (reg) (const 4)))
+	   high_part: (post_modify ((reg) (plus (reg) (const -12))) */
+      low_part[mem] = gen_frame_mem (SImode,
+				     gen_rtx_POST_MODIFY (Pmode, sub_mem,
+							  gen_rtx_PLUS (Pmode,
+							  sub_mem,
+							  GEN_INT (4))));
+      high_part[mem] = gen_frame_mem (SImode,
+				      gen_rtx_POST_MODIFY (Pmode, sub_mem,
+							   gen_rtx_PLUS (Pmode,
+							   sub_mem,
+							   GEN_INT (-12))));
+    }
+  else if (GET_CODE (sub_mem) == POST_MODIFY)
+    {
+      /* Memory format is (post_modify (reg) (plus (reg) (const))),
+	 so that extract (reg) from the post_modify pattern.  */
+      rtx post_mem = XEXP (sub_mem, 0);
+
+      /* Extract (const) from the (post_modify (reg) (plus (reg) (const)))
+	 pattern.  */
+
+      rtx plus_op = XEXP (sub_mem, 1);
+      rtx post_val = XEXP (plus_op, 1);
+
+      /* Generate low_part and high_part memory format:
+	   low_part:  (post_modify ((reg) (plus (reg) (const)))
+	   high_part: ((plus (reg) (const 4))) */
+      low_part[mem] = gen_frame_mem (SImode,
+				     gen_rtx_POST_MODIFY (Pmode, post_mem,
+							  gen_rtx_PLUS (Pmode,
+							  post_mem,
+							  post_val)));
+      high_part[mem] = gen_frame_mem (SImode, plus_constant (Pmode,
+							     post_mem,
+							     4));
+    }
+  else
+    {
+      /* memory format: (symbol_ref), (const), (reg + const_int).  */
+      low_part[mem] = adjust_address (mem_rtx, SImode, 0);
+      high_part[mem] = adjust_address (mem_rtx, SImode, 4);
+    }
+
+  /* After reload completed, we have dependent issue by low part register and
+     high part memory. i.e. we cannot split a sequence
+     like:
+	load $r0, [%r1]
+     split to
+	lw  $r0, [%r0]
+	lwi $r1, [%r0 + 4]
+     swap position
+	lwi $r1, [%r0 + 4]
+	lw  $r0, [%r0]
+     For store instruction we don't have a problem.
+
+     When memory format is [post_modify], we need to emit high part instruction,
+     before low part instruction.
+     example:
+       load $r0, [%r2], post_val
+     split to
+       load $r1, [%r2 + 4]
+       load $r0, [$r2], post_val.  */
+  if ((load_p && reg_overlap_mentioned_p (low_part[0], high_part[1]))
+      || GET_CODE (sub_mem) == POST_MODIFY)
+    {
+      operands[2] = high_part[0];  /* High part goes first.  */
+      operands[3] = high_part[1];
+      operands[4] = low_part[0];
+      operands[5] = low_part[1];
+    }
+  else
+    {
+      operands[2] = low_part[0];  /* Low part goes first.  */
+      operands[3] = low_part[1];
+      operands[4] = high_part[0];
+      operands[5] = high_part[1];
+    }
+}
+
+void
+nds32_split_ashiftdi3 (rtx dst, rtx src, rtx shiftamount)
+{
+  rtx src_high_part, src_low_part;
+  rtx dst_high_part, dst_low_part;
+
+  dst_high_part = nds32_di_high_part_subreg (dst);
+  dst_low_part = nds32_di_low_part_subreg (dst);
+
+  src_high_part = nds32_di_high_part_subreg (src);
+  src_low_part = nds32_di_low_part_subreg (src);
+
+  /* Shifts of 32 bits or more need separate handling.  */
+  if (CONST_INT_P (shiftamount))
+    {
+      if (INTVAL (shiftamount) < 32)
+	{
+	  rtx ext_start;
+	  ext_start = gen_int_mode(32 - INTVAL (shiftamount), SImode);
+
+	  emit_insn (gen_wext (dst_high_part, src, ext_start));
+	  emit_insn (gen_ashlsi3 (dst_low_part, src_low_part, shiftamount));
+	}
+      else
+	{
+	  rtx new_shift_amout = gen_int_mode(INTVAL (shiftamount) - 32, SImode);
+
+	  emit_insn (gen_ashlsi3 (dst_high_part, src_low_part,
+						 new_shift_amout));
+
+	  emit_move_insn (dst_low_part, GEN_INT (0));  /* The low word becomes zero.  */
+	}
+    }
+  else
+    {
+      /* Variable amount: compute both the <32 and the >=32 results, then
+         select between them with select_reg.  */
+      rtx dst_low_part_l32, dst_high_part_l32;
+      rtx dst_low_part_g32, dst_high_part_g32;
+      rtx new_shift_amout, select_reg;
+      dst_low_part_l32 = gen_reg_rtx (SImode);
+      dst_high_part_l32 = gen_reg_rtx (SImode);
+      dst_low_part_g32 = gen_reg_rtx (SImode);
+      dst_high_part_g32 = gen_reg_rtx (SImode);
+      new_shift_amout = gen_reg_rtx (SImode);
+      select_reg = gen_reg_rtx (SImode);
+
+      rtx ext_start;
+      ext_start = gen_reg_rtx (SImode);
+
+      /*
+	 if (shiftamount < 32)
+	   dst_low_part = src_low_part << shiftamount
+	   dst_high_part = wext (src, 32 - shiftamount)
+	   # wext can't handle wext (src, 32) since it only takes rb[0:4]
+	   # for extract.
+	   dst_high_part = shiftamount == 0 ? src_high_part : dst_high_part
+	 else
+	   dst_low_part = 0
+	   dst_high_part = src_low_part << shiftamount & 0x1f
+      */
+
+      emit_insn (gen_subsi3 (ext_start,
+			     gen_int_mode (32, SImode),
+			     shiftamount));
+      emit_insn (gen_wext (dst_high_part_l32, src, ext_start));
+
+      /* Handle for shiftamount == 0.  */
+      emit_insn (gen_cmovzsi (dst_high_part_l32, shiftamount,
+			      src_high_part, dst_high_part_l32));
+
+      emit_insn (gen_ashlsi3 (dst_low_part_l32, src_low_part, shiftamount));
+
+      emit_move_insn (dst_low_part_g32, const0_rtx);
+      emit_insn (gen_andsi3 (new_shift_amout, shiftamount, GEN_INT (0x1f)));
+      emit_insn (gen_ashlsi3 (dst_high_part_g32, src_low_part,
+						 new_shift_amout));
+
+      emit_insn (gen_slt_compare (select_reg, shiftamount, GEN_INT (32)));
+
+      emit_insn (gen_cmovnsi (dst_low_part, select_reg,
+			      dst_low_part_l32, dst_low_part_g32));
+      emit_insn (gen_cmovnsi (dst_high_part, select_reg,
+			      dst_high_part_l32, dst_high_part_g32));
+    }
+}
+/* ashiftrt variant: calls nds32_split_shiftrtdi3 with last arg false.  */
+void
+nds32_split_ashiftrtdi3 (rtx dst, rtx src, rtx shiftamount)
+{
+  nds32_split_shiftrtdi3 (dst, src, shiftamount, false);
+}
+/* lshiftrt variant: calls nds32_split_shiftrtdi3 with last arg true.  */
+void
+nds32_split_lshiftrtdi3 (rtx dst, rtx src, rtx shiftamount)
+{
+  nds32_split_shiftrtdi3 (dst, src, shiftamount, true);
+}
+/* Split DImode rotate right of SRC by SHIFTAMOUNT (mod 64) into DST.  */
+void
+nds32_split_rotatertdi3 (rtx dst, rtx src, rtx shiftamount)
+{
+  rtx dst_low_part_l32, dst_high_part_l32;
+  rtx dst_low_part_g32, dst_high_part_g32;
+  rtx select_reg, low5bit, low5bit_inv, minus32sa;
+  rtx dst_low_part_g32_tmph;
+  rtx dst_low_part_g32_tmpl;
+  rtx dst_high_part_l32_tmph;
+  rtx dst_high_part_l32_tmpl;
+
+  rtx src_low_part, src_high_part;
+  rtx dst_high_part, dst_low_part;
+
+  shiftamount = force_reg (SImode, shiftamount);
+  /* Mask to 0..63 in place.  NOTE(review): may clobber the caller's reg.  */
+  emit_insn (gen_andsi3 (shiftamount,
+			 shiftamount,
+			 gen_int_mode (0x3f, SImode)));
+
+  dst_high_part = nds32_di_high_part_subreg (dst);
+  dst_low_part = nds32_di_low_part_subreg (dst);
+
+  src_high_part = nds32_di_high_part_subreg (src);
+  src_low_part = nds32_di_low_part_subreg (src);
+
+  dst_low_part_l32 = gen_reg_rtx (SImode);
+  dst_high_part_l32 = gen_reg_rtx (SImode);
+  dst_low_part_g32 = gen_reg_rtx (SImode);
+  dst_high_part_g32 = gen_reg_rtx (SImode);
+  low5bit = gen_reg_rtx (SImode);
+  low5bit_inv = gen_reg_rtx (SImode);
+  minus32sa = gen_reg_rtx (SImode);
+  select_reg = gen_reg_rtx (SImode);
+
+  dst_low_part_g32_tmph = gen_reg_rtx (SImode);
+  dst_low_part_g32_tmpl = gen_reg_rtx (SImode);
+
+  dst_high_part_l32_tmph = gen_reg_rtx (SImode);
+  dst_high_part_l32_tmpl = gen_reg_rtx (SImode);
+
+  emit_insn (gen_slt_compare (select_reg, shiftamount, GEN_INT (32)));
+
+  /* if shiftamount < 32
+       dst_low_part = wext(src, shiftamount)
+     else
+       dst_low_part = ((src_high_part >> (shiftamount & 0x1f))
+		       | (src_low_part << (32 - (shiftamount & 0x1f))))
+  */
+  emit_insn (gen_andsi3 (low5bit, shiftamount, gen_int_mode (0x1f, SImode)));
+  emit_insn (gen_subsi3 (low5bit_inv, gen_int_mode (32, SImode), low5bit));
+
+  emit_insn (gen_wext (dst_low_part_l32, src, shiftamount));
+
+  emit_insn (gen_lshrsi3 (dst_low_part_g32_tmpl, src_high_part, low5bit));
+  emit_insn (gen_ashlsi3 (dst_low_part_g32_tmph, src_low_part, low5bit_inv));
+
+  emit_insn (gen_iorsi3 (dst_low_part_g32,
+			 dst_low_part_g32_tmpl,
+			 dst_low_part_g32_tmph));
+
+  emit_insn (gen_cmovnsi (dst_low_part, select_reg,
+			  dst_low_part_l32, dst_low_part_g32));
+
+  /* if shiftamount < 32
+       dst_high_part = ((src_high_part >> shiftamount)
+			| (src_low_part << (32 - shiftamount)))
+       dst_high_part = shiftamount == 0 ? src_high_part : dst_high_part
+     else
+       dst_high_part = wext(src, shiftamount & 0x1f)
+  */
+
+  emit_insn (gen_subsi3 (minus32sa, gen_int_mode (32, SImode), shiftamount));
+
+  emit_insn (gen_lshrsi3 (dst_high_part_l32_tmpl, src_high_part, shiftamount));
+  emit_insn (gen_ashlsi3 (dst_high_part_l32_tmph, src_low_part, minus32sa));
+
+  emit_insn (gen_iorsi3 (dst_high_part_l32,
+			 dst_high_part_l32_tmpl,
+			 dst_high_part_l32_tmph));
+
+  emit_insn (gen_cmovzsi (dst_high_part_l32, shiftamount,
+			  src_high_part, dst_high_part_l32));
+
+  emit_insn (gen_wext (dst_high_part_g32, src, low5bit));
+
+  emit_insn (gen_cmovnsi (dst_high_part, select_reg,
+			  dst_high_part_l32, dst_high_part_g32));
+}
+
+/* Return true if OP is or contains a SYMBOL_REF or LABEL_REF.  */
+bool
+symbolic_reference_mentioned_p (rtx op)
+{
+  const char *fmt;
+  int i;

-  /* The v3push/v3pop instruction should only be applied on
-     none-isr and none-variadic function.  */
-  if (TARGET_V3PUSH
-      && !nds32_isr_function_p (current_function_decl)
-      && (cfun->machine->va_args_size == 0))
+  if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
+    return true;
+
+  fmt = GET_RTX_FORMAT (GET_CODE (op));
+  for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
     {
-      /* For stack v3push:
-           operands[0]: Re
-           operands[1]: imm8u */
+      if (fmt[i] == 'E')
+	{
+	  int j;

-      /* This variable is to check if 'push25 Re,imm8u' is available.  */
-      int sp_adjust;
+	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
+	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
+	      return true;
+	}

-      /* Set operands[0].  */
-      operands[0] = gen_rtx_REG (SImode, re_callee_saved);
+      else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
+	return true;
+    }

-      /* Check if we can generate 'push25 Re,imm8u',
-         otherwise, generate 'push25 Re,0'.  */
-      sp_adjust = cfun->machine->local_size
-		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
-      if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
-	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust))
-	operands[1] = GEN_INT (sp_adjust);
-      else
-	operands[1] = GEN_INT (0);
+  return false;
+}

-      /* Create assembly code pattern.  */
-      snprintf (pattern, sizeof (pattern), "push25\t%%0, %%1");
-    }
-  else
-    {
-      /* For normal stack push multiple:
-         operands[0]: Rb
-         operands[1]: Re
-         operands[2]: En4 */
+/* Legitimize PIC address X through @GOTOFF or @GOT; return the new rtx.

-      /* This variable is used to check if we only need to generate En4 field.
-         As long as Rb==Re=SP_REGNUM, we set this variable to 1.  */
-      int push_en4_only_p = 0;
+  Example for @GOTOFF:

-      /* Set operands[0] and operands[1].  */
-      operands[0] = gen_rtx_REG (SImode, rb_callee_saved);
-      operands[1] = gen_rtx_REG (SImode, re_callee_saved);
+    la $r0, symbol@GOTOFF
+      -> sethi $ta, hi20(symbol@GOTOFF)
+	 ori $ta, $ta, lo12(symbol@GOTOFF)
+	 add $r0, $ta, $gp

-      /* 'smw.adm $sp,[$sp],$sp,0' means push nothing.  */
-      if (!cfun->machine->fp_size
-	  && !cfun->machine->gp_size
-	  && !cfun->machine->lp_size
-	  && REGNO (operands[0]) == SP_REGNUM
-	  && REGNO (operands[1]) == SP_REGNUM)
+  Example for @GOT:
+
+    la $r0, symbol@GOT
+      -> sethi $ta, hi20(symbol@GOT)
+	 ori $ta, $ta, lo12(symbol@GOT)
+	 lw  $r0, [$ta + $gp]
+*/
+rtx
+nds32_legitimize_pic_address (rtx x)
+{
+  rtx addr = x;
+  rtx reg = gen_reg_rtx (Pmode);
+  rtx pat;
+
+  if (GET_CODE (x) == LABEL_REF
+      || (GET_CODE (x) == SYMBOL_REF
+	  && (CONSTANT_POOL_ADDRESS_P (x)
+	      || SYMBOL_REF_LOCAL_P (x))))
+    {
+      addr = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_GOTOFF);
+      addr = gen_rtx_CONST (SImode, addr);
+      emit_insn (gen_sethi (reg, addr));
+      emit_insn (gen_lo_sum (reg, reg, addr));
+      x = gen_rtx_PLUS (Pmode, reg, pic_offset_table_rtx);
+    }
+  else if (GET_CODE (x) == SYMBOL_REF)
+    {
+      addr = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_GOT);
+      addr = gen_rtx_CONST (SImode, addr);
+      emit_insn (gen_sethi (reg, addr));
+      emit_insn (gen_lo_sum (reg, reg, addr));
+
+      x = gen_const_mem (SImode, gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
+					       reg));
+    }
+  else if (GET_CODE (x) == CONST)
+    {
+      /* We don't split constant in expand_pic_move because GOTOFF can combine
+	 the addend with the symbol.  */
+      addr = XEXP (x, 0);
+      gcc_assert (GET_CODE (addr) == PLUS);
+
+      rtx op0 = XEXP (addr, 0);
+      rtx op1 = XEXP (addr, 1);
+
+      if ((GET_CODE (op0) == LABEL_REF
+	   || (GET_CODE (op0) == SYMBOL_REF
+	       && (CONSTANT_POOL_ADDRESS_P (op0)
+		   || SYMBOL_REF_LOCAL_P (op0))))
+	  && GET_CODE (op1) == CONST_INT)
 	{
-	  /* No need to generate instruction.  */
-	  return "";
+	  pat = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), UNSPEC_GOTOFF);
+	  pat = gen_rtx_PLUS (Pmode, pat, op1);
+	  pat = gen_rtx_CONST (Pmode, pat);
+	  emit_insn (gen_sethi (reg, pat));
+	  emit_insn (gen_lo_sum (reg, reg, pat));
+	  x = gen_rtx_PLUS (Pmode, reg, pic_offset_table_rtx);
+	}
+      else if (GET_CODE (op0) == SYMBOL_REF
+	       && GET_CODE (op1) == CONST_INT)
+	{
+	  /* This is a constant offset from a @GOT symbol reference.  */
+	  addr = gen_rtx_UNSPEC (SImode, gen_rtvec (1, op0), UNSPEC_GOT);
+	  addr = gen_rtx_CONST (SImode, addr);
+	  emit_insn (gen_sethi (reg, addr));
+	  emit_insn (gen_lo_sum (reg, reg, addr));
+	  addr = gen_const_mem (SImode, gen_rtx_PLUS (Pmode,
+						      pic_offset_table_rtx,
+						      reg));
+	  emit_move_insn (reg, addr);
+	  if (satisfies_constraint_Is15 (op1))
+	    x = gen_rtx_PLUS (Pmode, reg, op1);
+	  else
+	    {
+	      rtx tmp_reg = gen_reg_rtx (SImode);
+	      emit_insn (gen_movsi (tmp_reg, op1));
+	      x = gen_rtx_PLUS (Pmode, reg, tmp_reg);
+	    }
 	}
       else
 	{
-	  /* If Rb==Re=SP_REGNUM, we only need to generate En4 field.  */
-	  if (REGNO (operands[0]) == SP_REGNUM
-	      && REGNO (operands[1]) == SP_REGNUM)
-	    push_en4_only_p = 1;
-
-	  /* Create assembly code pattern.
-	     We need to handle the form: "Rb, Re, { $fp $gp $lp }".  */
-	  snprintf (pattern, sizeof (pattern),
-		    "push.s\t%s{%s%s%s }",
-		    push_en4_only_p ? "" : "%0, %1, ",
-		    cfun->machine->fp_size ? " $fp" : "",
-		    cfun->machine->gp_size ? " $gp" : "",
-		    cfun->machine->lp_size ? " $lp" : "");
+	  /* Unsupported CONST form: dump it and abort.  */
+	  debug_rtx (x);
+	  gcc_unreachable ();
 	}
     }
+  return x;
+}

-  /* We use output_asm_insn() to output assembly code by ourself.  */
-  output_asm_insn (pattern, operands);
-  return "";
+void
+nds32_expand_pic_move (rtx *operands)
+{
+  rtx src;
+  /* Legitimize the PIC source first, then move it into operands[0].  */
+  src = nds32_legitimize_pic_address (operands[1]);
+  emit_move_insn (operands[0], src);
 }

-/* Function to output stack pop operation.
-   We need to deal with normal stack pop multiple or stack v3pop.  */
-const char *
-nds32_output_stack_pop (rtx par_rtx ATTRIBUTE_UNUSED)
+/* Expand ICT symbol X into a sethi insn plus a LO_SUM memory reference.
+    Example for @ICT and ICT model=large:
+
+    la $r0, symbol@ICT
+      -> sethi $rt, hi20(symbol@ICT)
+	 lwi $r0, [$rt + lo12(symbol@ICT)]
+
+*/
+rtx
+nds32_legitimize_ict_address (rtx x)
 {
-  /* A string pattern for output_asm_insn().  */
-  char pattern[100];
-  /* The operands array which will be used in output_asm_insn().  */
-  rtx operands[3];
-  /* Pick up callee-saved first regno and last regno for further use.  */
-  int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno;
-  int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno;
+  rtx symbol = x;
+  rtx addr = x;
+  rtx reg = gen_reg_rtx (Pmode);
+  gcc_assert (GET_CODE (x) == SYMBOL_REF
+	      && nds32_indirect_call_referenced_p (x));

-  /* If we step here, we are going to do v3pop or multiple pop operation.  */
+  addr = gen_rtx_UNSPEC (SImode, gen_rtvec (1, symbol), UNSPEC_ICT);
+  addr = gen_rtx_CONST (SImode, addr);
+  emit_insn (gen_sethi (reg, addr));

-  /* The v3push/v3pop instruction should only be applied on
-     none-isr and none-variadic function.  */
-  if (TARGET_V3PUSH
-      && !nds32_isr_function_p (current_function_decl)
-      && (cfun->machine->va_args_size == 0))
-    {
-      /* For stack v3pop:
-           operands[0]: Re
-           operands[1]: imm8u */
+  x = gen_const_mem (SImode, gen_rtx_LO_SUM (Pmode, reg, addr));

-      /* This variable is to check if 'pop25 Re,imm8u' is available.  */
-      int sp_adjust;
+  return x;
+}

-      /* Set operands[0].  */
-      operands[0] = gen_rtx_REG (SImode, re_callee_saved);
+void
+nds32_expand_ict_move (rtx *operands)
+{
+  rtx src = operands[1];	/* Asserted to be an ICT SYMBOL_REF below.  */

-      /* Check if we can generate 'pop25 Re,imm8u',
-         otherwise, generate 'pop25 Re,0'.
-         We have to consider alloca issue as well.
-         If the function does call alloca(), the stack pointer is not fixed.
-         In that case, we cannot use 'pop25 Re,imm8u' directly.
-         We have to caculate stack pointer from frame pointer
-         and then use 'pop25 Re,0'.  */
-      sp_adjust = cfun->machine->local_size
-		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
-      if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
-	  && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
-	  && !cfun->calls_alloca)
-	operands[1] = GEN_INT (sp_adjust);
-      else
-	operands[1] = GEN_INT (0);
+  src = nds32_legitimize_ict_address (src);

-      /* Create assembly code pattern.  */
-      snprintf (pattern, sizeof (pattern), "pop25\t%%0, %%1");
+  emit_move_insn (operands[0], src);
+}
+
+/* Return true if X is a symbol whose decl has attribute indirect_call.  */
+bool
+nds32_indirect_call_referenced_p (rtx x)
+{
+  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_ICT)
+    x = XVECEXP (x, 0, 0);
+
+  if (GET_CODE (x) == SYMBOL_REF)
+    {
+      tree decl = SYMBOL_REF_DECL (x);
+
+      return decl
+	     && (lookup_attribute("indirect_call",
+				  DECL_ATTRIBUTES(decl))
+		 != NULL);
     }
+
+  return false;
+}
+
+/* Return true if SYMBOL needs a long call under the ICT or code model.  */
+bool
+nds32_long_call_p (rtx symbol)
+{
+  if (nds32_indirect_call_referenced_p (symbol))
+    return TARGET_ICT_MODEL_LARGE;
   else
-    {
-      /* For normal stack pop multiple:
-         operands[0]: Rb
-         operands[1]: Re
-         operands[2]: En4 */
+    return TARGET_CMODEL_LARGE;
+}

-      /* This variable is used to check if we only need to generate En4 field.
-         As long as Rb==Re=SP_REGNUM, we set this variable to 1.  */
-      int pop_en4_only_p = 0;
+/* Return true if X is a TLS SYMBOL_REF or a CONST PLUS whose base is one.  */
+bool
+nds32_tls_referenced_p (rtx x)
+{
+  if (!targetm.have_tls)
+   return false;

-      /* Set operands[0] and operands[1].  */
-      operands[0] = gen_rtx_REG (SImode, rb_callee_saved);
-      operands[1] = gen_rtx_REG (SImode, re_callee_saved);
+  if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS)
+    x = XEXP (XEXP (x, 0), 0);

-      /* 'lmw.bim $sp,[$sp],$sp,0' means pop nothing.  */
-      if (!cfun->machine->fp_size
-	  && !cfun->machine->gp_size
-	  && !cfun->machine->lp_size
-	  && REGNO (operands[0]) == SP_REGNUM
-	  && REGNO (operands[1]) == SP_REGNUM)
-	{
-	  /* No need to generate instruction.  */
-	  return "";
-	}
-      else
-	{
-	  /* If Rb==Re=SP_REGNUM, we only need to generate En4 field.  */
-	  if (REGNO (operands[0]) == SP_REGNUM
-	      && REGNO (operands[1]) == SP_REGNUM)
-	    pop_en4_only_p = 1;
+  if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x))
+    return true;

-	  /* Create assembly code pattern.
-	     We need to handle the form: "Rb, Re, { $fp $gp $lp }".  */
-	  snprintf (pattern, sizeof (pattern),
-		    "pop.s\t%s{%s%s%s }",
-		    pop_en4_only_p ? "" : "%0, %1, ",
-		    cfun->machine->fp_size ? " $fp" : "",
-		    cfun->machine->gp_size ? " $gp" : "",
-		    cfun->machine->lp_size ? " $lp" : "");
+  return false;
+}
+
+/* X contains a thread-local SYMBOL_REF.  Generate code to compute
+   this (thread-local) address and return it.  */
+rtx
+nds32_legitimize_tls_address (rtx x)
+{
+  rtx tmp_reg;
+  rtx tp_reg = gen_rtx_REG (Pmode, TP_REGNUM);
+  rtx pat, insns, reg0;
+
+  if (GET_CODE (x) == SYMBOL_REF)
+    switch (SYMBOL_REF_TLS_MODEL (x))
+      {
+      case TLS_MODEL_GLOBAL_DYNAMIC:
+      case TLS_MODEL_LOCAL_DYNAMIC:
+	/* Emit UNSPEC_TLS_DESC rather than expand rtl directly because spill
+	   may destroy the define-use chain analysis to insert relax_hint.  */
+	if (SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_GLOBAL_DYNAMIC)
+	  pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_TLSGD);
+	else
+	  pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_TLSLD);
+
+	pat = gen_rtx_CONST (SImode, pat);
+	reg0 = gen_rtx_REG (Pmode, 0);
+	/* If we can confirm all clobbered registers, it doesn't have to use
+	   a call instruction.  */
+	insns = emit_call_insn (gen_tls_desc (pat, GEN_INT (0)));
+	use_reg (&CALL_INSN_FUNCTION_USAGE (insns), pic_offset_table_rtx);
+	RTL_CONST_CALL_P (insns) = 1;
+	tmp_reg = gen_reg_rtx (SImode);
+	emit_move_insn (tmp_reg, reg0);
+	x = tmp_reg;
+	break;
+
+      case TLS_MODEL_INITIAL_EXEC:
+	pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_TLSIE);
+	tmp_reg  = gen_reg_rtx (SImode);
+	pat = gen_rtx_CONST (SImode, pat);
+	emit_insn (gen_tls_ie (tmp_reg, pat, GEN_INT (0)));
+	if (flag_pic)
+	  emit_use (pic_offset_table_rtx);
+	x = gen_rtx_PLUS (Pmode, tmp_reg, tp_reg);
+	break;
+
+      case TLS_MODEL_LOCAL_EXEC:
+	/* Expand symbol_ref@TPOFF':
+	     sethi $ta, hi20(symbol_ref@TPOFF)
+	     ori   $ta, $ta, lo12(symbol_ref@TPOFF)
+	     add   $r0, $ta, $tp */
+	tmp_reg  = gen_reg_rtx (SImode);
+	pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_TLSLE);
+	pat = gen_rtx_CONST (SImode, pat);
+	emit_insn (gen_sethi (tmp_reg, pat));
+	emit_insn (gen_lo_sum (tmp_reg, tmp_reg, pat));
+	x = gen_rtx_PLUS (Pmode, tmp_reg, tp_reg);
+	break;
+
+      default:
+	gcc_unreachable ();
+      }
+  else if (GET_CODE (x) == CONST)
+    {
+      rtx base, addend;
+      split_const (x, &base, &addend);
+
+      if (SYMBOL_REF_TLS_MODEL (base) == TLS_MODEL_LOCAL_EXEC)
+	{
+	  /* Expand symbol_ref@TPOFF':
+	     sethi $ta, hi20(symbol_ref@TPOFF + addend)
+	     ori   $ta, $ta, lo12(symbol_ref@TPOFF + addend)
+	     add   $r0, $ta, $tp */
+	  tmp_reg  = gen_reg_rtx (SImode);
+	  pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, base), UNSPEC_TLSLE);
+	  pat = gen_rtx_PLUS (SImode, pat, addend);
+	  pat = gen_rtx_CONST (SImode, pat);
+	  emit_insn (gen_sethi (tmp_reg, pat));
+	  emit_insn (gen_lo_sum (tmp_reg, tmp_reg, pat));
+	  x = gen_rtx_PLUS (Pmode, tmp_reg, tp_reg);
 	}
     }

-  /* We use output_asm_insn() to output assembly code by ourself.  */
-  output_asm_insn (pattern, operands);
-  return "";
+  return x;
 }

-/* Function to generate PC relative jump table.
-   Refer to nds32.md for more details.
+/* Expand a TLS move.  A local-exec source is legitimized whole; otherwise
+   the base symbol is legitimized and a nonzero addend is added after.  */
+void
+nds32_expand_tls_move (rtx *operands)
+{
+  rtx src = operands[1];
+  rtx base, addend;

-   The following is the sample for the case that diff value
-   can be presented in '.short' size.
+  /* split_const accepts any rtx, so BASE and ADDEND are always set.  */
+  split_const (src, &base, &addend);

-     addi    $r1, $r1, -(case_lower_bound)
-     slti    $ta, $r1, (case_number)
-     beqz    $ta, .L_skip_label
+  if (GET_CODE (base) == SYMBOL_REF
+      && SYMBOL_REF_TLS_MODEL (base) == TLS_MODEL_LOCAL_EXEC)
+    src = nds32_legitimize_tls_address (src);
+  else
+    {
+      src = nds32_legitimize_tls_address (base);
+      if (addend != const0_rtx)
+	src = force_operand (gen_rtx_PLUS (SImode, src, addend), operands[0]);
+    }

-     la      $ta, .L35             ! get jump table address
-     lh      $r1, [$ta + $r1 << 1] ! load symbol diff from jump table entry
-     addi    $ta, $r1, $ta
-     jr5     $ta
+  emit_move_insn (operands[0], src);
+}

-     ! jump table entry
-   L35:
-     .short  .L25-.L35
-     .short  .L26-.L35
-     .short  .L27-.L35
-     .short  .L28-.L35
-     .short  .L29-.L35
-     .short  .L30-.L35
-     .short  .L31-.L35
-     .short  .L32-.L35
-     .short  .L33-.L35
-     .short  .L34-.L35 */
-const char *
-nds32_output_casesi_pc_relative (rtx *operands)
+/* Emit insns for TARGET = SOURCE & VAL.  Some masks of contiguous low or
+   high ones use a shift pair; others load VAL and emit a plain AND.  */
+void
+nds32_expand_constant (enum machine_mode mode, HOST_WIDE_INT val,
+		       rtx target, rtx source)
 {
-  machine_mode mode;
-  rtx diff_vec;
+  rtx temp = gen_reg_rtx (mode);
+  unsigned HOST_WIDE_INT remainder = val & 0xffffffffUL;
+
+  /* __builtin_clz/ctz are undefined for 0, so count an empty mask as 32.  */
+  int clear_sign_bit_copies = remainder ? __builtin_clz (remainder) : 32;
+  int clear_zero_bit_copies = remainder ? __builtin_ctz (remainder) : 32;
+
+  HOST_WIDE_INT sign_shift_mask
+    = (((unsigned HOST_WIDE_INT) 0xffffffffUL
+	<< (32 - clear_sign_bit_copies)) & 0xffffffffUL);
+  HOST_WIDE_INT zero_shift_mask
+    = ((HOST_WIDE_INT) 1 << clear_zero_bit_copies) - 1;
+
+  if (clear_sign_bit_copies > 0 && clear_sign_bit_copies < 17
+      && (remainder | sign_shift_mask) == 0xffffffffUL)
+    {
+      /* Turn the AND into two shifts, for example:
+	 a = b & 0x7fffffff => (b << 1) >> 1.  */
+      rtx shift = GEN_INT (clear_sign_bit_copies);

-  diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[1])));
+      emit_insn (gen_ashlsi3 (temp, source, shift));
+      emit_insn (gen_lshrsi3 (target, temp, shift));
+    }
+  else if (clear_zero_bit_copies > 0 && clear_sign_bit_copies < 17
+	   && (remainder | zero_shift_mask) == 0xffffffffUL)
+    {
+      /* Turn the AND into two shifts, for example:
+	 a = b & 0xfff00000 => (b >> 20) << 20.  */
+      rtx shift = GEN_INT (clear_zero_bit_copies);

-  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
+      emit_insn (gen_lshrsi3 (temp, source, shift));
+      emit_insn (gen_ashlsi3 (target, temp, shift));
+    }
+  else
+    {
+      emit_move_insn (temp, GEN_INT (val));
+      emit_move_insn (target, gen_rtx_fmt_ee (AND, mode, source, temp));
+    }
+}

-  /* Step C: "t <-- operands[1]".  */
-  output_asm_insn ("la\t$ta, %l1", operands);
+/* lmw/smw helper: true if OP is a MEM based on a REG or POST_INC of REG.  */
+bool
+nds32_valid_smw_lwm_base_p (rtx op)
+{
+  rtx base_addr;

-  /* Get the mode of each element in the difference vector.  */
-  mode = GET_MODE (diff_vec);
+  if (!MEM_P (op))
+    return false;

-  /* Step D: "z <-- (mem (plus (operands[0] << m) t))",
-     where m is 0, 1, or 2 to load address-diff value from table.  */
-  switch (mode)
+  base_addr = XEXP (op, 0);
+
+  if (REG_P (base_addr))
+    return true;
+  else
     {
-    case QImode:
-      output_asm_insn ("lb\t%2, [$ta + %0 << 0]", operands);
-      break;
-    case HImode:
-      output_asm_insn ("lh\t%2, [$ta + %0 << 1]", operands);
-      break;
-    case SImode:
-      output_asm_insn ("lw\t%2, [$ta + %0 << 2]", operands);
-      break;
-    default:
-      gcc_unreachable ();
+      if (GET_CODE (base_addr) == POST_INC
+	  && REG_P (XEXP (base_addr, 0)))
+	return true;
     }

-  /* Step E: "t <-- z + t".
-     Add table label_ref with address-diff value to
-     obtain target case address.  */
-  output_asm_insn ("add\t$ta, %2, $ta", operands);
+  return false;
+}

-  /* Step F: jump to target with register t.  */
-  if (TARGET_16_BIT)
-    return "jr5\t$ta";
-  else
-    return "jr\t$ta";
+/* DImode helpers.  Return the SImode high-part subreg of DImode REG.  */
+rtx nds32_di_high_part_subreg(rtx reg)
+{
+  unsigned high_part_offset = subreg_highpart_offset (SImode, DImode);
+
+  return simplify_gen_subreg (
+	   SImode, reg,
+	   DImode, high_part_offset);
 }

-/* Function to generate normal jump table.  */
-const char *
-nds32_output_casesi (rtx *operands)
+rtx nds32_di_low_part_subreg(rtx reg)
 {
-  /* Step C: "t <-- operands[1]".  */
-  output_asm_insn ("la\t$ta, %l1", operands);
+  unsigned low_part_offset = subreg_lowpart_offset (SImode, DImode);

-  /* Step D: "z <-- (mem (plus (operands[0] << 2) t))".  */
-  output_asm_insn ("lw\t%2, [$ta + %0 << 2]", operands);
+  return simplify_gen_subreg (
+	   SImode, reg,
+	   DImode, low_part_offset);	/* SImode low part of DImode REG.  */
+}

-  /* No need to perform Step E, which is only used for
-     pc relative jump table.  */
+/* ------------------------------------------------------------------------ */

-  /* Step F: jump to target with register z.  */
-  if (TARGET_16_BIT)
-    return "jr5\t%2";
+/* Auxiliary function for output TLS patterns.  Output the sethi/ori/lw/
+   add/jral sequence, with .relax_hint lines when TARGET_RELAX_HINT.  */
+const char *
+nds32_output_tls_desc (rtx *operands)
+{
+  char pattern[1000];
+
+  if (TARGET_RELAX_HINT)
+    snprintf (pattern, sizeof (pattern),
+	      ".relax_hint %%1\n\tsethi $r0, hi20(%%0)\n\t"
+	      ".relax_hint %%1\n\tori $r0, $r0, lo12(%%0)\n\t"
+	      ".relax_hint %%1\n\tlw $r15, [$r0 + $gp]\n\t"
+	      ".relax_hint %%1\n\tadd $r0, $r0, $gp\n\t"
+	      ".relax_hint %%1\n\tjral $r15");
   else
-    return "jr\t%2";
+    snprintf (pattern, sizeof (pattern),
+	      "sethi $r0, hi20(%%0)\n\t"
+	      "ori $r0, $r0, lo12(%%0)\n\t"
+	      "lw $r15, [$r0 + $gp]\n\t"
+	      "add $r0, $r0, $gp\n\t"
+	      "jral $r15");
+  output_asm_insn (pattern, operands);
+  return "";
 }

-/* ------------------------------------------------------------------------ */
+const char *
+nds32_output_tls_ie (rtx *operands)
+{
+  char pattern[1000];
+  /* PIC loads through $gp; non-PIC folds lo12 into the lwi offset.  */
+  if (flag_pic)
+  {
+      if (TARGET_RELAX_HINT)
+	snprintf (pattern, sizeof (pattern),
+		  ".relax_hint %%2\n\tsethi %%0, hi20(%%1)\n\t"
+		  ".relax_hint %%2\n\tori %%0, %%0, lo12(%%1)\n\t"
+		  ".relax_hint %%2\n\tlw %%0, [%%0 + $gp]");
+      else
+	snprintf (pattern, sizeof (pattern),
+		  "sethi %%0, hi20(%%1)\n\t"
+		  "ori %%0, %%0, lo12(%%1)\n\t"
+		  "lw %%0, [%%0 + $gp]");
+  }
+  else
+    {
+      if (TARGET_RELAX_HINT)
+	snprintf (pattern, sizeof (pattern),
+		  ".relax_hint %%2\n\tsethi %%0, hi20(%%1)\n\t"
+		  ".relax_hint %%2\n\tlwi %%0, [%%0 + lo12(%%1)]");
+      else
+	snprintf (pattern, sizeof (pattern),
+		  "sethi %%0, hi20(%%1)\n\t"
+		  "lwi %%0, [%%0 + lo12(%%1)]");
+    }
+  output_asm_insn (pattern, operands);
+  return "";
+}
diff --git a/gcc/config/nds32/nds32-memory-manipulation.c b/gcc/config/nds32/nds32-memory-manipulation.c
index 4c26dcc..c46ac8f 100644
--- a/gcc/config/nds32/nds32-memory-manipulation.c
+++ b/gcc/config/nds32/nds32-memory-manipulation.c
@@ -25,28 +25,1255 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
+#include "tree.h"
 #include "rtl.h"
-#include "emit-rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
 #include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+
+/* ------------------------------------------------------------------------ */
+
+/* This file is divided into six parts:
+
+     PART 1: Auxiliary static function definitions.
+
+     PART 2: Auxiliary function for expand movmem pattern.
+
+     PART 3: Auxiliary function for expand setmem pattern.
+
+     PART 4: Auxiliary function for expand movstr pattern.
+
+     PART 5: Auxiliary function for expand strlen pattern.
+
+     PART 6: Auxiliary function for expand load_multiple/store_multiple
+	     pattern.  */
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 1: Auxiliary static function definitions.  */
+
+/* Move between REG and the MODE-sized slot of MEM at byte OFFSET:
+   load into REG when LOAD_P, otherwise store REG.  */
+static void
+nds32_emit_load_store (rtx reg, rtx mem, enum machine_mode mode,
+		       int offset, bool load_p)
+{
+  rtx slot = adjust_address (mem, mode, offset);
+  if (!load_p)
+    emit_move_insn (slot, reg);
+  else
+    emit_move_insn (reg, slot);
+}
+
+/* Emit a MODE-sized load into REG (LOAD_P) or store from REG through
+   BASE_REG, then advance BASE_REG by the size of MODE.  */
+static void
+nds32_emit_post_inc_load_store (rtx reg, rtx base_reg,
+				enum machine_mode mode, bool load_p)
+{
+  rtx mem, next_addr;
+
+  gcc_assert (GET_MODE (reg) == mode);
+  gcc_assert (GET_MODE (base_reg) == Pmode);
+
+  /* Emit a plain access followed by a separate add rather than a post_inc
+     address, which may not be recognized; the auto_inc_dec pass is expected
+     to combine the two.  */
+  mem = gen_rtx_MEM (mode, base_reg);
+  if (load_p)
+    emit_move_insn (reg, mem);
+  else
+    emit_move_insn (mem, reg);
+  next_addr = plus_constant (Pmode, base_reg, GET_MODE_SIZE (mode));
+  emit_move_insn (base_reg, next_addr);
+}
+
+/* Copy one MODE-sized unit at byte ADDR_OFFSET from SRC to DST (both
+   MEMs) by way of a fresh pseudo register.  */
+static void
+nds32_emit_mem_move (rtx src, rtx dst, enum machine_mode mode,
+		     int addr_offset)
+{
+  gcc_assert (MEM_P (src) && MEM_P (dst));
+
+  rtx scratch = gen_reg_rtx (mode);
+  nds32_emit_load_store (scratch, src, mode, addr_offset, /* load_p */ true);
+  nds32_emit_load_store (scratch, dst, mode, addr_offset, /* load_p */ false);
+}
+
+/* Copy through COUNT registers starting at BASE_REGNO: one load-multiple
+   from *SRC_MEM, then one store-multiple to *DST_MEM.  When
+   UPDATE_BASE_REG_P, each base register and MEM is replaced by the new base.  */
+static void
+nds32_emit_mem_move_block (int base_regno, int count,
+			   rtx *dst_base_reg, rtx *dst_mem, rtx *src_base_reg,
+			   rtx *src_mem, bool update_base_reg_p)
+{
+  rtx updated_base;
+  emit_insn (nds32_expand_load_multiple (base_regno, count,
+					 *src_base_reg, *src_mem,
+					 update_base_reg_p, &updated_base));
+  if (update_base_reg_p)
+    {
+      *src_base_reg = updated_base;
+      *src_mem = gen_rtx_MEM (SImode, updated_base);
+    }
+
+  emit_insn (nds32_expand_store_multiple (base_regno, count,
+					  *dst_base_reg, *dst_mem,
+					  update_base_reg_p, &updated_base));
+  if (update_base_reg_p)
+    {
+      *dst_base_reg = updated_base;
+      *dst_mem = gen_rtx_MEM (SImode, updated_base);
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 2: Auxiliary function for expand movmem pattern.  */
+
+/* Emit a movmem loop for SIZE bytes: 8 bytes per iteration while possible,
+   then one byte per iteration.  With USE_ZOL_P and TARGET_HWLOOP the 8-byte
+   loop is emitted as a hardware loop.  Return false for unaligned V3M.  */
+static bool
+nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem,
+					 rtx size,
+					 rtx alignment, bool use_zol_p)
+{
+  /* Emit loop version of movmem.
+       andi    $size_least_3_bit, $size, #~7
+       add     $dst_end, $dst, $size
+       move    $dst_itr, $dst
+       move    $src_itr, $src
+       beqz    $size_least_3_bit, .Lbyte_mode_entry ! Not large enough.
+       add     $double_word_end, $dst, $size_least_3_bit
+
+     .Ldouble_word_mode_loop:
+       lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+       smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr
+       ! The moves will be deleted after register allocation.
+       move    $src_itr, $src_itr'
+       move    $dst_itr, $dst_itr'
+       ! Upper bound not reached.  Loop.
+       bne     $double_word_end, $dst_itr, .Ldouble_word_mode_loop
+
+     .Lbyte_mode_entry:
+       beq     $dst_itr, $dst_end, .Lend_label
+     .Lbyte_mode_loop:
+       lbi.bi  $tmp, [$src_itr], #1
+       sbi.bi  $tmp, [$dst_itr], #1
+       ! Upper bound not reached.  Loop.
+       bne     $dst_itr, $dst_end, .Lbyte_mode_loop
+     .Lend_label:
+  */
+  rtx dst_base_reg, src_base_reg;
+  rtx dst_itr, src_itr;
+  rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m;
+  rtx dst_end;
+  rtx size_least_3_bit;
+  rtx double_word_end = NULL;
+  rtx double_word_mode_loop, byte_mode_entry, byte_mode_loop, end_label;
+  rtx tmp;
+  rtx mask_least_3_bit;
+  int start_regno;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+  int hwloop_id = cfun->machine->hwloop_group_id;
+
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return 0;
+
+  if (TARGET_REDUCED_REGS)
+    start_regno = 2;
+  else
+    start_regno = 16;
+
+  dst_itr = gen_reg_rtx (Pmode);
+  src_itr = gen_reg_rtx (Pmode);
+  dst_end = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (QImode);
+  mask_least_3_bit = GEN_INT (~7);
+
+  double_word_mode_loop = gen_label_rtx ();
+  byte_mode_entry = gen_label_rtx ();
+  byte_mode_loop = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0));
+  /* andi   $size_least_3_bit, $size, #~7 */
+  size_least_3_bit = expand_binop (SImode, and_optab, size, mask_least_3_bit,
+				   NULL_RTX, 0, OPTAB_WIDEN);
+  /* add     $dst_end, $dst, $size */
+  dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+			  NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* move    $dst_itr, $dst
+     move    $src_itr, $src */
+  emit_move_insn (dst_itr, dst_base_reg);
+  emit_move_insn (src_itr, src_base_reg);
+
+  /* beqz    $size_least_3_bit, .Lbyte_mode_entry ! Not large enough. */
+  emit_cmp_and_jump_insns (size_least_3_bit, const0_rtx, EQ, NULL,
+			   SImode, 1, byte_mode_entry);
+  if (TARGET_HWLOOP && use_zol_p)
+    {
+      rtx start_label = gen_rtx_LABEL_REF (Pmode, double_word_mode_loop);
+      /* Each multiple-load/store pair moves 8 bytes, so divide the byte
+	 count by 8 to get the iteration count:
+	 srli $size_least_3_bit, $size_least_3_bit, 3.  */
+      emit_insn (gen_lshrsi3 (size_least_3_bit, size_least_3_bit, GEN_INT (3)));
+      /* mtlbi .Ldouble_word_mode_loop */
+      emit_insn (gen_mtlbi_hint (start_label, GEN_INT (hwloop_id)));
+      emit_insn (gen_init_lc (size_least_3_bit, GEN_INT (hwloop_id)));
+      emit_insn (gen_no_hwloop ());
+    }
+  else
+    {
+      /* add     $double_word_end, $dst, $size_least_3_bit */
+      double_word_end = expand_binop (Pmode, add_optab,
+				      dst_base_reg, size_least_3_bit,
+				      NULL_RTX, 0, OPTAB_WIDEN);
+    }
+
+  /* .Ldouble_word_mode_loop: */
+  emit_label (double_word_mode_loop);
+  /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+     smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */
+  src_itr_m = src_itr;
+  dst_itr_m = dst_itr;
+  srcmem_m = srcmem;
+  dstmem_m = dstmem;
+  nds32_emit_mem_move_block (start_regno, 2,
+			     &dst_itr_m, &dstmem_m,
+			     &src_itr_m, &srcmem_m,
+			     true);
+  /* move    $src_itr, $src_itr'
+     move    $dst_itr, $dst_itr' */
+  emit_move_insn (dst_itr, dst_itr_m);
+  emit_move_insn (src_itr, src_itr_m);
+
+  if (TARGET_HWLOOP && use_zol_p)
+    {
+      rtx start_label = gen_rtx_LABEL_REF (Pmode, double_word_mode_loop);
+      /* Hwloop pseudo instruction to handle CFG.  */
+      rtx cfg_insn = emit_jump_insn (gen_hwloop_cfg (GEN_INT (hwloop_id),
+				     start_label));
+      JUMP_LABEL (cfg_insn) = double_word_mode_loop;
+      cfun->machine->hwloop_group_id++;
+    }
+  else
+    {
+      /* ! Upper bound not reached.  Loop.
+	 bne     $double_word_end, $dst_itr, .Ldouble_word_mode_loop */
+      emit_cmp_and_jump_insns (double_word_end, dst_itr, NE, NULL,
+			       Pmode, 1, double_word_mode_loop);
+    }
+
+  /* .Lbyte_mode_entry: */
+  emit_label (byte_mode_entry);
+  /* beq     $dst_itr, $dst_end, .Lend_label */
+  emit_cmp_and_jump_insns (dst_itr, dst_end, EQ, NULL,
+			   Pmode, 1, end_label);
+  /* .Lbyte_mode_loop: */
+  emit_label (byte_mode_loop);
+
+  emit_insn (gen_no_hwloop ());
+  /* lbi.bi  $tmp, [$src_itr], #1 */
+  nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true);
+  /* sbi.bi  $tmp, [$dst_itr], #1 */
+  nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false);
+  /* ! Upper bound not reached.  Loop.
+     bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+  emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL,
+			   SImode, 1, byte_mode_loop);
+
+  /* .Lend_label: */
+  emit_label (end_label);
+
+  return true;
+}
+
+/* Emit a movmem loop for a constant SIZE: a byte loop below 8 bytes, an
+   8-byte-per-iteration loop for multiples of 8, and otherwise the generic
+   loop with use_zol_p set.  Return false for unaligned V3M.  */
+static bool
+nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem,
+				       rtx size, rtx alignment)
+{
+  rtx dst_base_reg, src_base_reg;
+  rtx dst_itr, src_itr;
+  rtx dstmem_m, srcmem_m, dst_itr_m, src_itr_m;
+  rtx dst_end;
+  rtx double_word_mode_loop, byte_mode_loop;
+  rtx tmp;
+  int start_regno;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+  int hwloop_id = cfun->machine->hwloop_group_id;
+  unsigned HOST_WIDE_INT total_bytes = UINTVAL (size);
+
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return 0;
+
+  if (TARGET_REDUCED_REGS)
+    start_regno = 2;
+  else
+    start_regno = 16;
+
+  dst_itr = gen_reg_rtx (Pmode);
+  src_itr = gen_reg_rtx (Pmode);
+  dst_end = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (QImode);
+
+  double_word_mode_loop = gen_label_rtx ();
+  byte_mode_loop = gen_label_rtx ();
+
+  dst_base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (Pmode, XEXP (srcmem, 0));
+
+  if (total_bytes < 8)
+    {
+      /* Emit total_bytes less than 8 loop version of movmem.
+	add     $dst_end, $dst, $size
+	move    $dst_itr, $dst
+	move    $src_itr, $src
+	.Lbyte_mode_loop:
+	lbi.bi  $tmp, [$src_itr], #1
+	sbi.bi  $tmp, [$dst_itr], #1
+	! Upper bound not reached.  Loop.
+	bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+      /* add     $dst_end, $dst, $size */
+      dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+			      NULL_RTX, 0, OPTAB_WIDEN);
+      /* move    $dst_itr, $dst
+	 move    $src_itr, $src */
+      emit_move_insn (dst_itr, dst_base_reg);
+      emit_move_insn (src_itr, src_base_reg);
+
+      /* .Lbyte_mode_loop: */
+      emit_label (byte_mode_loop);
+      emit_insn (gen_no_hwloop ());
+      /* lbi.bi  $tmp, [$src_itr], #1 */
+      nds32_emit_post_inc_load_store (tmp, src_itr, QImode, true);
+      /* sbi.bi  $tmp, [$dst_itr], #1 */
+      nds32_emit_post_inc_load_store (tmp, dst_itr, QImode, false);
+      /* ! Upper bound not reached.  Loop.
+	 bne     $dst_itr, $dst_end, .Lbyte_mode_loop */
+      emit_cmp_and_jump_insns (dst_itr, dst_end, NE, NULL,
+			       SImode, 1, byte_mode_loop);
+      return true;
+    }
+  else if (total_bytes % 8 == 0)
+    {
+      /* Emit multiple of 8 loop version of movmem.
+	 add     $dst_end, $dst, $size
+	 move    $dst_itr, $dst
+	 move    $src_itr, $src
+
+	.Ldouble_word_mode_loop:
+	lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+	smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr
+	! The moves will be deleted after register allocation.
+	move    $src_itr, $src_itr'
+	move    $dst_itr, $dst_itr'
+	! Upper bound not reached.  Loop.
+	bne     $dst_end, $dst_itr, .Ldouble_word_mode_loop */
+
+      if (TARGET_HWLOOP)
+	{
+	  rtx start_label = gen_rtx_LABEL_REF (Pmode, double_word_mode_loop);
+
+	  rtx loop_count_reg = gen_reg_rtx (Pmode);
+	  /* movi $loop_count_reg, total_bytes / 8 */
+	  emit_move_insn (loop_count_reg, GEN_INT (total_bytes / 8));
+	  /* mtlbi .Ldouble_word_mode_loop */
+	  emit_insn (gen_mtlbi_hint (start_label, GEN_INT (hwloop_id)));
+	  /* mtusr  $loop_count_reg, LC */
+	  emit_insn (gen_init_lc (loop_count_reg, GEN_INT (hwloop_id)));
+	  emit_insn (gen_no_hwloop ());
+	}
+      else
+	{
+	  /* add     $dst_end, $dst, $size */
+	  dst_end = expand_binop (Pmode, add_optab, dst_base_reg, size,
+				  NULL_RTX, 0, OPTAB_WIDEN);
+	}
+
+      /* move    $dst_itr, $dst
+	 move    $src_itr, $src */
+      emit_move_insn (dst_itr, dst_base_reg);
+      emit_move_insn (src_itr, src_base_reg);
+
+      /* .Ldouble_word_mode_loop: */
+      emit_label (double_word_mode_loop);
+      /* lmw.bim $tmp-begin, [$src_itr], $tmp-end, #0 ! $src_itr' = $src_itr
+	 smw.bim $tmp-begin, [$dst_itr], $tmp-end, #0 ! $dst_itr' = $dst_itr */
+      src_itr_m = src_itr;
+      dst_itr_m = dst_itr;
+      srcmem_m = srcmem;
+      dstmem_m = dstmem;
+      nds32_emit_mem_move_block (start_regno, 2,
+				 &dst_itr_m, &dstmem_m,
+				 &src_itr_m, &srcmem_m,
+				 true);
+      /* move    $src_itr, $src_itr'
+	 move    $dst_itr, $dst_itr' */
+      emit_move_insn (dst_itr, dst_itr_m);
+      emit_move_insn (src_itr, src_itr_m);
+
+      if (TARGET_HWLOOP)
+	{
+	  rtx start_label = gen_rtx_LABEL_REF (Pmode, double_word_mode_loop);
+	  /* Hwloop pseudo instruction to handle CFG.  */
+	  rtx cfg_insn = emit_jump_insn (gen_hwloop_cfg (GEN_INT (hwloop_id),
+					 start_label));
+	  JUMP_LABEL (cfg_insn) = double_word_mode_loop;
+	  cfun->machine->hwloop_group_id++;
+	}
+      else
+	{
+	  /* ! Upper bound not reached.  Loop.
+	     bne     $dst_end, $dst_itr, .Ldouble_word_mode_loop */
+	  emit_cmp_and_jump_insns (dst_end, dst_itr, NE, NULL,
+				   Pmode, 1, double_word_mode_loop);
+	}
+    }
+  else
+    {
+      /* Handle size greater than 8, and not a multiple of 8.  */
+      return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem,
+						      size, alignment,
+						      true);
+    }
+
+  return true;
+}
+
+/* Pick the movmem loop expander: constant SIZE vs. run-time SIZE.  */
+static bool
+nds32_expand_movmemsi_loop (rtx dstmem, rtx srcmem,
+			    rtx size, rtx alignment)
+{
+  if (!CONST_INT_P (size))
+    return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, size,
+						    alignment, false);
+  return nds32_expand_movmemsi_loop_known_size (dstmem, srcmem,
+						size, alignment);
+}
+
+/* Expand a constant-size block move as unrolled load-multiple/store-multiple
+   pairs plus a short word/halfword/byte tail.  Return false (emitting
+   nothing) when the conditions checked below do not hold.  */
+static bool
+nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem,
+			      rtx total_bytes, rtx alignment)
+{
+  rtx dst_base_reg, src_base_reg;
+  rtx tmp_reg;
+  int maximum_bytes;
+  int maximum_bytes_per_inst;
+  int maximum_regs;
+  int start_regno;
+  int i, inst_num;
+  HOST_WIDE_INT remain_bytes, remain_words;
+  bool align_to_4_bytes = (INTVAL (alignment) & 3) == 0;
+  bool align_to_2_bytes = (INTVAL (alignment) & 1) == 0;
+
+  /* The reduced register set has few registers (r0~r5, r6~r10, r15,
+     r28~r31, where 'r15' and 'r28~r31' cannot be used for register
+     allocation), so using 8 registers (32 bytes) to move a memory block
+     may easily consume all of them and make register allocation/spilling
+     hard.  So we only allow maximum=4 registers (16 bytes) per instruction
+     when moving a memory block under reduced-set registers.  */
+  if (TARGET_REDUCED_REGS)
+    {
+      maximum_regs  = 4;
+      maximum_bytes = 64;
+      start_regno   = 2;
+    }
+  else
+    {
+      if (TARGET_LINUX_ABI)
+	{
+	  /* $r25 is $tp so we use up to 8 registers if using Linux ABI.  */
+	  maximum_regs  = 8;
+	  maximum_bytes = 160;
+	  start_regno   = 16;
+	}
+      else
+	{
+	  maximum_regs  = 10;
+	  maximum_bytes = 160;
+	  start_regno   = 16;
+	}
+    }
+  maximum_bytes_per_inst = maximum_regs * UNITS_PER_WORD;
+
+  /* 1. Total_bytes is integer for sure.
+     2. Alignment is integer for sure.
+     3. Total_bytes must not exceed maximum_bytes: 64 bytes with reduced
+	registers (4 regs * 4 bytes * 4 insns), 160 bytes otherwise.
+     4. The dstmem cannot be volatile memory access.
+     5. The srcmem cannot be volatile memory access.
+     6. With v3m the known shared alignment must be a multiple of 4 bytes,
+	since lmw/smw do *NOT* support unaligned access on v3m.  */
+  if (GET_CODE (total_bytes) != CONST_INT
+      || GET_CODE (alignment) != CONST_INT
+      || INTVAL (total_bytes) > maximum_bytes
+      || MEM_VOLATILE_P (dstmem)
+      || MEM_VOLATILE_P (srcmem)
+      || (TARGET_ISA_V3M && !align_to_4_bytes))
+    return false;
+
+  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0));
+  remain_bytes = INTVAL (total_bytes);
+
+  /* Do not update base address for last lmw/smw pair.  */
+  inst_num = ((INTVAL (total_bytes) + (maximum_bytes_per_inst - 1))
+	      / maximum_bytes_per_inst) - 1;
+
+  for (i = 0; i < inst_num; i++)
+    {
+      nds32_emit_mem_move_block (start_regno, maximum_regs,
+				 &dst_base_reg, &dstmem,
+				 &src_base_reg, &srcmem,
+				 true);
+    }
+  remain_bytes -= maximum_bytes_per_inst * inst_num;
+  remain_words = remain_bytes / UNITS_PER_WORD;
+  remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD);
+
+  if (remain_words != 0)
+    {
+      if (remain_bytes != 0)
+	nds32_emit_mem_move_block (start_regno, remain_words,
+				   &dst_base_reg, &dstmem,
+				   &src_base_reg, &srcmem,
+				   true);
+      else
+	{
+	  /* Do not update address if no further byte to move.  */
+	  if (remain_words == 1)
+	   {
+	      /* Emit a plain move if aligned to 4 bytes and only 1
+		 word to move.  */
+	      if (align_to_4_bytes)
+		nds32_emit_mem_move (srcmem, dstmem, SImode, 0);
+	      else
+		{
+		  tmp_reg = gen_reg_rtx (SImode);
+		  emit_insn (
+		    gen_unaligned_load_w (tmp_reg,
+					  gen_rtx_MEM (SImode, src_base_reg)));
+		  emit_insn (
+		    gen_unaligned_store_w (gen_rtx_MEM (SImode, dst_base_reg),
+					   tmp_reg));
+		}
+	    }
+	  else
+	    nds32_emit_mem_move_block (start_regno, remain_words,
+				       &dst_base_reg, &dstmem,
+				       &src_base_reg, &srcmem,
+				       false);
+	}
+    }
+
+  switch (remain_bytes)
+    {
+    case 3:
+    case 2:
+      {
+	if (align_to_2_bytes)
+	  nds32_emit_mem_move (srcmem, dstmem, HImode, 0);
+	else
+	  {
+	    nds32_emit_mem_move (srcmem, dstmem, QImode, 0);
+	    nds32_emit_mem_move (srcmem, dstmem, QImode, 1);
+	  }
+
+	if (remain_bytes == 3)
+	  nds32_emit_mem_move (srcmem, dstmem, QImode, 2);
+	break;
+      }
+    case 1:
+      nds32_emit_mem_move (srcmem, dstmem, QImode, 0);
+      break;
+    case 0:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Successfully create patterns, return true.  */
+  return true;
+}
+
+/* Expand a block move from SRCMEM to DSTMEM for the movmem pattern (see
+   nds32-multiple.md).  The unrolled load_multiple/store_multiple form is
+   tried first; the loop form is only used when optimizing above -O2 and
+   not for size.  Return true if one of them created the patterns.  */
+bool
+nds32_expand_movmemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment)
+{
+  if (nds32_expand_movmemsi_unroll (dstmem, srcmem, total_bytes, alignment))
+    return true;
+
+  if (optimize_size || optimize <= 2)
+    return false;
+
+  return nds32_expand_movmemsi_loop (dstmem, srcmem, total_bytes, alignment);
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 3: Auxiliary function for expand setmem pattern.  */
+
+/* Emit insns setting VALUE4WORD (SImode) to the low byte of VALUE (a QImode
+   rtx or a CONST_INT) replicated in all four bytes; return VALUE4WORD.  */
+static rtx
+nds32_gen_dup_4_byte_to_word_value_aux (rtx value, rtx value4word)
+{
+  gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value));
+
+  if (CONST_INT_P (value))
+    {
+      unsigned HOST_WIDE_INT val = UINTVAL (value) & GET_MODE_MASK(QImode);
+      rtx new_val = gen_int_mode (val | (val << 8)
+				  | (val << 16) | (val << 24), SImode);
+      /* Just calculate at here if it's constant value.  */
+      emit_move_insn (value4word, new_val);
+    }
+  else
+    {
+      if (NDS32_EXT_DSP_P ())
+	{
+	  /* ! prepare word
+	     insb    $tmp, $tmp, 1              ! $tmp  <- 0x0000abab
+	     pkbb16  $value4word, $tmp, $tmp  ! $value4word  <- 0xabababab */
+	  rtx tmp = gen_reg_rtx (SImode);
+
+	  convert_move (tmp, value, true);
+	  emit_insn (
+	    gen_insvsi_internal (tmp, gen_int_mode (0x8, SImode), tmp));
+	  emit_insn (gen_pkbbsi_1 (value4word, tmp, tmp));
+	}
+      else
+	{
+	  /* ! prepare word
+	     andi    $tmp1, $value, 0xff       ! $tmp1  <- 0x000000ab
+	     slli    $tmp2, $tmp1, 8           ! $tmp2  <- 0x0000ab00
+	     or      $tmp3, $tmp1, $tmp2       ! $tmp3  <- 0x0000abab
+	     slli    $tmp4, $tmp3, 16          ! $tmp4  <- 0xabab0000
+	     or      $val4word, $tmp3, $tmp4   ! $value4word  <- 0xabababab  */
+
+	  rtx tmp1, tmp2, tmp3, tmp4;
+	  tmp1 = expand_binop (SImode, and_optab, value,
+			       gen_int_mode (0xff, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp2 = expand_binop (SImode, ashl_optab, tmp1,
+			       gen_int_mode (8, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp3 = expand_binop (SImode, ior_optab, tmp1, tmp2,
+			       NULL_RTX, 0, OPTAB_WIDEN);
+	  tmp4 = expand_binop (SImode, ashl_optab, tmp3,
+			       gen_int_mode (16, SImode),
+			       NULL_RTX, 0, OPTAB_WIDEN);
+
+	  emit_insn (gen_iorsi3 (value4word, tmp3, tmp4));
+	}
+    }
+
+  return value4word;
+}
+
+/* Return a fresh SImode pseudo holding the byte VALUE replicated 4 times.  */
+static rtx
+nds32_gen_dup_4_byte_to_word_value (rtx value)
+{
+  rtx word = gen_reg_rtx (SImode);
+
+  return nds32_gen_dup_4_byte_to_word_value_aux (value, word);
+}
+
+/* Return a fresh DImode pseudo: the low word gets the byte VALUE replicated
+   4 times, and the low word is then copied into the high word.  */
+static rtx
+nds32_gen_dup_8_byte_to_double_word_value (rtx value)
+{
+  rtx dword = gen_reg_rtx (DImode);
+  nds32_gen_dup_4_byte_to_word_value_aux (value,
+					  nds32_di_low_part_subreg (dword));
+  emit_move_insn (nds32_di_high_part_subreg (dword),
+		  nds32_di_low_part_subreg (dword));
+  return dword;
+}
+
+
+/* Emit a loop that stores the DImode VALUE at ITR, 8 bytes per iteration,
+   for the largest multiple of 8 not exceeding SIZE; ITR is advanced in
+   place.  Return a pseudo holding the number of bytes left to set: SIZE
+   itself when the loop is skipped (SIZE < 8), SIZE & 7 otherwise.  */
+static rtx
+emit_setmem_doubleword_loop (rtx itr, rtx size, rtx value)
+{
+  rtx word_mode_label = gen_label_rtx ();
+  rtx word_mode_end_label = gen_label_rtx ();
+  rtx byte_mode_size = gen_reg_rtx (SImode);
+  rtx byte_mode_size_tmp = gen_reg_rtx (SImode);
+  rtx word_mode_end = gen_reg_rtx (SImode);
+  rtx size_for_word = gen_reg_rtx (SImode);
+
+  /* and     $size_for_word, $size, #~0x7  */
+  size_for_word = expand_binop (SImode, and_optab, size,
+				gen_int_mode (~0x7, SImode),
+				NULL_RTX, 0, OPTAB_WIDEN);
+  /* Leftover count used when the double-word loop is skipped below.  */
+  emit_move_insn (byte_mode_size, size);
+
+  /* beqz    $size_for_word, .Lword_mode_end  */
+  emit_cmp_and_jump_insns (size_for_word, const0_rtx, EQ, NULL,
+			   SImode, 1, word_mode_end_label);
+  /* add     $word_mode_end, $dst, $size_for_word  */
+  word_mode_end = expand_binop (Pmode, add_optab, itr, size_for_word,
+				NULL_RTX, 0, OPTAB_WIDEN);
+  /* andi    $byte_mode_size, $size, 0x7  */
+  byte_mode_size_tmp = expand_binop (SImode, and_optab, size, GEN_INT (0x7),
+				     NULL_RTX, 0, OPTAB_WIDEN);
+  emit_move_insn (byte_mode_size, byte_mode_size_tmp);
+
+  /* .Lword_mode:  */
+  emit_label (word_mode_label);
+  /*   ! double-word-mode set loop
+       store the 8-byte VALUE at [$dst_itr], updating $dst_itr
+       bne     $word_mode_end, $dst_itr, .Lword_mode  */
+  emit_insn (gen_unaligned_store_update_base_dw (itr,
+						 itr,
+						 value));
+  emit_cmp_and_jump_insns (word_mode_end, itr, NE, NULL,
+			   Pmode, 1, word_mode_label);
+  /* .Lword_mode_end:  */
+  emit_label (word_mode_end_label);
+  return byte_mode_size;
+}
+
+/* Emit a loop storing the QImode VALUE one byte at a time from ITR for SIZE
+   bytes (nothing when SIZE is zero); ITR is advanced in place.  When
+   NEED_END, return the end address, computed before the zero test so it is
+   set on both paths; otherwise return NULL_RTX.  */
+static rtx
+emit_setmem_byte_loop (rtx itr, rtx size, rtx value, bool need_end)
+{
+  rtx end  = gen_reg_rtx (Pmode);
+  rtx byte_mode_label = gen_label_rtx ();
+  rtx end_label = gen_label_rtx ();
+
+  value = force_reg (QImode, value);
+
+  if (need_end)
+    end = expand_binop (Pmode, add_optab, itr, size,
+			NULL_RTX, 0, OPTAB_WIDEN);
+  /*   beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst_itr, $byte_mode_size  */
+  emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL,
+			   SImode, 1, end_label);
+  if (!need_end)
+    end = expand_binop (Pmode, add_optab, itr, size,
+			NULL_RTX, 0, OPTAB_WIDEN);
+  /* .Lbyte_mode:  */
+  emit_label (byte_mode_label);
+  emit_insn (gen_no_hwloop ());
+  /*   ! byte-mode set loop
+       sbi.bi  $value, [$dst_itr] ,1
+       bne     $byte_mode_end, $dst_itr, .Lbyte_mode */
+  nds32_emit_post_inc_load_store (value, itr, QImode, false);
+
+  emit_cmp_and_jump_insns (end, itr, NE, NULL,
+			   Pmode, 1, byte_mode_label);
+  /* .Lend: */
+  emit_label (end_label);
+  if (need_end)
+    return end;
+  else
+    return NULL_RTX;
+}
+
+/* Emit a generic setmem loop for DSTMEM: VALUE is replicated into a
+   double word, which is stored 8 bytes at a time; the remaining bytes
+   are then stored one by one.  Always returns true.  */
+static bool
+nds32_expand_setmem_loop (rtx dstmem, rtx size, rtx value)
+{
+  rtx value4doubleword;
+  rtx value4byte;
+  rtx dst;
+  rtx byte_mode_size;
+
+  /* Emit loop version of setmem.
+     memset:
+       ! prepare word (then copied into both halves of a double word)
+       andi    $tmp1, $val, 0xff               ! $tmp1  <- 0x000000ab
+       slli    $tmp2, $tmp1, 8                 ! $tmp2  <- 0x0000ab00
+       or      $tmp3, $tmp1, $tmp2             ! $tmp3  <- 0x0000abab
+       slli    $tmp4, $tmp3, 16                ! $tmp4  <- 0xabab0000
+       or      $val4word, $tmp3, $tmp4         ! $value4word  <- 0xabababab
+       and     $size_for_word, $size, #~7
+       beqz    $size_for_word, .Lword_mode_end
+       add     $word_mode_end, $dst, $size_for_word
+       andi    $byte_mode_size, $size, 7
+
+     .Lword_mode:
+       ! double-word-mode set loop
+       store the 8-byte replicated value at [$dst], updating $dst
+       bne     $word_mode_end, $dst, .Lword_mode
+
+     .Lword_mode_end:
+       beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst, $byte_mode_size
+
+     .Lbyte_mode:
+       ! byte-mode set loop
+       sbi.bi  $value4word, [$dst] ,1
+       bne     $byte_mode_end, $dst, .Lbyte_mode
+     .Lend: */
+
+  dst = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+
+  /* ! prepare word (then copied into both halves of a double word)
+     andi    $tmp1, $value, 0xff             ! $tmp1  <- 0x000000ab
+     slli    $tmp2, $tmp1, 8                 ! $tmp2  <- 0x0000ab00
+     or      $tmp3, $tmp1, $tmp2             ! $tmp3  <- 0x0000abab
+     slli    $tmp4, $tmp3, 16                ! $tmp4  <- 0xabab0000
+     or      $val4word, $tmp3, $tmp4         ! $value4word  <- 0xabababab  */
+  value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value);
+
+  /*   and     $size_for_word, $size, #~7
+       beqz    $size_for_word, .Lword_mode_end
+       add     $word_mode_end, $dst, $size_for_word
+       andi    $byte_mode_size, $size, 7
+
+     .Lword_mode:
+       ! double-word-mode set loop
+       store the 8-byte replicated value at [$dst], updating $dst
+       bne     $word_mode_end, $dst, .Lword_mode
+     .Lword_mode_end:  */
+  byte_mode_size = emit_setmem_doubleword_loop (dst, size, value4doubleword);
+
+  /*   beqz    $byte_mode_size, .Lend
+       add     $byte_mode_end, $dst, $byte_mode_size
+
+     .Lbyte_mode:
+       ! byte-mode set loop
+       sbi.bi  $value, [$dst] ,1
+       bne     $byte_mode_end, $dst, .Lbyte_mode
+     .Lend: */
+
+  value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode,
+				    subreg_lowpart_offset (QImode, DImode));
+
+  emit_setmem_byte_loop (dst, byte_mode_size, value4byte, false);
+
+  return true;
+}
+
+/* Setmem loop for V3M with a destination not known to be 4-byte aligned:
+   byte stores up to a 4-byte boundary, then the double-word loop, then a
+   byte loop for the rest.  Sizes of at most 16 use only the last loop.  */
+static bool
+nds32_expand_setmem_loop_v3m (rtx dstmem, rtx size, rtx value)
+{
+  rtx base_reg = copy_to_mode_reg (Pmode, XEXP (dstmem, 0));
+  rtx need_align_bytes = gen_reg_rtx (SImode);
+  rtx last_2_bit = gen_reg_rtx (SImode);
+  rtx byte_loop_base = gen_reg_rtx (SImode);
+  rtx byte_loop_size = gen_reg_rtx (SImode);
+  rtx remain_size = gen_reg_rtx (SImode);
+  rtx new_base_reg;
+  rtx value4byte, value4doubleword;
+  rtx byte_mode_size;
+  rtx last_byte_loop_label = gen_label_rtx ();
+
+  size = force_reg (SImode, size);
+
+  value4doubleword = nds32_gen_dup_8_byte_to_double_word_value (value);
+  value4byte = simplify_gen_subreg (QImode, value4doubleword, DImode,
+				    subreg_lowpart_offset (QImode, DImode));
+
+  emit_move_insn (byte_loop_size, size);
+  emit_move_insn (byte_loop_base, base_reg);
+
+  /* Jump to last byte loop if size is not greater than 16.  */
+  emit_cmp_and_jump_insns (size, gen_int_mode (16, SImode), LE, NULL,
+			   SImode, 1, last_byte_loop_label);
+
+  /* Align to 4 bytes first, since v3m cannot do unaligned accesses.  */
+  emit_insn (gen_andsi3 (last_2_bit,
+			 base_reg,
+			 gen_int_mode (0x3, SImode)));
+  emit_insn (gen_subsi3 (need_align_bytes,
+			 gen_int_mode (4, SImode),
+			 last_2_bit));
+  /* Byte stores up to the boundary (4 - last_2_bit bytes).  */
+  new_base_reg = emit_setmem_byte_loop (base_reg,
+					need_align_bytes,
+					value4byte,
+					true);
+
+  /* Calculate remain size. */
+  emit_insn (gen_subsi3 (remain_size, size, need_align_bytes));
+
+  /* Set memory 8 bytes at a time. */
+  byte_mode_size = emit_setmem_doubleword_loop (new_base_reg,
+						remain_size,
+						value4doubleword);
+
+  emit_move_insn (byte_loop_base, new_base_reg);
+  emit_move_insn (byte_loop_size, byte_mode_size);
+  emit_label (last_byte_loop_label);
+
+  /* And set memory for remain bytes. */
+  emit_setmem_byte_loop (byte_loop_base, byte_loop_size, value4byte, false);
+  return true;
+}
+
+/* Expand a setmem of constant SIZE (at most 64 bytes with reduced registers,
+   128 otherwise) as store-multiple insns of the replicated VALUE followed
+   by byte stores for the tail.  Return false, emitting nothing, when SIZE
+   is not a CONST_INT or is too large.  */
+static bool
+nds32_expand_setmem_unroll (rtx dstmem, rtx size, rtx value,
+			    rtx align ATTRIBUTE_UNUSED,
+			    rtx expected_align ATTRIBUTE_UNUSED,
+			    rtx expected_size ATTRIBUTE_UNUSED)
+{
+  unsigned maximum_regs, maximum_bytes, start_regno, regno;
+  rtx value4word;
+  rtx dst_base_reg, new_base_reg;
+  unsigned HOST_WIDE_INT remain_bytes, remain_words, prepare_regs, fill_per_smw;
+  unsigned HOST_WIDE_INT real_size;
+
+  if (TARGET_REDUCED_REGS)
+    {
+      maximum_regs  = 4;
+      maximum_bytes = 64;
+      start_regno   = 2;
+    }
+  else
+    {
+      maximum_regs  = 8;
+      maximum_bytes = 128;
+      start_regno   = 16;
+    }
+
+  /* SIZE may be a register; check that before reading its integer value.  */
+  if (!CONST_INT_P (size))
+    return false;
+  real_size = UINTVAL (size) & GET_MODE_MASK (SImode);
+  if (real_size > maximum_bytes)
+    return false;
+
+  gcc_assert (GET_MODE (value) == QImode || CONST_INT_P (value));
+
+  remain_bytes = real_size;
+  value4word = nds32_gen_dup_4_byte_to_word_value (value);
+  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+
+  prepare_regs = remain_bytes / UNITS_PER_WORD;
+  if (prepare_regs > maximum_regs)
+    prepare_regs = maximum_regs;
+  fill_per_smw = prepare_regs * UNITS_PER_WORD;
+
+  regno = start_regno;
+  switch (prepare_regs)
+    {
+    case 2:
+    default:
+      {
+	rtx reg0 = gen_rtx_REG (SImode, regno);
+	rtx reg1 = gen_rtx_REG (SImode, regno+1);
+	unsigned last_regno = start_regno + prepare_regs - 1;
+
+	emit_move_insn (reg0, value4word);
+	emit_move_insn (reg1, value4word);
+	rtx regd = gen_rtx_REG (DImode, regno);
+	regno += 2;
+
+	/* Try to utilize movd44!  */
+	while (regno <= last_regno)
+	  {
+	    if ((regno + 1) <= last_regno)
+	      {
+		rtx reg = gen_rtx_REG (DImode, regno);
+		emit_move_insn (reg, regd);
+		regno += 2;
+	      }
+	    else
+	      {
+		rtx reg = gen_rtx_REG (SImode, regno);
+		emit_move_insn (reg, reg0);
+		regno += 1;
+	      }
+	  }
+	break;
+      }
+    case 1:
+      {
+	rtx reg = gen_rtx_REG (SImode, regno++);
+	emit_move_insn (reg, value4word);
+      }
+      break;
+    case 0:
+      break;
+    }
+
+  if (fill_per_smw)
+    for (;remain_bytes >= fill_per_smw;remain_bytes -= fill_per_smw)
+      {
+	emit_insn (nds32_expand_store_multiple (start_regno, prepare_regs,
+						dst_base_reg, dstmem,
+						true, &new_base_reg));
+	dst_base_reg = new_base_reg;
+	dstmem = gen_rtx_MEM (SImode, dst_base_reg);
+      }
+
+  remain_words = remain_bytes / UNITS_PER_WORD;
+  if (remain_words)
+    {
+      emit_insn (nds32_expand_store_multiple (start_regno, remain_words,
+					      dst_base_reg, dstmem,
+					      true, &new_base_reg));
+      dst_base_reg = new_base_reg;
+      dstmem = gen_rtx_MEM (SImode, dst_base_reg);
+    }
+
+  remain_bytes = remain_bytes - (remain_words * UNITS_PER_WORD);
+  if (remain_bytes)
+    {
+      value = simplify_gen_subreg (QImode, value4word, SImode,
+				   subreg_lowpart_offset (QImode, SImode));
+      int offset = 0;
+      for (;remain_bytes;--remain_bytes, ++offset)
+	{
+	  nds32_emit_load_store (value, dstmem, QImode, offset, false);
+	}
+    }
+
+  return true;
+}
+
+bool
+nds32_expand_setmem (rtx dstmem, rtx size, rtx value, rtx align,
+		     rtx expected_align,
+		     rtx expected_size)
+{
+  bool align_to_4_bytes = (INTVAL (align) & 3) == 0;
+  /* Dispatch to one of the setmem expanders and return its result.
+     Only expand at -O3 and when not optimizing for size.  */
+  if (optimize_size || optimize < 3)
+    return false;
+  /* V3M with ALIGN not a multiple of 4 always takes the V3M loop.  */
+  if (TARGET_ISA_V3M && !align_to_4_bytes)
+    return nds32_expand_setmem_loop_v3m (dstmem, size, value);
+  /* Otherwise try the unrolled form; if it returns false, use the loop.  */
+  if (nds32_expand_setmem_unroll (dstmem, size, value,
+				  align, expected_align, expected_size))
+    return true;
+
+  return nds32_expand_setmem_loop (dstmem, size, value);
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 4: Auxiliary function for expand movstr pattern.  */
+
+bool
+nds32_expand_movstr (rtx dst_end_ptr,
+		     rtx dstmem,
+		     rtx srcmem)
+{
+  rtx tmp;
+  rtx dst_base_reg, src_base_reg;
+  rtx new_dst_base_reg, new_src_base_reg;
+  rtx last_non_null_char_ptr;
+  rtx ffbi_result;
+  rtx loop_label;
+  /* Expand movstr inline only at -O3 and when not optimizing for size.  */
+  if (optimize_size || optimize < 3)
+    return false;
+  /* Fresh pseudos plus register copies of both memory addresses.  */
+  tmp = gen_reg_rtx (SImode);
+  ffbi_result = gen_reg_rtx (Pmode);
+  new_dst_base_reg = gen_reg_rtx (Pmode);
+  new_src_base_reg = gen_reg_rtx (Pmode);
+  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
+  src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0));
+  loop_label = gen_label_rtx ();
+  /* Loop body (presumably): load via TMP, store it, look for a zero byte.  */
+  emit_label (loop_label);
+  emit_insn (gen_lmwzb (new_src_base_reg, src_base_reg, tmp));
+  emit_insn (gen_smwzb (new_dst_base_reg, dst_base_reg, tmp));
+  emit_insn (gen_unspec_ffb (ffbi_result, tmp, const0_rtx));
+  /* Advance both base registers to the addresses the insns produced.  */
+  emit_move_insn (src_base_reg, new_src_base_reg);
+  emit_move_insn (dst_base_reg, new_dst_base_reg);
+  /* Jump back to the loop label while the ffb result is zero.  */
+  emit_cmp_and_jump_insns (ffbi_result, const0_rtx, EQ, NULL,
+			   SImode, 1, loop_label);
+  /* DST_END_PTR = updated destination base + ffb result.  */
+  last_non_null_char_ptr = expand_binop (Pmode, add_optab, dst_base_reg,
+					 ffbi_result, NULL_RTX, 0, OPTAB_WIDEN);
+
+  emit_move_insn (dst_end_ptr, last_non_null_char_ptr);
+
+  return true;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 5: Auxiliary function for expand strlen pattern.  */
+
+bool
+nds32_expand_strlen (rtx result, rtx str,
+		     rtx target_char, rtx align ATTRIBUTE_UNUSED)
+{
+  rtx base_reg, backup_base_reg;
+  rtx ffb_result;
+  rtx target_char_ptr, length;
+  rtx loop_label, tmp;
+  /* Expand strlen inline only at -O3 and when not optimizing for size.  */
+  if (optimize_size || optimize < 3)
+    return false;
+
+  gcc_assert (MEM_P (str));
+  gcc_assert (CONST_INT_P (target_char) || REG_P (target_char));
+  /* Work on a register copy of the address of STR.  */
+  base_reg = copy_to_mode_reg (SImode, XEXP (str, 0));
+  loop_label = gen_label_rtx ();
+
+  ffb_result = gen_reg_rtx (Pmode);
+  tmp = gen_reg_rtx (SImode);
+  backup_base_reg = gen_reg_rtx (SImode);
+
+  /* Emit loop version of strlen.
+       move  $backup_base, $base
+     .Lloop:
+       lmw.bim $tmp, [$base], $tmp, 0
+       ffb   $ffb_result, $tmp, $target_char   ! is there $target_char?
+       beqz  $ffb_result, .Lloop
+       add   $target_char_ptr, $base, $ffb_result
+       sub   $length, $target_char_ptr, $backup_base  */
+
+  /* move  $backup_base, $base  */
+  emit_move_insn (backup_base_reg, base_reg);
+
+  /* .Lloop:  */
+  emit_label (loop_label);
+  /* lmw.bim $tmp, [$base], $tmp, 0  */
+  emit_insn (gen_unaligned_load_update_base_w (base_reg, tmp, base_reg));
+
+  /* ffb   $ffb_result, $tmp, $target_char   ! is there $target_char?  */
+  emit_insn (gen_unspec_ffb (ffb_result, tmp, target_char));
+
+  /* beqz  $ffb_result, .Lloop  */
+  emit_cmp_and_jump_insns (ffb_result, const0_rtx, EQ, NULL,
+			   SImode, 1, loop_label);
+
+  /* add   $target_char_ptr, $base, $ffb_result  */
+  target_char_ptr = expand_binop (Pmode, add_optab, base_reg,
+				ffb_result, NULL_RTX, 0, OPTAB_WIDEN);
+
+  /* sub   $length, $target_char_ptr, $backup_base  */
+  length = expand_binop (Pmode, sub_optab, target_char_ptr,
+			 backup_base_reg, NULL_RTX, 0, OPTAB_WIDEN);
+  /* Copy the computed length into RESULT.  */
+  emit_move_insn (result, length);
+
+  return true;
+}

 /* ------------------------------------------------------------------------ */

+/* PART 6: Auxiliary function for expand load_multiple/store_multiple
+	   pattern.  */
+
 /* Functions to expand load_multiple and store_multiple.
    They are auxiliary extern functions to help create rtx template.
    Check nds32-multiple.md file for the patterns.  */
 rtx
 nds32_expand_load_multiple (int base_regno, int count,
-			    rtx base_addr, rtx basemem)
+			    rtx base_addr, rtx basemem,
+			    bool update_base_reg_p,
+			    rtx *update_base_reg)
 {
   int par_index;
   int offset;
+  int start_idx;
   rtx result;
   rtx new_addr, mem, reg;

+  /* Generate an unaligned load so that the load is not pulled out of the
+     parallel, which would generate lwi and lose unaligned access.  */
+  if (count == 1)
+    {
+      reg = gen_rtx_REG (SImode, base_regno);
+      if (update_base_reg_p)
+	{
+	  *update_base_reg = gen_reg_rtx (SImode);
+	  return gen_unaligned_load_update_base_w (*update_base_reg, reg, base_addr);
+	}
+      else
+	return gen_unaligned_load_w (reg, gen_rtx_MEM (SImode, base_addr));
+    }
+
   /* Create the pattern that is presented in nds32-multiple.md.  */
+  if (update_base_reg_p)
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1));
+      start_idx = 1;
+    }
+  else
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+      start_idx = 0;
+    }
+
+  if (update_base_reg_p)
+    {
+      offset           = count * 4;
+      new_addr         = plus_constant (Pmode, base_addr, offset);
+      *update_base_reg = gen_reg_rtx (SImode);

-  result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+      XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr);
+    }

   for (par_index = 0; par_index < count; par_index++)
     {
@@ -57,7 +1284,7 @@ nds32_expand_load_multiple (int base_regno, int count,
 					       new_addr, offset);
       reg      = gen_rtx_REG (SImode, base_regno + par_index);

-      XVECEXP (result, 0, par_index) = gen_rtx_SET (reg, mem);
+      XVECEXP (result, 0, (par_index + start_idx)) = gen_rtx_SET (reg, mem);
     }

   return result;
@@ -65,16 +1292,49 @@ nds32_expand_load_multiple (int base_regno, int count,

 rtx
 nds32_expand_store_multiple (int base_regno, int count,
-			     rtx base_addr, rtx basemem)
+			     rtx base_addr, rtx basemem,
+			     bool update_base_reg_p,
+			     rtx *update_base_reg)
 {
   int par_index;
   int offset;
+  int start_idx;
   rtx result;
   rtx new_addr, mem, reg;

+  if (count == 1)
+    {
+      reg = gen_rtx_REG (SImode, base_regno);
+      if (update_base_reg_p)
+	{
+	  *update_base_reg = gen_reg_rtx (SImode);
+	  return gen_unaligned_store_update_base_w (*update_base_reg, base_addr, reg);
+	}
+      else
+	return gen_unaligned_store_w (gen_rtx_MEM (SImode, base_addr), reg);
+    }
+
   /* Create the pattern that is presented in nds32-multiple.md.  */

-  result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+  if (update_base_reg_p)
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count + 1));
+      start_idx = 1;
+    }
+  else
+    {
+      result = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (count));
+      start_idx = 0;
+    }
+
+  if (update_base_reg_p)
+    {
+      offset           = count * 4;
+      new_addr         = plus_constant (Pmode, base_addr, offset);
+      *update_base_reg = gen_reg_rtx (SImode);
+
+      XVECEXP (result, 0, 0) = gen_rtx_SET (*update_base_reg, new_addr);
+    }

   for (par_index = 0; par_index < count; par_index++)
     {
@@ -85,58 +1345,11 @@ nds32_expand_store_multiple (int base_regno, int count,
 					       new_addr, offset);
       reg      = gen_rtx_REG (SImode, base_regno + par_index);

-      XVECEXP (result, 0, par_index) = gen_rtx_SET (mem, reg);
+      XVECEXP (result, 0, par_index + start_idx) = gen_rtx_SET (mem, reg);
     }

-  return result;
-}
-
-/* Function to move block memory content by
-   using load_multiple and store_multiple.
-   This is auxiliary extern function to help create rtx template.
-   Check nds32-multiple.md file for the patterns.  */
-int
-nds32_expand_movmemqi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment)
-{
-  HOST_WIDE_INT in_words, out_words;
-  rtx dst_base_reg, src_base_reg;
-  int maximum_bytes;
-
-  /* Because reduced-set regsiters has few registers
-     (r0~r5, r6~10, r15, r28~r31, where 'r15' and 'r28~r31'
-      cannot be used for register allocation),
-     using 8 registers (32 bytes) for moving memory block
-     may easily consume all of them.
-     It makes register allocation/spilling hard to work.
-     So we only allow maximum=4 registers (16 bytes) for
-     moving memory block under reduced-set registers.  */
-  if (TARGET_REDUCED_REGS)
-    maximum_bytes = 16;
-  else
-    maximum_bytes = 32;
-
-  /* 1. Total_bytes is integer for sure.
-     2. Alignment is integer for sure.
-     3. Maximum 4 or 8 registers, 4 * 4 = 16 bytes, 8 * 4 = 32 bytes.
-     4. Requires (n * 4) block size.
-     5. Requires 4-byte alignment.  */
-  if (GET_CODE (total_bytes) != CONST_INT
-      || GET_CODE (alignment) != CONST_INT
-      || INTVAL (total_bytes) > maximum_bytes
-      || INTVAL (total_bytes) & 3
-      || INTVAL (alignment) & 3)
-    return 0;

-  dst_base_reg = copy_to_mode_reg (SImode, XEXP (dstmem, 0));
-  src_base_reg = copy_to_mode_reg (SImode, XEXP (srcmem, 0));
-
-  out_words = in_words = INTVAL (total_bytes) / UNITS_PER_WORD;
-
-  emit_insn (nds32_expand_load_multiple (0, in_words, src_base_reg, srcmem));
-  emit_insn (nds32_expand_store_multiple (0, out_words, dst_base_reg, dstmem));
-
-  /* Successfully create patterns, return 1.  */
-  return 1;
+  return result;
 }

 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-modes.def b/gcc/config/nds32/nds32-modes.def
index f2d0e6c..7a6f953 100644
--- a/gcc/config/nds32/nds32-modes.def
+++ b/gcc/config/nds32/nds32-modes.def
@@ -18,4 +18,6 @@
    along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */

-/* So far, there is no need to define any modes for nds32 target.  */
+/* Vector modes.  */
+VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
+VECTOR_MODES (INT, 8);        /*            V8QI V4HI V2SI */
diff --git a/gcc/config/nds32/nds32-multiple.md b/gcc/config/nds32/nds32-multiple.md
index babc7f2..500a1c6 100644
--- a/gcc/config/nds32/nds32-multiple.md
+++ b/gcc/config/nds32/nds32-multiple.md
@@ -49,17 +49,19 @@
      otherwise we have to FAIL this rtx generation:
        1. The number of consecutive registers must be integer.
        2. Maximum 4 or 8 registers for lmw.bi instruction
-          (based on this nds32-multiple.md design).
+	  (based on this nds32-multiple.md design).
        3. Minimum 2 registers for lmw.bi instruction
-          (based on this nds32-multiple.md design).
+	  (based on this nds32-multiple.md design).
        4. operands[0] must be register for sure.
        5. operands[1] must be memory for sure.
-       6. Do not cross $r15 register because it is not allocatable.  */
+       6. operands[1] is not volatile memory access.
+       7. Do not cross $r15 register because it is not allocatable.  */
   if (GET_CODE (operands[2]) != CONST_INT
       || INTVAL (operands[2]) > maximum
       || INTVAL (operands[2]) < 2
       || GET_CODE (operands[0]) != REG
       || GET_CODE (operands[1]) != MEM
+      || MEM_VOLATILE_P (operands[1])
       || REGNO (operands[0]) + INTVAL (operands[2]) > TA_REGNUM)
     FAIL;

@@ -69,12 +71,943 @@
 					    INTVAL (operands[2]),
 					    force_reg (SImode,
 						       XEXP (operands[1], 0)),
-					    operands[1]);
+					    operands[1],
+					    false, NULL);
 })

 ;; Ordinary Load Multiple.
+(define_insn "*lmw_bim_si25" ; lmw.bim: 25 word loads, op 1 = op 2 + 100
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 100)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 80))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 84))))
+     (set (match_operand:SI 25 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 88))))
+     (set (match_operand:SI 26 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 92))))
+     (set (match_operand:SI 27 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 96))))])]
+  "(XVECLEN (operands[0], 0) == 26)" ; 1 base-update set + 25 load sets
+  "lmw.bim\t%3, [%1], %27, 0x0" ; %3/%27: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "25")
+   (set_attr "length"             "4")]
+)

-(define_insn "*lmwsi8"
+(define_insn "*lmw_bim_si24" ; lmw.bim: 24 word loads, op 1 = op 2 + 96
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 96)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 80))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 84))))
+     (set (match_operand:SI 25 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 88))))
+     (set (match_operand:SI 26 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 92))))])]
+  "(XVECLEN (operands[0], 0) == 25)" ; 1 base-update set + 24 load sets
+  "lmw.bim\t%3, [%1], %26, 0x0" ; %3/%26: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "24")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si23" ; lmw.bim: 23 word loads, op 1 = op 2 + 92
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 92)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 80))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 84))))
+     (set (match_operand:SI 25 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 88))))])]
+  "(XVECLEN (operands[0], 0) == 24)" ; 1 base-update set + 23 load sets
+  "lmw.bim\t%3, [%1], %25, 0x0" ; %3/%25: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "23")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si22" ; lmw.bim: 22 word loads, op 1 = op 2 + 88
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 88)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 80))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 84))))])]
+  "(XVECLEN (operands[0], 0) == 23)" ; 1 base-update set + 22 load sets
+  "lmw.bim\t%3, [%1], %24, 0x0" ; %3/%24: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "22")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si21" ; lmw.bim: 21 word loads, op 1 = op 2 + 84
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 84)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 80))))])]
+  "(XVECLEN (operands[0], 0) == 22)" ; 1 base-update set + 21 load sets
+  "lmw.bim\t%3, [%1], %23, 0x0" ; %3/%23: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "21")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si20" ; lmw.bim: 20 word loads, op 1 = op 2 + 80
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 80)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 76))))])]
+  "(XVECLEN (operands[0], 0) == 21)" ; 1 base-update set + 20 load sets
+  "lmw.bim\t%3, [%1], %22, 0x0" ; %3/%22: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "20")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si19" ; lmw.bim: 19 word loads, op 1 = op 2 + 76
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 76)))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 72))))])]
+  "(XVECLEN (operands[0], 0) == 20)" ; 1 base-update set + 19 load sets
+  "lmw.bim\t%3, [%1], %21, 0x0" ; %3/%21: destinations of first/last load
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "19")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si18"	;; Load 18 consecutive SImode words and advance the base by 72 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 72)))	;; "1" ties operand 2 to operand 1; 72 = 18 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 68))))])]
+  "(XVECLEN (operands[0], 0) == 19)"	;; 1 base update + 18 loads
+  "lmw.bim\t%3, [%1], %20, 0x0"	;; %3 = first loaded reg, %1 = base, %20 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "18")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si17"	;; Load 17 consecutive SImode words and advance the base by 68 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 68)))	;; "1" ties operand 2 to operand 1; 68 = 17 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 64))))])]
+  "(XVECLEN (operands[0], 0) == 18)"	;; 1 base update + 17 loads
+  "lmw.bim\t%3, [%1], %19, 0x0"	;; %3 = first loaded reg, %1 = base, %19 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "17")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si16"	;; Load 16 consecutive SImode words and advance the base by 64 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 64)))	;; "1" ties operand 2 to operand 1; 64 = 16 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 60))))])]
+  "(XVECLEN (operands[0], 0) == 17)"	;; 1 base update + 16 loads
+  "lmw.bim\t%3, [%1], %18, 0x0"	;; %3 = first loaded reg, %1 = base, %18 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "16")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si15"	;; Load 15 consecutive SImode words and advance the base by 60 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 60)))	;; "1" ties operand 2 to operand 1; 60 = 15 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 56))))])]
+  "(XVECLEN (operands[0], 0) == 16)"	;; 1 base update + 15 loads
+  "lmw.bim\t%3, [%1], %17, 0x0"	;; %3 = first loaded reg, %1 = base, %17 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "15")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si14"	;; Load 14 consecutive SImode words and advance the base by 56 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 56)))	;; "1" ties operand 2 to operand 1; 56 = 14 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 52))))])]
+  "(XVECLEN (operands[0], 0) == 15)"	;; 1 base update + 14 loads
+  "lmw.bim\t%3, [%1], %16, 0x0"	;; %3 = first loaded reg, %1 = base, %16 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "14")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si13"	;; Load 13 consecutive SImode words and advance the base by 52 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 52)))	;; "1" ties operand 2 to operand 1; 52 = 13 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 48))))])]
+  "(XVECLEN (operands[0], 0) == 14)"	;; 1 base update + 13 loads
+  "lmw.bim\t%3, [%1], %15, 0x0"	;; %3 = first loaded reg, %1 = base, %15 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "13")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si12"	;; Load 12 consecutive SImode words and advance the base by 48 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 48)))	;; "1" ties operand 2 to operand 1; 48 = 12 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 44))))])]
+  "(XVECLEN (operands[0], 0) == 13)"	;; 1 base update + 12 loads
+  "lmw.bim\t%3, [%1], %14, 0x0"	;; %3 = first loaded reg, %1 = base, %14 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "12")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si11"	;; Load 11 consecutive SImode words and advance the base by 44 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 44)))	;; "1" ties operand 2 to operand 1; 44 = 11 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 40))))])]
+  "(XVECLEN (operands[0], 0) == 12)"	;; 1 base update + 11 loads
+  "lmw.bim\t%3, [%1], %13, 0x0"	;; %3 = first loaded reg, %1 = base, %13 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "11")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si10"	;; Load 10 consecutive SImode words and advance the base by 40 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 40)))	;; "1" ties operand 2 to operand 1; 40 = 10 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 36))))])]
+  "(XVECLEN (operands[0], 0) == 11)"	;; 1 base update + 10 loads
+  "lmw.bim\t%3, [%1], %12, 0x0"	;; %3 = first loaded reg, %1 = base, %12 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "10")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si9"	;; Load 9 consecutive SImode words and advance the base by 36 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 36)))	;; "1" ties operand 2 to operand 1; 36 = 9 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 32))))])]
+  "(XVECLEN (operands[0], 0) == 10)"	;; 1 base update + 9 loads
+  "lmw.bim\t%3, [%1], %11, 0x0"	;; %3 = first loaded reg, %1 = base, %11 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "9")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si8"	;; Load 8 consecutive SImode words and advance the base by 32 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 32)))	;; "1" ties operand 2 to operand 1; 32 = 8 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 28))))])]
+  "(XVECLEN (operands[0], 0) == 9)"	;; 1 base update + 8 loads
+  "lmw.bim\t%3, [%1], %10, 0x0"	;; %3 = first loaded reg, %1 = base, %10 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "8")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si7"	;; Load 7 consecutive SImode words and advance the base by 28 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 28)))	;; "1" ties operand 2 to operand 1; 28 = 7 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 24))))])]
+  "(XVECLEN (operands[0], 0) == 8)"	;; 1 base update + 7 loads
+  "lmw.bim\t%3, [%1], %9, 0x0"	;; %3 = first loaded reg, %1 = base, %9 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "7")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si6"	;; Load 6 consecutive SImode words and advance the base by 24 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 24)))	;; "1" ties operand 2 to operand 1; 24 = 6 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 20))))])]
+  "(XVECLEN (operands[0], 0) == 7)"	;; 1 base update + 6 loads
+  "lmw.bim\t%3, [%1], %8, 0x0"	;; %3 = first loaded reg, %1 = base, %8 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "6")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si5"	;; Load 5 consecutive SImode words and advance the base by 20 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 20)))	;; "1" ties operand 2 to operand 1; 20 = 5 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 16))))])]
+  "(XVECLEN (operands[0], 0) == 6)"	;; 1 base update + 5 loads
+  "lmw.bim\t%3, [%1], %7, 0x0"	;; %3 = first loaded reg, %1 = base, %7 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "5")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si4"	;; Load 4 consecutive SImode words and advance the base by 16 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 16)))	;; "1" ties operand 2 to operand 1; 16 = 4 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 12))))])]
+  "(XVECLEN (operands[0], 0) == 5)"	;; 1 base update + 4 loads
+  "lmw.bim\t%3, [%1], %6, 0x0"	;; %3 = first loaded reg, %1 = base, %6 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "4")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si3"	;; Load 3 consecutive SImode words and advance the base by 12 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 12)))	;; "1" ties operand 2 to operand 1; 12 = 3 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 8))))])]
+  "(XVECLEN (operands[0], 0) == 4)"	;; 1 base update + 3 loads
+  "lmw.bim\t%3, [%1], %5, 0x0"	;; %3 = first loaded reg, %1 = base, %5 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "3")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmw_bim_si2"	;; Load 2 consecutive SImode words and advance the base by 8 bytes.
+  [(match_parallel 0 "nds32_load_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")	;; base register after the update
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 8)))	;; "1" ties operand 2 to operand 1; 8 = 2 * 4
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (match_dup 2)))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 2) (const_int 4))))])]
+  "(XVECLEN (operands[0], 0) == 3)"	;; 1 base update + 2 loads
+  "lmw.bim\t%3, [%1], %4, 0x0"	;; %3 = first loaded reg, %1 = base, %4 = last loaded reg
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "2")
+   (set_attr "length"             "4")]
+)
+
+(define_expand "unaligned_load_update_base_w"	;; Unaligned word load through operand 2, plus base + 4 into operand 0.
+  [(parallel [(set (match_operand:SI 0 "register_operand" "")
+		   (plus:SI (match_operand:SI 2 "register_operand" "") (const_int 4)))
+	      (set (match_operand:SI 1 "register_operand" "")
+		   (unspec:SI [(mem:SI (match_dup 2))] UNSPEC_UALOAD_W))])]
+  ""
+{
+  /* Do NOT emit unaligned_load_w_m immediately, since the web pass does not
+     recognize post_inc; try it again after GCC 5.0.
+     REF: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63156  */
+  emit_insn (gen_unaligned_load_w (operands[1], gen_rtx_MEM (SImode, operands[2])));  /* operands[1] = word at [operands[2]] */
+  emit_insn (gen_addsi3 (operands[0], operands[2], gen_int_mode (4, Pmode)));  /* operands[0] = operands[2] + 4 */
+  DONE;
+}
+  [(set_attr "type"   "load_multiple")	;; NOTE(review): attribute list on a define_expand; confirm it has any effect.
+   (set_attr "combo"              "1")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi25"
   [(match_parallel 0 "nds32_load_multiple_operation"
     [(set (match_operand:SI 2 "register_operand" "")
 	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
@@ -91,14 +1024,49 @@
      (set (match_operand:SI 8 "register_operand" "")
 	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
      (set (match_operand:SI 9 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))])]
-  "(XVECLEN (operands[0], 0) == 8)"
-  "lmw.bi\t%2, [%1], %9, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 80))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 84))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 88))))
+     (set (match_operand:SI 25 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 92))))
+     (set (match_operand:SI 26 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 96))))])]
+  "(XVECLEN (operands[0], 0) == 25)"
+  "lmw.bi\t%2, [%1], %26, 0x0"
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "25")
+   (set_attr "length"             "4")]
 )

-(define_insn "*lmwsi7"
+(define_insn "*lmwsi24"
   [(match_parallel 0 "nds32_load_multiple_operation"
     [(set (match_operand:SI 2 "register_operand" "")
 	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
@@ -113,14 +1081,49 @@
      (set (match_operand:SI 7 "register_operand" "")
 	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
      (set (match_operand:SI 8 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))])]
-  "(XVECLEN (operands[0], 0) == 7)"
-  "lmw.bi\t%2, [%1], %8, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 80))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 84))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 88))))
+     (set (match_operand:SI 25 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 92))))])]
+  "(XVECLEN (operands[0], 0) == 24)"
+  "lmw.bi\t%2, [%1], %25, 0x0"
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "24")
+   (set_attr "length"             "4")]
 )

-(define_insn "*lmwsi6"
+(define_insn "*lmwsi23"
   [(match_parallel 0 "nds32_load_multiple_operation"
     [(set (match_operand:SI 2 "register_operand" "")
 	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
@@ -133,14 +1136,49 @@
      (set (match_operand:SI 6 "register_operand" "")
 	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
      (set (match_operand:SI 7 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))])]
-  "(XVECLEN (operands[0], 0) == 6)"
-  "lmw.bi\t%2, [%1], %7, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 80))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 84))))
+     (set (match_operand:SI 24 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 88))))])]
+  "(XVECLEN (operands[0], 0) == 23)"
+  "lmw.bi\t%2, [%1], %24, 0x0"
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "23")
+   (set_attr "length"             "4")]
 )

-(define_insn "*lmwsi5"
+(define_insn "*lmwsi22"
   [(match_parallel 0 "nds32_load_multiple_operation"
     [(set (match_operand:SI 2 "register_operand" "")
 	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
@@ -151,110 +1189,2430 @@
      (set (match_operand:SI 5 "register_operand" "")
 	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
      (set (match_operand:SI 6 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))])]
-  "(XVECLEN (operands[0], 0) == 5)"
-  "lmw.bi\t%2, [%1], %6, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 80))))
+     (set (match_operand:SI 23 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 84))))])]
+  "(XVECLEN (operands[0], 0) == 22)"
+  "lmw.bi\t%2, [%1], %23, 0x0"
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "22")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi21"  ; 21-word load-multiple: SImode loads from [%1+0] .. [%1+80]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))
+     (set (match_operand:SI 22 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 80))))])]
+  "(XVECLEN (operands[0], 0) == 21)"  ; the PARALLEL must hold exactly 21 elements
+  "lmw.bi\t%2, [%1], %22, 0x0"  ; %2 / %22 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "21")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi20"  ; 20-word load-multiple: SImode loads from [%1+0] .. [%1+76]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))
+     (set (match_operand:SI 21 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 76))))])]
+  "(XVECLEN (operands[0], 0) == 20)"  ; the PARALLEL must hold exactly 20 elements
+  "lmw.bi\t%2, [%1], %21, 0x0"  ; %2 / %21 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "20")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi19"  ; 19-word load-multiple: SImode loads from [%1+0] .. [%1+72]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))
+     (set (match_operand:SI 20 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 72))))])]
+  "(XVECLEN (operands[0], 0) == 19)"  ; the PARALLEL must hold exactly 19 elements
+  "lmw.bi\t%2, [%1], %20, 0x0"  ; %2 / %20 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "19")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi18"  ; 18-word load-multiple: SImode loads from [%1+0] .. [%1+68]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))
+     (set (match_operand:SI 19 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 68))))])]
+  "(XVECLEN (operands[0], 0) == 18)"  ; the PARALLEL must hold exactly 18 elements
+  "lmw.bi\t%2, [%1], %19, 0x0"  ; %2 / %19 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "18")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi17"  ; 17-word load-multiple: SImode loads from [%1+0] .. [%1+64]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))
+     (set (match_operand:SI 18 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 64))))])]
+  "(XVECLEN (operands[0], 0) == 17)"  ; the PARALLEL must hold exactly 17 elements
+  "lmw.bi\t%2, [%1], %18, 0x0"  ; %2 / %18 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "17")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi16"  ; 16-word load-multiple: SImode loads from [%1+0] .. [%1+60]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))
+     (set (match_operand:SI 17 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 60))))])]
+  "(XVECLEN (operands[0], 0) == 16)"  ; the PARALLEL must hold exactly 16 elements
+  "lmw.bi\t%2, [%1], %17, 0x0"  ; %2 / %17 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "16")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi15"  ; 15-word load-multiple: SImode loads from [%1+0] .. [%1+56]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))
+     (set (match_operand:SI 16 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 56))))])]
+  "(XVECLEN (operands[0], 0) == 15)"  ; the PARALLEL must hold exactly 15 elements
+  "lmw.bi\t%2, [%1], %16, 0x0"  ; %2 / %16 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "15")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi14"  ; 14-word load-multiple: SImode loads from [%1+0] .. [%1+52]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))
+     (set (match_operand:SI 15 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 52))))])]
+  "(XVECLEN (operands[0], 0) == 14)"  ; the PARALLEL must hold exactly 14 elements
+  "lmw.bi\t%2, [%1], %15, 0x0"  ; %2 / %15 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "14")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi13"  ; 13-word load-multiple: SImode loads from [%1+0] .. [%1+48]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))
+     (set (match_operand:SI 14 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 48))))])]
+  "(XVECLEN (operands[0], 0) == 13)"  ; the PARALLEL must hold exactly 13 elements
+  "lmw.bi\t%2, [%1], %14, 0x0"  ; %2 / %14 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "13")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi12"  ; 12-word load-multiple: SImode loads from [%1+0] .. [%1+44]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))
+     (set (match_operand:SI 13 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 44))))])]
+  "(XVECLEN (operands[0], 0) == 12)"  ; the PARALLEL must hold exactly 12 elements
+  "lmw.bi\t%2, [%1], %13, 0x0"  ; %2 / %13 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "12")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi11"  ; 11-word load-multiple: SImode loads from [%1+0] .. [%1+40]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))
+     (set (match_operand:SI 12 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 40))))])]
+  "(XVECLEN (operands[0], 0) == 11)"  ; the PARALLEL must hold exactly 11 elements
+  "lmw.bi\t%2, [%1], %12, 0x0"  ; %2 / %12 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "11")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi10"  ; 10-word load-multiple: SImode loads from [%1+0] .. [%1+36]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))
+     (set (match_operand:SI 11 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 36))))])]
+  "(XVECLEN (operands[0], 0) == 10)"  ; the PARALLEL must hold exactly 10 elements
+  "lmw.bi\t%2, [%1], %11, 0x0"  ; %2 / %11 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"             "10")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi9"  ; 9-word load-multiple: SImode loads from [%1+0] .. [%1+32]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))
+     (set (match_operand:SI 10 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 32))))])]
+  "(XVECLEN (operands[0], 0) == 9)"  ; the PARALLEL must hold exactly 9 elements
+  "lmw.bi\t%2, [%1], %10, 0x0"  ; %2 / %10 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "9")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi8"  ; 8-word load-multiple: SImode loads from [%1+0] .. [%1+28]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))
+     (set (match_operand:SI 9 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 28))))])]
+  "(XVECLEN (operands[0], 0) == 8)"  ; the PARALLEL must hold exactly 8 elements
+  "lmw.bi\t%2, [%1], %9, 0x0"  ; %2 / %9 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "8")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi7"  ; 7-word load-multiple: SImode loads from [%1+0] .. [%1+24]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))
+     (set (match_operand:SI 8 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 24))))])]
+  "(XVECLEN (operands[0], 0) == 7)"  ; the PARALLEL must hold exactly 7 elements
+  "lmw.bi\t%2, [%1], %8, 0x0"  ; %2 / %8 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "7")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi6"  ; 6-word load-multiple: SImode loads from [%1+0] .. [%1+20]
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))  ; operand 1 is the base address register
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))
+     (set (match_operand:SI 7 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 20))))])]
+  "(XVECLEN (operands[0], 0) == 6)"  ; the PARALLEL must hold exactly 6 elements
+  "lmw.bi\t%2, [%1], %7, 0x0"  ; %2 / %7 are the first / last destination registers
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "6")  ; same value as the element count checked above
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi5" ;; 5 SImode loads from [op1+0] .. [op1+16], emitted as one lmw.bi.
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))
+     (set (match_operand:SI 6 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 16))))])]
+  "(XVECLEN (operands[0], 0) == 5)" ;; One PARALLEL element per loaded word.
+  "lmw.bi\t%2, [%1], %6, 0x0" ;; %2 = first destination, %6 = last destination.
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "5")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi4" ;; 4 SImode loads from [op1+0] .. [op1+12], emitted as one lmw.bi.
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
+     (set (match_operand:SI 5 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))])]
+  "(XVECLEN (operands[0], 0) == 4)" ;; One PARALLEL element per loaded word.
+  "lmw.bi\t%2, [%1], %5, 0x0" ;; %2 = first destination, %5 = last destination.
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "4")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi3" ;; 3 SImode loads from [op1+0] .. [op1+8], emitted as one lmw.bi.
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
+     (set (match_operand:SI 4 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))])]
+  "(XVECLEN (operands[0], 0) == 3)" ;; One PARALLEL element per loaded word.
+  "lmw.bi\t%2, [%1], %4, 0x0" ;; %2 = first destination, %4 = last destination.
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "3")
+   (set_attr "length"             "4")]
+)
+
+(define_insn "*lmwsi2" ;; 2 SImode loads from [op1+0] and [op1+4], emitted as one lmw.bi.
+  [(match_parallel 0 "nds32_load_multiple_operation"
+    [(set (match_operand:SI 2 "register_operand" "")
+	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
+     (set (match_operand:SI 3 "register_operand" "")
+	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))])]
+  "(XVECLEN (operands[0], 0) == 2)" ;; One PARALLEL element per loaded word.
+  "lmw.bi\t%2, [%1], %3, 0x0" ;; %2 = first destination, %3 = last destination.
+  [(set_attr "type"   "load_multiple")
+   (set_attr "combo"              "2")
+   (set_attr "length"             "4")]
+)
+
+;; Store Multiple Insns.
+;;
+;; operands[0] is the first memory location.
+;; operands[1] is the first of the consecutive registers.
+;; operands[2] is the number of consecutive registers.
+
+(define_expand "store_multiple"
+  [(match_par_dup 3 [(set (match_operand:SI 0 "" "")
+			  (match_operand:SI 1 "" ""))
+		     (use (match_operand:SI 2 "" ""))])]
+  "" ;; Always enabled; operands are validated in the preparation code below.
+{
+  int maximum;
+
+  /* Because the reduced register set has few registers
+     (r0~r5, r6~r10, r15, r28~r31, where 'r15' and 'r28~r31' cannot
+     be used for register allocation),
+     using 8 registers for store_multiple may easily consume all of them.
+     It makes register allocation/spilling hard to work.
+     So we only allow maximum=4 registers for store_multiple
+     under reduced-set registers.  */
+  if (TARGET_REDUCED_REGS)
+    maximum = 4;
+  else
+    maximum = 8;
+
+  /* Here are the conditions that must be all passed,
+     otherwise we have to FAIL this rtx generation:
+       1. The number of consecutive registers must be integer.
+       2. Maximum 4 or 8 registers for smw.bi instruction
+	  (based on this nds32-multiple.md design).
+       3. Minimum 2 registers for smw.bi instruction
+	  (based on this nds32-multiple.md design).
+       4. operands[0] must be memory for sure.
+       5. operands[1] must be register for sure.
+       6. operands[0] is not volatile memory access.
+       7. Do not cross $r15 register because it is not allocatable.  */
+  if (GET_CODE (operands[2]) != CONST_INT
+      || INTVAL (operands[2]) > maximum
+      || INTVAL (operands[2]) < 2
+      || GET_CODE (operands[0]) != MEM
+      || GET_CODE (operands[1]) != REG
+      || MEM_VOLATILE_P (operands[0])
+      || REGNO (operands[1]) + INTVAL (operands[2]) > TA_REGNUM)
+    FAIL;
+
+  /* For (mem addr), we force_reg on addr here,
+     so that nds32_expand_store_multiple can easily use it.  */
+  operands[3] = nds32_expand_store_multiple (REGNO (operands[1]),
+					     INTVAL (operands[2]),
+					     force_reg (SImode,
+							XEXP (operands[0], 0)),
+					     operands[0],
+					     false, NULL);
+})
+
+;; Ordinary Store Multiple.
+(define_insn "*stm_bim_si25" ;; 25 SImode stores at [op2+0] .. [op2+96], plus op1 = op2 + 100.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 100)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 80)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 84)))
+	  (match_operand:SI 24 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 88)))
+	  (match_operand:SI 25 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 92)))
+	  (match_operand:SI 26 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 96)))
+	  (match_operand:SI 27 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 26)" ;; 25 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %27, 0x0" ;; %3 = first stored register, %27 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "25")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si24" ;; 24 SImode stores at [op2+0] .. [op2+92], plus op1 = op2 + 96.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 96)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 80)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 84)))
+	  (match_operand:SI 24 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 88)))
+	  (match_operand:SI 25 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 92)))
+	  (match_operand:SI 26 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 25)" ;; 24 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %26, 0x0" ;; %3 = first stored register, %26 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "24")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si23" ;; 23 SImode stores at [op2+0] .. [op2+88], plus op1 = op2 + 92.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 92)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 80)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 84)))
+	  (match_operand:SI 24 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 88)))
+	  (match_operand:SI 25 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 24)" ;; 23 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %25, 0x0" ;; %3 = first stored register, %25 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "23")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si22" ;; 22 SImode stores at [op2+0] .. [op2+84], plus op1 = op2 + 88.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 88)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 80)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 84)))
+	  (match_operand:SI 24 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 23)" ;; 22 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %24, 0x0" ;; %3 = first stored register, %24 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "22")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si21" ;; 21 SImode stores at [op2+0] .. [op2+80], plus op1 = op2 + 84.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 84)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 80)))
+	  (match_operand:SI 23 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 22)" ;; 21 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %23, 0x0" ;; %3 = first stored register, %23 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "21")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si20" ;; 20 SImode stores at [op2+0] .. [op2+76], plus op1 = op2 + 80.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 80)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 76)))
+	  (match_operand:SI 22 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 21)" ;; 20 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %22, 0x0" ;; %3 = first stored register, %22 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "20")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si19" ;; 19 SImode stores at [op2+0] .. [op2+72], plus op1 = op2 + 76.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 76)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 72)))
+	  (match_operand:SI 21 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 20)" ;; 19 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %21, 0x0" ;; %3 = first stored register, %21 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "19")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si18" ;; 18 SImode stores at [op2+0] .. [op2+68], plus op1 = op2 + 72.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 72)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 68)))
+	  (match_operand:SI 20 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 19)" ;; 18 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %20, 0x0" ;; %3 = first stored register, %20 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "18")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si17" ;; 17 SImode stores at [op2+0] .. [op2+64], plus op1 = op2 + 68.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 68)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 64)))
+	  (match_operand:SI 19 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 18)" ;; 17 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %19, 0x0" ;; %3 = first stored register, %19 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "17")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si16" ;; 16 SImode stores at [op2+0] .. [op2+60], plus op1 = op2 + 64.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 64)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 60)))
+	  (match_operand:SI 18 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 17)" ;; 16 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %18, 0x0" ;; %3 = first stored register, %18 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "16")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si15" ;; 15 SImode stores at [op2+0] .. [op2+56], plus op1 = op2 + 60.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r") ;; Updated base; the "1" constraint ties operand 2 to it.
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 60)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 56)))
+	  (match_operand:SI 17 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 16)" ;; 15 store sets + 1 base-update set.
+  "smw.bim\t%3, [%1], %17, 0x0" ;; %3 = first stored register, %17 = last stored register.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "15")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si14" ;; 14 SImode stores at op2+0 .. op2+52, plus op1 = op2 + 56.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 56)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 52)))
+	  (match_operand:SI 16 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 15)" ;; 1 base-update set + 14 store sets.
+  "smw.bim\t%3, [%1], %16, 0x0" ;; %3 = first stored reg, %16 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "14")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si13" ;; 13 SImode stores at op2+0 .. op2+48, plus op1 = op2 + 52.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 52)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 48)))
+	  (match_operand:SI 15 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 14)" ;; 1 base-update set + 13 store sets.
+  "smw.bim\t%3, [%1], %15, 0x0" ;; %3 = first stored reg, %15 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "13")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si12" ;; 12 SImode stores at op2+0 .. op2+44, plus op1 = op2 + 48.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 48)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 44)))
+	  (match_operand:SI 14 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 13)" ;; 1 base-update set + 12 store sets.
+  "smw.bim\t%3, [%1], %14, 0x0" ;; %3 = first stored reg, %14 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "12")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si11" ;; 11 SImode stores at op2+0 .. op2+40, plus op1 = op2 + 44.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 44)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 40)))
+	  (match_operand:SI 13 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 12)" ;; 1 base-update set + 11 store sets.
+  "smw.bim\t%3, [%1], %13, 0x0" ;; %3 = first stored reg, %13 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "11")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si10" ;; 10 SImode stores at op2+0 .. op2+36, plus op1 = op2 + 40.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 40)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 36)))
+	  (match_operand:SI 12 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 11)" ;; 1 base-update set + 10 store sets.
+  "smw.bim\t%3, [%1], %12, 0x0" ;; %3 = first stored reg, %12 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "10")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si9" ;; 9 SImode stores at op2+0 .. op2+32, plus op1 = op2 + 36.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 36)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 32)))
+	  (match_operand:SI 11 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 10)" ;; 1 base-update set + 9 store sets.
+  "smw.bim\t%3, [%1], %11, 0x0" ;; %3 = first stored reg, %11 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "9")
+   (set_attr "length"              "4")]
+)
+
+
+(define_insn "*stm_bim_si8" ;; 8 SImode stores at op2+0 .. op2+28, plus op1 = op2 + 32.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 32)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 28)))
+	  (match_operand:SI 10 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 9)" ;; 1 base-update set + 8 store sets.
+  "smw.bim\t%3, [%1], %10, 0x0" ;; %3 = first stored reg, %10 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "8")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si7" ;; 7 SImode stores at op2+0 .. op2+24, plus op1 = op2 + 28.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 28)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 24)))
+	  (match_operand:SI 9 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 8)" ;; 1 base-update set + 7 store sets.
+  "smw.bim\t%3, [%1], %9, 0x0" ;; %3 = first stored reg, %9 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "7")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si6" ;; 6 SImode stores at op2+0 .. op2+20, plus op1 = op2 + 24.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 24)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 20)))
+	  (match_operand:SI 8 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 7)" ;; 1 base-update set + 6 store sets.
+  "smw.bim\t%3, [%1], %8, 0x0" ;; %3 = first stored reg, %8 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "6")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si5" ;; 5 SImode stores at op2+0 .. op2+16, plus op1 = op2 + 20.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 20)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 16)))
+	  (match_operand:SI 7 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 6)" ;; 1 base-update set + 5 store sets.
+  "smw.bim\t%3, [%1], %7, 0x0" ;; %3 = first stored reg, %7 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "5")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si4" ;; 4 SImode stores at op2+0 .. op2+12, plus op1 = op2 + 16.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 16)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 12)))
+	  (match_operand:SI 6 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 5)" ;; 1 base-update set + 4 store sets.
+  "smw.bim\t%3, [%1], %6, 0x0" ;; %3 = first stored reg, %6 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "4")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si3" ;; 3 SImode stores at op2+0 .. op2+8, plus op1 = op2 + 12.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 12)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 8)))
+	  (match_operand:SI 5 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 4)" ;; 1 base-update set + 3 store sets.
+  "smw.bim\t%3, [%1], %5, 0x0" ;; %3 = first stored reg, %5 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "3")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stm_bim_si2" ;; 2 SImode stores at op2+0 and op2+4, plus op1 = op2 + 8.
+  [(match_parallel 0 "nds32_store_multiple_and_update_address_operation"
+    [(set (match_operand:SI 1 "register_operand" "=r")
+	  (plus:SI (match_operand:SI 2 "register_operand" "1") (const_int 8)))
+     (set (mem:SI (match_dup 2))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 2) (const_int 4)))
+	  (match_operand:SI 4 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 3)" ;; 1 base-update set + 2 store sets.
+  "smw.bim\t%3, [%1], %4, 0x0" ;; %3 = first stored reg, %4 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "2")
+   (set_attr "length"              "4")]
+)
+
+(define_expand "unaligned_store_update_base_w" ;; Emits unaligned_store_w on MEM[op1], then op0 = op1 + 4.
+  [(parallel [(set (match_operand:SI 0 "register_operand" "=r")
+		   (plus:SI (match_operand:SI 1 "register_operand" "0") (const_int 4)))
+	      (set (mem:SI (match_dup 1))
+		   (unspec:SI [(match_operand:SI 2 "register_operand" "r")] UNSPEC_UASTORE_W))])]
+  ""
+{
+  /* Do NOT emit unaligned_store_w_m directly: the web pass does not
+     recognize post_inc.  Revisit this after GCC 5.0.
+     REF: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63156  */
+  emit_insn (gen_unaligned_store_w (gen_rtx_MEM (SImode, operands[1]), operands[2]));
+  emit_insn (gen_addsi3 (operands[0], operands[1], gen_int_mode (4, Pmode)));
+  DONE; /* The two insns emitted above are the whole expansion.  */
+}
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "1")
+   (set_attr "length"              "4")]
+)
+
+(define_expand "unaligned_store_update_base_dw" ;; Emits unaligned_store_dw on MEM[op1], then op0 = op1 + 8.
+  [(parallel [(set (match_operand:SI 0 "register_operand" "=r")
+		   (plus:SI (match_operand:SI 1 "register_operand" "0") (const_int 8)))
+	      (set (mem:DI (match_dup 1))
+		   (unspec:DI [(match_operand:DI 2 "register_operand" "r")] UNSPEC_UASTORE_DW))])]
+  ""
+{
+  /* Do NOT emit unaligned_store_w_m directly: the web pass does not
+     recognize post_inc.  Revisit this after GCC 5.0.  NOTE(review): the "w" name may be a copy from the SImode expander; confirm.
+     REF: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63156  */
+  emit_insn (gen_unaligned_store_dw (gen_rtx_MEM (DImode, operands[1]), operands[2]));
+  emit_insn (gen_addsi3 (operands[0], operands[1], gen_int_mode (8, Pmode)));
+  DONE; /* The two insns emitted above are the whole expansion.  */
+}
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "2")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi25" ;; 25 SImode stores at op1+0 .. op1+96; the pattern has no base-register update.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))
+	  (match_operand:SI 2 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 80)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 84)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 88)))
+	  (match_operand:SI 24 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 92)))
+	  (match_operand:SI 25 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 96)))
+	  (match_operand:SI 26 "register_operand" ""))])]
+  "(XVECLEN (operands[0], 0) == 25)" ;; 25 store sets, no base-update set.
+  "smw.bi\t%2, [%1], %26, 0x0" ;; %2 = first stored reg, %26 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "25")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi24" ;; 24 SImode stores at op1+0 .. op1+92; the pattern has no base-register update.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))
+	  (match_operand:SI 2 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 80)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 84)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 88)))
+	  (match_operand:SI 24 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 92)))
+	  (match_operand:SI 25 "register_operand" ""))
+])]
+  "(XVECLEN (operands[0], 0) == 24)" ;; 24 store sets, no base-update set.
+  "smw.bi\t%2, [%1], %25, 0x0" ;; %2 = first stored reg, %25 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "24")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi23" ;; 23 SImode stores at op1+0 .. op1+88; the pattern has no base-register update.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))
+	  (match_operand:SI 2 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 80)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 84)))
+	  (match_operand:SI 23 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 88)))
+	  (match_operand:SI 24 "register_operand" ""))
+])]
+  "(XVECLEN (operands[0], 0) == 23)" ;; 23 store sets, no base-update set.
+  "smw.bi\t%2, [%1], %24, 0x0" ;; %2 = first stored reg, %24 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "23")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi22" ;; 22 SImode stores at op1+0 .. op1+84; the pattern has no base-register update.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))
+	  (match_operand:SI 2 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 80)))
+	  (match_operand:SI 22 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 84)))
+	  (match_operand:SI 23 "register_operand" ""))
+])]
+  "(XVECLEN (operands[0], 0) == 22)" ;; 22 store sets, no base-update set.
+  "smw.bi\t%2, [%1], %23, 0x0" ;; %2 = first stored reg, %23 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "22")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi21" ;; 21 SImode stores at op1+0 .. op1+80; the pattern has no base-register update.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))
+	  (match_operand:SI 2 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 80)))
+	  (match_operand:SI 22 "register_operand" ""))
+])]
+  "(XVECLEN (operands[0], 0) == 21)" ;; 21 store sets, no base-update set.
+  "smw.bi\t%2, [%1], %22, 0x0" ;; %2 = first stored reg, %22 = last, %1 = base.
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "21")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi20"  ;; Store 20 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 76)))
+	  (match_operand:SI 21 "register_operand" ""))])]  ;; operand 21: stored at offset 76
+  "(XVECLEN (operands[0], 0) == 20)"  ;; operand 0's vector must have exactly 20 elements
+  "smw.bi\t%2, [%1], %21, 0x0"  ;; prints only the first (%2) and last (%21) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "20")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi19"  ;; Store 19 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 72)))
+	  (match_operand:SI 20 "register_operand" ""))])]  ;; operand 20: stored at offset 72
+  "(XVECLEN (operands[0], 0) == 19)"  ;; operand 0's vector must have exactly 19 elements
+  "smw.bi\t%2, [%1], %20, 0x0"  ;; prints only the first (%2) and last (%20) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "19")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)
+
+(define_insn "*stmsi18"  ;; Store 18 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 68)))
+	  (match_operand:SI 19 "register_operand" ""))])]  ;; operand 19: stored at offset 68
+  "(XVECLEN (operands[0], 0) == 18)"  ;; operand 0's vector must have exactly 18 elements
+  "smw.bi\t%2, [%1], %19, 0x0"  ;; prints only the first (%2) and last (%19) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "18")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
 )

-(define_insn "*lmwsi4"
-  [(match_parallel 0 "nds32_load_multiple_operation"
-    [(set (match_operand:SI 2 "register_operand" "")
-	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
-     (set (match_operand:SI 3 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
-     (set (match_operand:SI 4 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))
-     (set (match_operand:SI 5 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 12))))])]
-  "(XVECLEN (operands[0], 0) == 4)"
-  "lmw.bi\t%2, [%1], %5, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+(define_insn "*stmsi17"  ;; Store 17 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 64)))
+	  (match_operand:SI 18 "register_operand" ""))])]  ;; operand 18: stored at offset 64
+  "(XVECLEN (operands[0], 0) == 17)"  ;; operand 0's vector must have exactly 17 elements
+  "smw.bi\t%2, [%1], %18, 0x0"  ;; prints only the first (%2) and last (%18) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "17")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
 )

-(define_insn "*lmwsi3"
-  [(match_parallel 0 "nds32_load_multiple_operation"
-    [(set (match_operand:SI 2 "register_operand" "")
-	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
-     (set (match_operand:SI 3 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))
-     (set (match_operand:SI 4 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 8))))])]
-  "(XVECLEN (operands[0], 0) == 3)"
-  "lmw.bi\t%2, [%1], %4, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+(define_insn "*stmsi16"  ;; Store 16 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 60)))
+	  (match_operand:SI 17 "register_operand" ""))])]  ;; operand 17: stored at offset 60
+  "(XVECLEN (operands[0], 0) == 16)"  ;; operand 0's vector must have exactly 16 elements
+  "smw.bi\t%2, [%1], %17, 0x0"  ;; prints only the first (%2) and last (%17) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "16")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
 )

-(define_insn "*lmwsi2"
-  [(match_parallel 0 "nds32_load_multiple_operation"
-    [(set (match_operand:SI 2 "register_operand" "")
-	  (mem:SI (match_operand:SI 1 "register_operand" "r")))
-     (set (match_operand:SI 3 "register_operand" "")
-	  (mem:SI (plus:SI (match_dup 1) (const_int 4))))])]
-  "(XVECLEN (operands[0], 0) == 2)"
-  "lmw.bi\t%2, [%1], %3, 0x0"
-  [(set_attr "type"   "load")
-   (set_attr "length"    "4")]
+(define_insn "*stmsi15"  ;; Store 15 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 56)))
+	  (match_operand:SI 16 "register_operand" ""))])]  ;; operand 16: stored at offset 56
+  "(XVECLEN (operands[0], 0) == 15)"  ;; operand 0's vector must have exactly 15 elements
+  "smw.bi\t%2, [%1], %16, 0x0"  ;; prints only the first (%2) and last (%16) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "15")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
 )

+(define_insn "*stmsi14"  ;; Store 14 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 52)))
+	  (match_operand:SI 15 "register_operand" ""))])]  ;; operand 15: stored at offset 52
+  "(XVECLEN (operands[0], 0) == 14)"  ;; operand 0's vector must have exactly 14 elements
+  "smw.bi\t%2, [%1], %15, 0x0"  ;; prints only the first (%2) and last (%15) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "14")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

-;; Store Multiple Insns.
-;;
-;; operands[0] is the first memory location.
-;; opernads[1] is the first of the consecutive registers.
-;; operands[2] is the number of consecutive registers.
-
-(define_expand "store_multiple"
-  [(match_par_dup 3 [(set (match_operand:SI 0 "" "")
-			  (match_operand:SI 1 "" ""))
-		     (use (match_operand:SI 2 "" ""))])]
-  ""
-{
-  int maximum;
+(define_insn "*stmsi13"  ;; Store 13 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 48)))
+	  (match_operand:SI 14 "register_operand" ""))])]  ;; operand 14: stored at offset 48
+  "(XVECLEN (operands[0], 0) == 13)"  ;; operand 0's vector must have exactly 13 elements
+  "smw.bi\t%2, [%1], %14, 0x0"  ;; prints only the first (%2) and last (%14) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "13")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

-  /* Because reduced-set regsiters has few registers
-     (r0~r5, r6~10, r15, r28~r31, where 'r15' and 'r28~r31' cannot
-     be used for register allocation),
-     using 8 registers for store_multiple may easily consume all of them.
-     It makes register allocation/spilling hard to work.
-     So we only allow maximum=4 registers for store_multiple
-     under reduced-set registers.  */
-  if (TARGET_REDUCED_REGS)
-    maximum = 4;
-  else
-    maximum = 8;
+(define_insn "*stmsi12"  ;; Store 12 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 44)))
+	  (match_operand:SI 13 "register_operand" ""))])]  ;; operand 13: stored at offset 44
+  "(XVECLEN (operands[0], 0) == 12)"  ;; operand 0's vector must have exactly 12 elements
+  "smw.bi\t%2, [%1], %13, 0x0"  ;; prints only the first (%2) and last (%13) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "12")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

-  /* Here are the conditions that must be all passed,
-     otherwise we have to FAIL this rtx generation:
-       1. The number of consecutive registers must be integer.
-       2. Maximum 4 or 8 registers for smw.bi instruction
-          (based on this nds32-multiple.md design).
-       3. Minimum 2 registers for smw.bi instruction
-          (based on this nds32-multiple.md design).
-       4. operands[0] must be memory for sure.
-       5. operands[1] must be register for sure.
-       6. Do not cross $r15 register because it is not allocatable.  */
-  if (GET_CODE (operands[2]) != CONST_INT
-      || INTVAL (operands[2]) > maximum
-      || INTVAL (operands[2]) < 2
-      || GET_CODE (operands[0]) != MEM
-      || GET_CODE (operands[1]) != REG
-      || REGNO (operands[1]) + INTVAL (operands[2]) > TA_REGNUM)
-    FAIL;
+(define_insn "*stmsi11"  ;; Store 11 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 40)))
+	  (match_operand:SI 12 "register_operand" ""))])]  ;; operand 12: stored at offset 40
+  "(XVECLEN (operands[0], 0) == 11)"  ;; operand 0's vector must have exactly 11 elements
+  "smw.bi\t%2, [%1], %12, 0x0"  ;; prints only the first (%2) and last (%12) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "11")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

-  /* For (mem addr), we force_reg on addr here,
-     so that nds32_expand_store_multiple can easily use it.  */
-  operands[3] = nds32_expand_store_multiple (REGNO (operands[1]),
-					     INTVAL (operands[2]),
-					     force_reg (SImode,
-							XEXP (operands[0], 0)),
-					     operands[0]);
-})
+(define_insn "*stmsi10"  ;; Store 10 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2 "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 36)))
+	  (match_operand:SI 11 "register_operand" ""))])]  ;; operand 11: stored at offset 36
+  "(XVECLEN (operands[0], 0) == 10)"  ;; operand 0's vector must have exactly 10 elements
+  "smw.bi\t%2, [%1], %11, 0x0"  ;; prints only the first (%2) and last (%11) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"              "10")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

-;; Ordinary Store Multiple.
+(define_insn "*stmsi9"  ;; Store 9 SImode registers to consecutive words with one smw.bi.
+  [(match_parallel 0 "nds32_store_multiple_operation"
+    [(set (mem:SI (match_operand:SI 1 "register_operand" "r"))  ;; operand 1: base address
+	  (match_operand:SI 2  "register_operand" ""))  ;; operand 2: stored at offset 0
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 4)))
+	  (match_operand:SI 3  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 8)))
+	  (match_operand:SI 4  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 12)))
+	  (match_operand:SI 5  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 16)))
+	  (match_operand:SI 6  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 20)))
+	  (match_operand:SI 7  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 24)))
+	  (match_operand:SI 8  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 28)))
+	  (match_operand:SI 9  "register_operand" ""))
+     (set (mem:SI (plus:SI (match_dup 1) (const_int 32)))
+	  (match_operand:SI 10 "register_operand" ""))])]  ;; operand 10: stored at offset 32
+  "(XVECLEN (operands[0], 0) == 9)"  ;; operand 0's vector must have exactly 9 elements
+  "smw.bi\t%2, [%1], %10, 0x0"  ;; prints only the first (%2) and last (%10) stored operands
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "9")  ;; same as the number of stores above
+   (set_attr "length"              "4")]
+)

 (define_insn "*stmsi8"
   [(match_parallel 0 "nds32_store_multiple_operation"
@@ -276,8 +3634,9 @@
 	  (match_operand:SI 9 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 8)"
   "smw.bi\t%2, [%1], %9, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "8")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi7"
@@ -298,8 +3657,9 @@
 	  (match_operand:SI 8 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 7)"
   "smw.bi\t%2, [%1], %8, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "7")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi6"
@@ -318,8 +3678,9 @@
 	  (match_operand:SI 7 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 6)"
   "smw.bi\t%2, [%1], %7, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "6")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi5"
@@ -336,8 +3697,9 @@
 	  (match_operand:SI 6 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 5)"
   "smw.bi\t%2, [%1], %6, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "5")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi4"
@@ -352,8 +3714,9 @@
 	  (match_operand:SI 5 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 4)"
   "smw.bi\t%2, [%1], %5, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "4")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi3"
@@ -366,8 +3729,9 @@
 	  (match_operand:SI 4 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 3)"
   "smw.bi\t%2, [%1], %4, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "3")
+   (set_attr "length"              "4")]
 )

 (define_insn "*stmsi2"
@@ -378,8 +3742,9 @@
 	  (match_operand:SI 3 "register_operand" ""))])]
   "(XVECLEN (operands[0], 0) == 2)"
   "smw.bi\t%2, [%1], %3, 0x0"
-  [(set_attr "type"   "store")
-   (set_attr "length"     "4")]
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "2")
+   (set_attr "length"              "4")]
 )

 ;; Move a block of memory if it is word aligned and MORE than 2 words long.
@@ -391,14 +3756,14 @@
 ;; operands[2] is the number of bytes to move.
 ;; operands[3] is the known shared alignment.

-(define_expand "movmemqi"
+(define_expand "movmemsi"
   [(match_operand:BLK 0 "general_operand" "")
    (match_operand:BLK 1 "general_operand" "")
-   (match_operand:SI 2 "const_int_operand" "")
+   (match_operand:SI 2 "nds32_reg_constant_operand" "")
    (match_operand:SI 3 "const_int_operand" "")]
   ""
 {
-  if (nds32_expand_movmemqi (operands[0],
+  if (nds32_expand_movmemsi (operands[0],
 			     operands[1],
 			     operands[2],
 			     operands[3]))
@@ -408,3 +3773,75 @@
 })

 ;; ------------------------------------------------------------------------
+
+(define_insn "lmwzb"  ;; One-word UNSPEC_LMWZB load that also advances the address register by 4.
+  [(set (match_operand:SI 0 "register_operand" "=r")  ;; operand 0: updated address
+	(plus:SI (match_operand:SI 1 "register_operand" "0") (const_int 4)))  ;; "0" ties operand 1 to operand 0
+   (set (match_operand:SI 2 "register_operand" "=r")  ;; operand 2: destination register
+	(unspec:SI [(mem:SI (match_dup 1))] UNSPEC_LMWZB))]  ;; reads the word addressed by operand 1
+  ""
+  "lmwzb.bm\t%2, [%1], %2, 0x0"  ;; operand 2 is printed in both register slots
+  [(set_attr "type"    "load_multiple")
+   (set_attr "combo"               "1")
+   (set_attr "length"              "4")]
+)
+
+(define_insn "smwzb"  ;; One-word UNSPEC_SMWZB store that also advances the address register by 4.
+  [(set (match_operand:SI 0 "register_operand" "=r")  ;; operand 0: updated address
+	(plus:SI (match_operand:SI 1 "register_operand" "0") (const_int 4)))  ;; "0" ties operand 1 to operand 0
+   (set (mem:SI (match_dup 1))  ;; writes the word addressed by operand 1
+	(unspec:SI [(match_operand:SI 2 "register_operand" "r")] UNSPEC_SMWZB))]  ;; operand 2: source register
+  ""
+  "smwzb.bm\t%2, [%1], %2, 0x0"  ;; operand 2 is printed in both register slots
+  [(set_attr "type"   "store_multiple")
+   (set_attr "combo"               "1")
+   (set_attr "length"              "4")]
+)
+
+(define_expand "movstr"  ;; Expander that delegates entirely to nds32_expand_movstr.
+  [(match_operand:SI 0 "register_operand" "")  ;; per GCC's movstr standard name: receives the end address
+   (match_operand:BLK 1 "memory_operand" "")  ;; per GCC's movstr standard name: destination string
+   (match_operand:BLK 2 "memory_operand" "")]  ;; per GCC's movstr standard name: source string
+  "TARGET_EXT_STRING && TARGET_INLINE_STRCPY"  ;; available only when both target flags hold
+{
+  if (nds32_expand_movstr (operands[0],
+			   operands[1],
+			   operands[2]))
+    DONE;  /* Helper returned nonzero: expansion is finished.  */
+
+  FAIL;  /* Helper returned zero: give up on this expander.  */
+})
+
+(define_expand "strlensi"  ;; Expander that delegates entirely to nds32_expand_strlen.
+  [(match_operand:SI  0 "register_operand")  ;; per GCC's strlenM standard name: the result
+   (match_operand:BLK 1 "memory_operand")  ;; per GCC's strlenM standard name: the string
+   (match_operand:QI  2 "nds32_reg_constant_operand")  ;; per GCC's strlenM standard name: character searched for
+   (match_operand     3 "const_int_operand")]  ;; per GCC's strlenM standard name: known alignment
+  "TARGET_EXT_STRING"  ;; available only when this target flag holds
+{
+  if (nds32_expand_strlen (operands[0], operands[1], operands[2], operands[3]))
+    DONE;  /* Helper returned nonzero: expansion is finished.  */
+
+  FAIL;  /* Helper returned zero: give up on this expander.  */
+})
+
+(define_expand "setmemsi"  ;; Expander that delegates entirely to nds32_expand_setmem.
+   [(use (match_operand:BLK 0 "memory_operand"))  ;; per GCC's setmemM standard name: destination block
+    (use (match_operand:SI 1 "nds32_reg_constant_operand"))  ;; per GCC's setmemM standard name: byte count
+    (use (match_operand:QI 2 "nonmemory_operand"))  ;; per GCC's setmemM standard name: fill value
+    (use (match_operand 3 "const_int_operand"))  ;; per GCC's setmemM standard name: known alignment
+    (use (match_operand:SI 4 "const_int_operand"))  ;; NOTE(review): presumably the expected-alignment hint - confirm
+    (use (match_operand:SI 5 "const_int_operand"))]  ;; NOTE(review): presumably the expected-size hint - confirm
+  ""
+{
+ if (nds32_expand_setmem (operands[0], operands[1],
+			  operands[2], operands[3],
+			  operands[4], operands[5]))
+   DONE;  /* Helper returned nonzero: expansion is finished.  */
+
+ FAIL;  /* Helper returned zero: give up on this expander.  */
+})
+
+
+
+;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32-n10.md b/gcc/config/nds32/nds32-n10.md
new file mode 100644
index 0000000..7261608
--- /dev/null
+++ b/gcc/config/nds32/nds32-n10.md
@@ -0,0 +1,439 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N10 pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n10_machine")
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;; II - Instruction Issue / Instruction Decode
+;; EX - Instruction Execution
+;; MM - Memory Execution
+;; WB - Instruction Retire / Result Write-Back
+
+(define_cpu_unit "n10_ii" "nds32_n10_machine")
+(define_cpu_unit "n10_ex" "nds32_n10_machine")
+(define_cpu_unit "n10_mm" "nds32_n10_machine")
+(define_cpu_unit "n10_wb" "nds32_n10_machine")
+(define_cpu_unit "n10f_iq" "nds32_n10_machine")
+(define_cpu_unit "n10f_rf" "nds32_n10_machine")
+(define_cpu_unit "n10f_e1" "nds32_n10_machine")
+(define_cpu_unit "n10f_e2" "nds32_n10_machine")
+(define_cpu_unit "n10f_e3" "nds32_n10_machine")
+(define_cpu_unit "n10f_e4" "nds32_n10_machine")
+
+(define_insn_reservation "nds_n10_unknown" 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_misc" 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_mmu" 1
+  (and (eq_attr "type" "mmu")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_alu" 1
+  (and (eq_attr "type" "alu")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_alu_shift" 1
+  (and (eq_attr "type" "alu_shift")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ii+n10_ex, n10_ex+n10_mm, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_pbsad" 1
+  (and (eq_attr "type" "pbsad")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex*3, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_pbsada" 1
+  (and (eq_attr "type" "pbsada")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex*3, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_load" 1
+  (and (match_test "nds32::load_single_p (insn)")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_store" 1
+  (and (match_test "nds32::store_single_p (insn)")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_1" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "1")))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_2" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)")))
+  "n10_ii, n10_ii+n10_ex, n10_ex+n10_mm, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_3" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "3")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_4" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "4")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, n10_ii+n10_ex+n10_mm+n10_wb, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_5" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "5")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*2, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_6" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "6")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*3, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_7" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "7")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*4, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_load_multiple_N" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "load_multiple")
+	    (match_test "get_attr_combo (insn) >= 8")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*5, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_1" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "1")))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_2" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)")))
+  "n10_ii, n10_ii+n10_ex, n10_ex+n10_mm, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_3" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "3")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_4" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "4")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, n10_ii+n10_ex+n10_mm+n10_wb, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_5" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "5")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*2, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_6" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "6")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*3, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_7" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "7")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*4, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_store_multiple_N" 1
+  (and (eq_attr "pipeline_model" "n10")
+       (and (eq_attr "type" "store_multiple")
+	    (match_test "get_attr_combo (insn) >= 8")))
+  "n10_ii, n10_ii+n10_ex, n10_ii+n10_ex+n10_mm, (n10_ii+n10_ex+n10_mm+n10_wb)*5, n10_ex+n10_mm+n10_wb, n10_mm+n10_wb, n10_wb")
+
+(define_insn_reservation "nds_n10_mul" 1
+  (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_mac" 1
+  (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_div" 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex*34, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_branch" 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_alu" 1
+  (and (eq_attr "type" "dalu")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_alu64" 1
+  (and (eq_attr "type" "dalu64")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_alu_round" 1
+  (and (eq_attr "type" "daluround")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_cmp" 1
+  (and (eq_attr "type" "dcmp")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_clip" 1
+  (and (eq_attr "type" "dclip")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_mul" 1
+  (and (eq_attr "type" "dmul")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_mac" 1
+  (and (eq_attr "type" "dmac")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_insb" 1
+  (and (eq_attr "type" "dinsb")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_pack" 1
+  (and (eq_attr "type" "dpack")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_bpick" 1
+  (and (eq_attr "type" "dbpick")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_dsp_wext" 1
+  (and (eq_attr "type" "dwext")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ex, n10_mm, n10_wb")
+
+(define_insn_reservation "nds_n10_fpu_alu" 4
+  (and (eq_attr "type" "falu")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_muls" 4
+  (and (eq_attr "type" "fmuls")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_muld" 4
+  (and (eq_attr "type" "fmuld")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2*2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_macs" 4
+  (and (eq_attr "type" "fmacs")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2*3, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_macd" 4
+  (and (eq_attr "type" "fmacd")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2*4, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_divs" 4
+  (and (ior (eq_attr "type" "fdivs")
+	    (eq_attr "type" "fsqrts"))
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2*14, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_divd" 4
+  (and (ior (eq_attr "type" "fdivd")
+	    (eq_attr "type" "fsqrtd"))
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2*28, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_fast_alu" 2
+  (and (ior (eq_attr "type" "fcmp")
+	    (ior (eq_attr "type" "fabs")
+		 (ior (eq_attr "type" "fcpy")
+		      (eq_attr "type" "fcmov"))))
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_fmtsr" 4
+  (and (eq_attr "type" "fmtsr")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_fmtdr" 4
+  (and (eq_attr "type" "fmtdr")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ii+n10f_iq, n10f_iq+n10f_rf, n10f_rf+n10f_e1, n10f_e1+n10f_e2, n10f_e2+n10f_e3, n10f_e3+n10f_e4, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_fmfsr" 2
+  (and (eq_attr "type" "fmfsr")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_fmfdr" 2
+  (and (eq_attr "type" "fmfdr")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10_ii+n10f_iq, n10f_iq+n10f_rf, n10f_rf+n10f_e1, n10f_e1+n10f_e2, n10f_e2+n10f_e3, n10f_e3+n10f_e4, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_load" 3
+  (and (eq_attr "type" "fload")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+(define_insn_reservation "nds_n10_fpu_store" 1
+  (and (eq_attr "type" "fstore")
+       (eq_attr "pipeline_model" "n10"))
+  "n10_ii, n10f_iq, n10f_rf, n10f_e1, n10f_e2, n10f_e3, n10f_e4")
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD
+;;     Load data from the memory and produce the loaded data. The result is
+;;     ready at MM.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at MM.
+;;   MUL, MAC
+;;     Compute data in the multiply-adder and produce the data. The result
+;;     is ready at MM.
+;;   DIV
+;;     Compute data in the divider and produce the data. The result is ready
+;;     at MM.
+;;
+;; Consumers (RHS)
+;;   ALU, MOVD44, PBSAD, PBSADA_RaRb, MUL, MAC, DIV, MMU
+;;     Require operands at EX.
+;;   ALU_SHIFT_Rb
+;;     An ALU-SHIFT instruction consists of a shift micro-operation followed
+;;     by an arithmetic micro-operation. The operand Rb is used by the first
+;;     micro-operation, and there are some latencies if data dependency occurs.
+;;   MAC_RaRb
+;;     A MAC instruction does multiplication at EX and does accumulation at MM,
+;;     so the operand Rt is required at MM, and operands Ra and Rb are required
+;;     at EX.
+;;   ADDR_IN
+;;     If an instruction requires an address as its input operand, the address
+;;     is required at EX.
+;;   ST
+;;     A store instruction requires its data at MM.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at MM.
+;;   BR
+;;     If a branch instruction is conditional, its input data is required at EX.
+
+;; FPU_ADDR_OUT -> FPU_ADDR_IN
+;; Main pipeline rules don't need this because their default latency is 1.
+(define_bypass 1
+  "nds_n10_fpu_load, nds_n10_fpu_store"
+  "nds_n10_fpu_load, nds_n10_fpu_store"
+  "nds32_n10_ex_to_ex_p"
+)
+
+;; LD, MUL, MAC, DIV, DALU64, DMUL, DMAC, DALUROUND, DBPICK, DWEXT
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU,
+;;      DALU, DALUROUND, DMUL, DMAC_RaRb, DPACK, DINSB, DCMP, DCLIP, WEXT_O, BPICK_RaRb
+(define_bypass 2
+  "nds_n10_load, nds_n10_mul, nds_n10_mac, nds_n10_div,\
+   nds_n10_dsp_alu64, nds_n10_dsp_mul, nds_n10_dsp_mac,\
+   nds_n10_dsp_alu_round, nds_n10_dsp_bpick, nds_n10_dsp_wext"
+  "nds_n10_alu, nds_n10_alu_shift,\
+   nds_n10_pbsad, nds_n10_pbsada,\
+   nds_n10_mul, nds_n10_mac, nds_n10_div,\
+   nds_n10_branch,\
+   nds_n10_load, nds_n10_store,\
+   nds_n10_load_multiple_1, nds_n10_load_multiple_2, nds_n10_load_multiple_3,\
+   nds_n10_load_multiple_4, nds_n10_load_multiple_5, nds_n10_load_multiple_6,\
+   nds_n10_load_multiple_7, nds_n10_load_multiple_N,\
+   nds_n10_store_multiple_1, nds_n10_store_multiple_2, nds_n10_store_multiple_3,\
+   nds_n10_store_multiple_4, nds_n10_store_multiple_5, nds_n10_store_multiple_6,\
+   nds_n10_store_multiple_7, nds_n10_store_multiple_N,\
+   nds_n10_mmu,\
+   nds_n10_dsp_alu, nds_n10_dsp_alu_round,\
+   nds_n10_dsp_mul, nds_n10_dsp_mac, nds_n10_dsp_pack,\
+   nds_n10_dsp_insb, nds_n10_dsp_cmp, nds_n10_dsp_clip,\
+   nds_n10_dsp_wext, nds_n10_dsp_bpick"
+  "nds32_n10_mm_to_ex_p"
+)
+
+;; LMW(N, N)
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU
+;;      DALU, DALUROUND, DMUL, DMAC_RaRb, DPACK, DINSB, DCMP, DCLIP, WEXT_O, BPICK_RaRb
+(define_bypass 2
+  "nds_n10_load_multiple_1, nds_n10_load_multiple_2, nds_n10_load_multiple_3,\
+   nds_n10_load_multiple_4, nds_n10_load_multiple_5, nds_n10_load_multiple_6,\
+   nds_n10_load_multiple_7, nds_n10_load_multiple_N"
+  "nds_n10_alu, nds_n10_alu_shift,\
+   nds_n10_pbsad, nds_n10_pbsada,\
+   nds_n10_mul, nds_n10_mac, nds_n10_div,\
+   nds_n10_branch,\
+   nds_n10_load, nds_n10_store,\
+   nds_n10_load_multiple_1, nds_n10_load_multiple_2, nds_n10_load_multiple_3,\
+   nds_n10_load_multiple_4, nds_n10_load_multiple_5, nds_n10_load_multiple_6,\
+   nds_n10_load_multiple_7, nds_n10_load_multiple_N,\
+   nds_n10_store_multiple_1, nds_n10_store_multiple_2, nds_n10_store_multiple_3,\
+   nds_n10_store_multiple_4, nds_n10_store_multiple_5, nds_n10_store_multiple_6,\
+   nds_n10_store_multiple_7, nds_n10_store_multiple_N,\
+   nds_n10_mmu,\
+   nds_n10_dsp_alu, nds_n10_dsp_alu_round,\
+   nds_n10_dsp_mul, nds_n10_dsp_mac, nds_n10_dsp_pack,\
+   nds_n10_dsp_insb, nds_n10_dsp_cmp, nds_n10_dsp_clip,\
+   nds_n10_dsp_wext, nds_n10_dsp_bpick"
+  "nds32_n10_last_load_to_ex_p"
+)
diff --git a/gcc/config/nds32/nds32-n13.md b/gcc/config/nds32/nds32-n13.md
new file mode 100644
index 0000000..622480d
--- /dev/null
+++ b/gcc/config/nds32/nds32-n13.md
@@ -0,0 +1,401 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N13 pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n13_machine")
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; F1 - Instruction Fetch First
+;;   Instruction Tag/Data Arrays
+;;   ITLB Address Translation
+;;   Branch Target Buffer Prediction
+;; F2 - Instruction Fetch Second
+;;   Instruction Cache Hit Detection
+;;   Cache Way Selection
+;;   Instruction Alignment
+;; I1 - Instruction Issue First / Instruction Decode
+;;   Instruction Cache Replay Triggering
+;;   32/16-Bit Instruction Decode
+;;   Return Address Stack Prediction
+;; I2 - Instruction Issue Second / Register File Access
+;;   Instruction Issue Logic
+;;   Register File Access
+;; E1 - Instruction Execute First / Address Generation / MAC First
+;;   Data Access Address generation
+;;   Multiply Operation
+;; E2 - Instruction Execute Second / Data Access First / MAC Second /
+;;      ALU Execute
+;;   Skewed ALU
+;;   Branch/Jump/Return Resolution
+;;   Data Tag/Data arrays
+;;   DTLB address translation
+;;   Accumulation Operation
+;; E3 - Instruction Execute Third / Data Access Second
+;;   Data Cache Hit Detection
+;;   Cache Way Selection
+;;   Data Alignment
+;; E4 - Instruction Execute Fourth / Write Back
+;;   Interruption Resolution
+;;   Instruction Retire
+;;   Register File Write Back
+
+(define_cpu_unit "n13_i1" "nds32_n13_machine")
+(define_cpu_unit "n13_i2" "nds32_n13_machine")
+(define_cpu_unit "n13_e1" "nds32_n13_machine")
+(define_cpu_unit "n13_e2" "nds32_n13_machine")
+(define_cpu_unit "n13_e3" "nds32_n13_machine")
+(define_cpu_unit "n13_e4" "nds32_n13_machine")
+
+(define_insn_reservation "nds_n13_unknown" 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_misc" 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_mmu" 1
+  (and (eq_attr "type" "mmu")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_alu" 1
+  (and (eq_attr "type" "alu")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_alu_shift" 1
+  (and (eq_attr "type" "alu_shift")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i2+n13_e1, n13_e1+n13_e2, n13_e2+n13_e3, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_pbsad" 1
+  (and (eq_attr "type" "pbsad")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2*2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_pbsada" 1
+  (and (eq_attr "type" "pbsada")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2*3, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_load" 1
+  (and (match_test "nds32::load_single_p (insn)")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_store" 1
+  (and (match_test "nds32::store_single_p (insn)")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_1" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_2" 1
+  (and (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i2+n13_e1, n13_e1+n13_e2, n13_e2+n13_e3, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_3" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i2+n13_e1+n13_e2, n13_e1+n13_e2+n13_e3, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_4" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i2+n13_e1+n13_e2+n13_e3, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_5" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_6" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_7" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*2, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_8" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*3, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_load_multiple_12" 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*7, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_1" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_2" 1
+  (and (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i2+n13_e1, n13_e1+n13_e2, n13_e2+n13_e3, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_3" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i2+n13_e1+n13_e2, n13_e1+n13_e2+n13_e3, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_4" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i2+n13_e1+n13_e2+n13_e3, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_5" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_6" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_7" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*2, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_8" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*3, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+(define_insn_reservation "nds_n13_store_multiple_12" 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i1+n13_i2, n13_i1+n13_i2+n13_e1, n13_i1+n13_i2+n13_e1+n13_e2, n13_i1+n13_i2+n13_e1+n13_e2+n13_e3, (n13_i1+n13_i2+n13_e1+n13_e2+n13_e3+n13_e4)*7, n13_i2+n13_e1+n13_e2+n13_e3+n13_e4, n13_e1+n13_e2+n13_e3+n13_e4, n13_e2+n13_e3+n13_e4, n13_e3+n13_e4, n13_e4")
+
+;; The multiplier at E1 takes two cycles.
+(define_insn_reservation "nds_n13_mul" 1
+  (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1*2, n13_e2, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_mac" 1
+  (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1*2, n13_e2, n13_e3, n13_e4")
+
+;; The cycles consumed at E2 are 32 - CLZ(abs(Ra)) + 2,
+;; so the worst case is 34.
+(define_insn_reservation "nds_n13_div" 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2*34, n13_e3, n13_e4")
+
+(define_insn_reservation "nds_n13_branch" 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "n13"))
+  "n13_i1, n13_i2, n13_e1, n13_e2, n13_e3, n13_e4")
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD
+;;     Load data from the memory and produce the loaded data. The result is
+;;     ready at E3.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at E3.
+;;   ADDR_OUT
+;;     Most load/store instructions can produce an address output if updating
+;;     the base register is required. The result is ready at E2, which is
+;;     produced by ALU.
+;;   ALU, ALU_SHIFT, SIMD
+;;     Compute data in ALU and produce the data. The result is ready at E2.
+;;   MUL, MAC
+;;     Compute data in the multiply-adder and produce the data. The result
+;;     is ready at E2.
+;;   DIV
+;;     Compute data in the divider and produce the data. The result is ready
+;;     at E2.
+;;   BR
+;;     Branch-with-link instructions produce a result containing the return
+;;     address. The result is ready at E2.
+;;
+;; Consumers (RHS)
+;;   ALU
+;;     General ALU instructions require operands at E2.
+;;   ALU_E1
+;;     Some special ALU instructions, such as BSE, BSP and MOVD44, require
+;;     operand at E1.
+;;   MUL, DIV, PBSAD, MMU
+;;     Operands are required at E1.
+;;   PBSADA_Rt, PBSADA_RaRb
+;;     Operands Ra and Rb are required at E1, and the operand Rt is required
+;;     at E2.
+;;   ALU_SHIFT_Rb
+;;     An ALU-SHIFT instruction consists of a shift micro-operation followed
+;;     by an arithmetic micro-operation. The operand Rb is used by the first
+;;     micro-operation, and there are some latencies if data dependency occurs.
+;;   MAC_RaRb
+;;     A MAC instruction does multiplication at E1 and does accumulation at E2,
+;;     so the operand Rt is required at E2, and operands Ra and Rb are required
+;;     at E1.
+;;   ADDR_IN
+;;     If an instruction requires an address as its input operand, the address
+;;     is required at E1.
+;;   ST
+;;     A store instruction requires its data at E2.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at E2.
+;;   BR
+;;     If a branch instruction is conditional, its input data is required at E2.
+
+;; LD -> ALU_E1, PBSAD, PBSADA_RaRb, MUL, MAC_RaRb, DIV, MMU, ADDR_IN
+(define_bypass 3  ;; latency 3 for the pairs below, only when the guard returns nonzero
+  "nds_n13_load"  ;; producer
+  "nds_n13_alu, nds_n13_pbsad, nds_n13_pbsada,\
+   nds_n13_mul, nds_n13_mac, nds_n13_div,\
+   nds_n13_mmu,\
+   nds_n13_load, nds_n13_store,\
+   nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_load_to_e1_p"  ;; guard function, defined outside this file
+)
+
+;; LD -> ALU, ALU_SHIFT_Rb, PBSADA_Rt, BR, ST, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n13_load"  ;; producer
+  "nds_n13_alu, nds_n13_alu_shift, nds_n13_pbsada, nds_n13_branch, nds_n13_store,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_load_to_e2_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N) -> ALU_E1, PBSAD, PBSADA_RaRb, MUL, MAC_RaRb, DIV, MMU, ADDR_IN
+(define_bypass 3  ;; latency 3 for the pairs below, only when the guard returns nonzero
+  "nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12"  ;; producers
+  "nds_n13_alu, nds_n13_pbsad, nds_n13_pbsada,\
+   nds_n13_mul, nds_n13_mac, nds_n13_div,\
+   nds_n13_mmu,\
+   nds_n13_load, nds_n13_store,\
+   nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_last_load_to_e1_p")  ;; guard function, defined outside this file
+
+;; LMW(N, N) -> ALU, ALU_SHIFT_Rb, PBSADA_Rt, BR, ST, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12"  ;; producers
+  "nds_n13_alu, nds_n13_alu_shift, nds_n13_pbsada, nds_n13_branch, nds_n13_store,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_last_load_to_e2_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N - 1) -> ALU_E1, PBSAD, PBSADA_RaRb, MUL, MAC_RaRb, DIV, MMU, ADDR_IN
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12"  ;; producers
+  "nds_n13_alu, nds_n13_pbsad, nds_n13_pbsada,\
+   nds_n13_mul, nds_n13_mac, nds_n13_div,\
+   nds_n13_mmu,\
+   nds_n13_load, nds_n13_store,\
+   nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_last_two_load_to_e1_p")  ;; guard function, defined outside this file
+
+;; ALU, ALU_SHIFT, SIMD, BR, MUL, MAC, DIV, ADDR_OUT
+;;   ->  ALU_E1, PBSAD, PBSADA_RaRb, MUL, MAC_RaRb, DIV, MMU, ADDR_IN
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n13_alu, nds_n13_alu_shift, nds_n13_pbsad, nds_n13_pbsada, nds_n13_branch,\
+   nds_n13_mul, nds_n13_mac, nds_n13_div,\
+   nds_n13_load, nds_n13_store,\
+   nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"  ;; producers
+  "nds_n13_alu, nds_n13_pbsad, nds_n13_pbsada,\
+   nds_n13_mul, nds_n13_mac, nds_n13_div,\
+   nds_n13_mmu,\
+   nds_n13_load, nds_n13_store,\
+   nds_n13_load_multiple_1,nds_n13_load_multiple_2, nds_n13_load_multiple_3,\
+   nds_n13_load_multiple_4,nds_n13_load_multiple_5, nds_n13_load_multiple_6,\
+   nds_n13_load_multiple_7,nds_n13_load_multiple_8, nds_n13_load_multiple_12,\
+   nds_n13_store_multiple_1,nds_n13_store_multiple_2, nds_n13_store_multiple_3,\
+   nds_n13_store_multiple_4,nds_n13_store_multiple_5, nds_n13_store_multiple_6,\
+   nds_n13_store_multiple_7,nds_n13_store_multiple_8, nds_n13_store_multiple_12"
+  "nds32_n13_e2_to_e1_p")  ;; guard function, defined outside this file
diff --git a/gcc/config/nds32/nds32-n7.md b/gcc/config/nds32/nds32-n7.md
new file mode 100644
index 0000000..ff788ce
--- /dev/null
+++ b/gcc/config/nds32/nds32-n7.md
@@ -0,0 +1,298 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N7 pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n7_machine")  ;; automaton that owns the N7 cpu unit declared in this file
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;;   Instruction Alignment
+;;   Instruction Pre-decode
+;; II - Instruction Issue
+;;   Instruction Decode
+;;   Register File Access
+;;   Instruction Execution
+;;   Interrupt Handling
+;; EXD - Pseudo Stage
+;;   Load Data Completion
+
+(define_cpu_unit "n7_ii" "nds32_n7_machine")  ;; the only unit N7 reservations in this file use
+
+(define_insn_reservation "nds_n7_unknown" 1  ;; default latency 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_misc" 1  ;; default latency 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_alu" 1  ;; default latency 1
+  (and (eq_attr "type" "alu")
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_load" 1  ;; default latency 1; bypasses below raise it for some pairs
+  (and (match_test "nds32::load_single_p (insn)")  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_store" 1  ;; default latency 1
+  (and (match_test "nds32::store_single_p (insn)")  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_load_multiple_1" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_load_multiple_2" 1  ;; default latency 1
+  (and (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)"))  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*2")  ;; n7_ii is held for 2 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_3" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*3")  ;; n7_ii is held for 3 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_4" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*4")  ;; n7_ii is held for 4 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_5" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*5")  ;; n7_ii is held for 5 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_6" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*6")  ;; n7_ii is held for 6 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_7" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*7")  ;; n7_ii is held for 7 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_8" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*8")  ;; n7_ii is held for 8 consecutive cycles
+
+(define_insn_reservation "nds_n7_load_multiple_12" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*12")  ;; n7_ii is held for 12 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_1" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_store_multiple_2" 1  ;; default latency 1
+  (and (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)"))  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*2")  ;; n7_ii is held for 2 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_3" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*3")  ;; n7_ii is held for 3 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_4" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*4")  ;; n7_ii is held for 4 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_5" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*5")  ;; n7_ii is held for 5 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_6" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*6")  ;; n7_ii is held for 6 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_7" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*7")  ;; n7_ii is held for 7 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_8" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*8")  ;; n7_ii is held for 8 consecutive cycles
+
+(define_insn_reservation "nds_n7_store_multiple_12" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*12")  ;; n7_ii is held for 12 consecutive cycles
+
+(define_insn_reservation "nds_n7_mul_fast" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")  ;; any multiplier setting except the slow one
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n7")))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+(define_insn_reservation "nds_n7_mul_slow" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")  ;; slow multiplier setting only
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n7")))
+  "n7_ii*17")  ;; n7_ii is held for 17 consecutive cycles
+
+(define_insn_reservation "nds_n7_mac_fast" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")  ;; any multiplier setting except the slow one
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n7")))
+  "n7_ii*2")  ;; n7_ii is held for 2 consecutive cycles
+
+(define_insn_reservation "nds_n7_mac_slow" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")  ;; slow multiplier setting only
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n7")))
+  "n7_ii*18")  ;; n7_ii is held for 18 consecutive cycles
+
+(define_insn_reservation "nds_n7_div" 1  ;; default latency 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii*37")  ;; n7_ii is held for 37 consecutive cycles
+
+(define_insn_reservation "nds_n7_branch" 1  ;; default latency 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "n7"))
+  "n7_ii")  ;; n7_ii is held for one cycle
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD_!bi
+;;     Load data from the memory (without updating the base register) and
+;;     produce the loaded data. The result is ready at EXD.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at EXD. If the base register should be
+;;     updated, an extra micro-operation is inserted to the sequence, and the
+;;     result is ready at II.
+;;
+;; Consumers (RHS)
+;;   ALU, MUL, DIV
+;;     Require operands at II.
+;;   MOVD44_E
+;;     A double-word move instruction needs two micro-operations because the
+;;     register port is 2R1W. The first micro-operation writes an even number
+;;     register, and the second micro-operation writes an odd number register.
+;;     Each input operand is required at II for each micro-operation. The letter
+;;     'E' stands for even.
+;;   MAC_RaRb
+;;     A MAC instruction is separated into two micro-operations. The first
+;;     micro-operation does the multiplication, which requires operands Ra
+;;     and Rb at II. The second micro-operation does the accumulation, which
+;;     requires the operand Rt at II.
+;;   ADDR_IN_MOP(N)
+;;     Because the register port is 2R1W, some load/store instructions are
+;;     separated into many micro-operations. N denotes the address input is
+;;     required by the N-th micro-operation. Such operand is required at II.
+;;   ST_bi
+;;     A post-increment store instruction requires its data at II.
+;;   ST_!bi_RI
+;;     A store instruction with an immediate offset requires its data at II.
+;;     If the offset field is a register (ST_!bi_RR), the instruction will be
+;;     separated into two micro-operations, and the second one requires the
+;;     input operand at II in order to store it to the memory.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at II. If the base
+;;     register should be updated, an extra micro-operation is inserted to the
+;;     sequence.
+;;   BR_COND
+;;     If a branch instruction is conditional, its input data is required at II.
+
+;; LD_!bi
+;;   -> ALU, MOVD44_E, MUL, MAC_RaRb, DIV, BR, ADDR_IN_MOP(1), ST_bi, ST_!bi_RI, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n7_load"  ;; producer
+  "nds_n7_alu,\
+   nds_n7_mul_fast, nds_n7_mul_slow,\
+   nds_n7_mac_fast, nds_n7_mac_slow,\
+   nds_n7_div,\
+   nds_n7_branch,\
+   nds_n7_load, nds_n7_store,\
+   nds_n7_load_multiple_1,nds_n7_load_multiple_2, nds_n7_load_multiple_3,\
+   nds_n7_load_multiple_4,nds_n7_load_multiple_5, nds_n7_load_multiple_6,\
+   nds_n7_load_multiple_7,nds_n7_load_multiple_8, nds_n7_load_multiple_12,\
+   nds_n7_store_multiple_1,nds_n7_store_multiple_2, nds_n7_store_multiple_3,\
+   nds_n7_store_multiple_4,nds_n7_store_multiple_5, nds_n7_store_multiple_6,\
+   nds_n7_store_multiple_7,nds_n7_store_multiple_8, nds_n7_store_multiple_12"
+  "nds32_n7_load_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N)
+;;   -> ALU, MOVD44_E, MUL, MAC_RaRb, DIV, BR, ADDR_IN_MOP(1), ST_bi, ST_!bi_RI, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n7_load_multiple_1,nds_n7_load_multiple_2, nds_n7_load_multiple_3,\
+   nds_n7_load_multiple_4,nds_n7_load_multiple_5, nds_n7_load_multiple_6,\
+   nds_n7_load_multiple_7,nds_n7_load_multiple_8, nds_n7_load_multiple_12"  ;; producers
+  "nds_n7_alu,\
+   nds_n7_mul_fast, nds_n7_mul_slow,\
+   nds_n7_mac_fast, nds_n7_mac_slow,\
+   nds_n7_div,\
+   nds_n7_branch,\
+   nds_n7_load, nds_n7_store,\
+   nds_n7_load_multiple_1,nds_n7_load_multiple_2, nds_n7_load_multiple_3,\
+   nds_n7_load_multiple_4,nds_n7_load_multiple_5, nds_n7_load_multiple_6,\
+   nds_n7_load_multiple_7,nds_n7_load_multiple_8, nds_n7_load_multiple_12,\
+   nds_n7_store_multiple_1,nds_n7_store_multiple_2, nds_n7_store_multiple_3,\
+   nds_n7_store_multiple_4,nds_n7_store_multiple_5, nds_n7_store_multiple_6,\
+   nds_n7_store_multiple_7,nds_n7_store_multiple_8, nds_n7_store_multiple_12"
+  "nds32_n7_last_load_to_ii_p"  ;; guard function, defined outside this file
+)
diff --git a/gcc/config/nds32/nds32-n8.md b/gcc/config/nds32/nds32-n8.md
new file mode 100644
index 0000000..c3db9cd
--- /dev/null
+++ b/gcc/config/nds32/nds32-n8.md
@@ -0,0 +1,389 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N8 pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n8_machine")  ;; automaton that owns the N8 cpu units declared in this file
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;; II - Instruction Issue / Address Generation
+;; EX - Instruction Execution
+;; EXD - Pseudo Stage / Load Data Completion
+
+(define_cpu_unit "n8_ii" "nds32_n8_machine")  ;; unit for the II stage
+(define_cpu_unit "n8_ex" "nds32_n8_machine")  ;; unit for the EX stage
+
+(define_insn_reservation "nds_n8_unknown" 1  ;; default latency 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_misc" 1  ;; default latency 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_alu" 1  ;; default latency 1
+  (and (eq_attr "type" "alu")
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_load" 1  ;; default latency 1; bypasses below raise it for some pairs
+  (and (match_test "nds32::load_single_p (insn)")  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_store" 1  ;; default latency 1
+  (and (match_test "nds32::store_single_p (insn)")  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_load_multiple_1" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_load_multiple_2" 1  ;; default latency 1
+  (and (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)"))  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ii+n8_ex, n8_ex")  ;; II alone, then II and EX together for 1 cycle, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_3" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*2, n8_ex")  ;; II alone, then II and EX together for 2 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_4" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*3, n8_ex")  ;; II alone, then II and EX together for 3 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_5" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*4, n8_ex")  ;; II alone, then II and EX together for 4 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_6" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*5, n8_ex")  ;; II alone, then II and EX together for 5 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_7" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*6, n8_ex")  ;; II alone, then II and EX together for 6 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_8" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*7, n8_ex")  ;; II alone, then II and EX together for 7 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_load_multiple_12" 1  ;; default latency 1
+  (and (and (eq_attr "type" "load_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*11, n8_ex")  ;; II alone, then II and EX together for 11 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_1" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_store_multiple_2" 1  ;; default latency 1
+  (and (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)"))  ;; helper defined outside this file
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ii+n8_ex, n8_ex")  ;; II alone, then II and EX together for 1 cycle, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_3" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*2, n8_ex")  ;; II alone, then II and EX together for 2 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_4" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*3, n8_ex")  ;; II alone, then II and EX together for 3 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_5" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*4, n8_ex")  ;; II alone, then II and EX together for 4 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_6" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "6"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*5, n8_ex")  ;; II alone, then II and EX together for 5 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_7" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*6, n8_ex")  ;; II alone, then II and EX together for 6 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_8" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*7, n8_ex")  ;; II alone, then II and EX together for 7 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_store_multiple_12" 1  ;; default latency 1
+  (and (and (eq_attr "type" "store_multiple")
+	    (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*11, n8_ex")  ;; II alone, then II and EX together for 11 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_mul_fast" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")  ;; any multiplier setting except the slow one
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n8")))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+(define_insn_reservation "nds_n8_mul_slow" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")  ;; slow multiplier setting only
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n8")))
+  "n8_ii, n8_ex*16")  ;; II for one cycle, then EX for 16 consecutive cycles
+
+(define_insn_reservation "nds_n8_mac_fast" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config != MUL_TYPE_SLOW")  ;; any multiplier setting except the slow one
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n8")))
+  "n8_ii, n8_ii+n8_ex, n8_ex")  ;; II alone, then II and EX together for 1 cycle, then EX alone
+
+(define_insn_reservation "nds_n8_mac_slow" 1  ;; default latency 1
+  (and (match_test "nds32_mul_config == MUL_TYPE_SLOW")  ;; slow multiplier setting only
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n8")))
+  "n8_ii, (n8_ii+n8_ex)*16, n8_ex")  ;; II alone, then II and EX together for 16 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_div" 1  ;; default latency 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, (n8_ii+n8_ex)*36, n8_ex")  ;; II alone, then II and EX together for 36 cycles, then EX alone
+
+(define_insn_reservation "nds_n8_branch" 1  ;; default latency 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "n8"))
+  "n8_ii, n8_ex")  ;; II in the first cycle, EX in the next
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD_!bi
+;;     Load data from the memory (without updating the base register) and
+;;     produce the loaded data. The result is ready at EXD.
+;;   LD_bi
+;;     Load data from the memory (with updating the base register) and
+;;     produce the loaded data. The result is ready at EXD. Because the
+;;     register port is 2R1W, two micro-operations are required in order
+;;     to write two registers. The base register is updated by the second
+;;     micro-operation and the result is ready at EX.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at EXD. If the base register should be
+;;     updated, an extra micro-operation is inserted to the sequence, and the
+;;     result is ready at EX.
+;;   ADDR_OUT
+;;     Most load/store instructions can produce an address output if updating
+;;     the base register is required. The result is ready at EX, which is
+;;     produced by ALU.
+;;   ALU, MUL, MAC
+;;     The result is ready at EX.
+;;   MOVD44_O
+;;     A double-word move instruction needs to write registers twice. Because
+;;     the register port is 2R1W, two micro-operations are required. The even
+;;     number register is updated by the first one, and the odd number register
+;;     is updated by the second one. Each of the results is ready at EX.
+;;     The letter 'O' stands for odd.
+;;   DIV_Rs
+;;     A division instruction saves the quotient result to Rt and saves the
+;;     remainder result to Rs. It requires two micro-operations because the
+;;     register port is 2R1W. The first micro-operation writes to Rt, and
+;;     the second one writes to Rs. Each of the results is ready at EX.
+;;
+;; Consumers (RHS)
+;;   ALU, MUL, DIV
+;;     Require operands at EX.
+;;   MOVD44_E
+;;     The letter 'E' stands for even, which is accessed by the first micro-
+;;     operation and a movd44 instruction. The operand is required at EX.
+;;   MAC_RaRb
+;;     A MAC instruction is separated into two micro-operations. The first
+;;     micro-operation does the multiplication, which requires operands Ra
+;;     and Rb at EX. The second micro-operation does the accumulation, which
+;;     requires the operand Rt at EX.
+;;   ADDR_IN_MOP(N)
+;;     Because the register port is 2R1W, some load/store instructions are
+;;     separated into many micro-operations. N denotes the address input is
+;;     required by the N-th micro-operation. Such operand is required at II.
+;;   ST_bi
+;;     A post-increment store instruction requires its data at EX.
+;;   ST_!bi_RI
+;;     A store instruction with an immediate offset requires its data at EX.
+;;     If the offset field is a register (ST_!bi_RR), the instruction will be
+;;     separated into two micro-operations, and the second one requires the
+;;     input operand at EX in order to store it to the memory.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at EX. If the base
+;;     register should be updated, an extra micro-operation is inserted to the
+;;     sequence.
+;;   BR_COND
+;;     If a branch instruction is conditional, its input data is required at EX.
+
+;; LD_!bi -> ADDR_IN_MOP(1)
+(define_bypass 3  ;; latency 3 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load"  ;; producer
+  "nds_n8_branch,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_load_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N) -> ADDR_IN_MOP(1)
+(define_bypass 3  ;; latency 3 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12"  ;; producers
+  "nds_n8_branch,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_last_load_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N - 1) -> ADDR_IN_MOP(1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12"  ;; producers
+  "nds_n8_branch,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_last_load_two_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LD_bi -> ADDR_IN_MOP(1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load"  ;; producer
+  "nds_n8_branch,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_load_bi_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LD_!bi -> ALU, MOVD44_E, MUL, MAC_RaRb, DIV, BR_COND, ST_bi, ST_!bi_RI, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load"  ;; producer
+  "nds_n8_alu,\
+   nds_n8_mul_fast, nds_n8_mul_slow,\
+   nds_n8_mac_fast, nds_n8_mac_slow,\
+   nds_n8_div,\
+   nds_n8_branch,\
+   nds_n8_store,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_load_to_ex_p"  ;; guard function, defined outside this file
+)
+
+;; ALU, MOVD44_O, MUL, MAC, DIV_Rs, LD_bi, ADDR_OUT -> ADDR_IN_MOP(1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n8_alu,\
+   nds_n8_mul_fast, nds_n8_mul_slow,\
+   nds_n8_mac_fast, nds_n8_mac_slow,\
+   nds_n8_div,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"  ;; producers
+  "nds_n8_branch,\
+   nds_n8_load, nds_n8_store,\
+   nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_ex_to_ii_p"  ;; guard function, defined outside this file
+)
+
+;; LMW(N, N) -> ALU, MOVD44_E, MUL, MAC_RaRb, DIV, BR_COND, ST_bi, ST_!bi_RI, SMW(N, 1)
+(define_bypass 2  ;; latency 2 for the pairs below, only when the guard returns nonzero
+  "nds_n8_load_multiple_1,nds_n8_load_multiple_2, nds_n8_load_multiple_3,\
+   nds_n8_load_multiple_4,nds_n8_load_multiple_5, nds_n8_load_multiple_6,\
+   nds_n8_load_multiple_7,nds_n8_load_multiple_8, nds_n8_load_multiple_12"  ;; producers
+  "nds_n8_alu,\
+   nds_n8_mul_fast, nds_n8_mul_slow,\
+   nds_n8_mac_fast, nds_n8_mac_slow,\
+   nds_n8_div,\
+   nds_n8_branch,\
+   nds_n8_store,\
+   nds_n8_store_multiple_1,nds_n8_store_multiple_2, nds_n8_store_multiple_3,\
+   nds_n8_store_multiple_4,nds_n8_store_multiple_5, nds_n8_store_multiple_6,\
+   nds_n8_store_multiple_7,nds_n8_store_multiple_8, nds_n8_store_multiple_12"
+  "nds32_n8_last_load_to_ex_p"  ;; guard function, defined outside this file
+)
diff --git a/gcc/config/nds32/nds32-n9-2r1w.md b/gcc/config/nds32/nds32-n9-2r1w.md
new file mode 100644
index 0000000..d0db953
--- /dev/null
+++ b/gcc/config/nds32/nds32-n9-2r1w.md
@@ -0,0 +1,362 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N9 2R1W pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n9_2r1w_machine")
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;; II - Instruction Issue / Instruction Decode
+;; EX - Instruction Execution
+;; MM - Memory Execution
+;; WB - Instruction Retire / Result Write-Back
+
+(define_cpu_unit "n9_2r1w_ii" "nds32_n9_2r1w_machine")
+(define_cpu_unit "n9_2r1w_ex" "nds32_n9_2r1w_machine")
+(define_cpu_unit "n9_2r1w_mm" "nds32_n9_2r1w_machine")
+(define_cpu_unit "n9_2r1w_wb" "nds32_n9_2r1w_machine")
+
+(define_insn_reservation "nds_n9_2r1w_unknown" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "unknown")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_misc" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "misc")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_mmu" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "mmu")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_alu" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "alu")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_alu_shift" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "alu_shift")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_pbsad" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "pbsad")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex*3, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_pbsada" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "pbsada")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex*3, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (match_test "nds32::load_single_p (insn)")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (match_test "nds32::store_single_p (insn)")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "1"))))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (ior (and (eq_attr "type" "load_multiple")
+		      (eq_attr "combo" "2"))
+		 (match_test "nds32::load_double_p (insn)"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_3" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "3"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_4" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "4"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_5" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "5"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*2, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_6" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "6"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*3, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_7" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "7"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*4, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_8" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "8"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*5, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_load_multiple_12" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "12"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*9, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "1"))))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (ior (and (eq_attr "type" "store_multiple")
+		      (eq_attr "combo" "2"))
+		 (match_test "nds32::store_double_p (insn)"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_3" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "3"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_4" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "4"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_5" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "5"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*2, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_6" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "6"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*3, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_7" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "7"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*4, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_8" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "8"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*5, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_store_multiple_12" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "12"))))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm, (n9_2r1w_ii+n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb)*9, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_mul_fast" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W && nds32_mul_config != MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_mul_slow" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W && nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex*17, n9_2r1w_mm, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_mac_fast" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W && nds32_mul_config != MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ii+n9_2r1w_ex, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_mac_slow" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W && nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, (n9_2r1w_ii+n9_2r1w_ex)*17, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_ex+n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_div" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "div")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, (n9_2r1w_ii+n9_2r1w_ex)*34, n9_2r1w_ex+n9_2r1w_mm, n9_2r1w_mm+n9_2r1w_wb, n9_2r1w_wb")
+
+(define_insn_reservation "nds_n9_2r1w_branch" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_2R1W")
+       (and (eq_attr "type" "branch")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_2r1w_ii, n9_2r1w_ex, n9_2r1w_mm, n9_2r1w_wb")
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD_!bi
+;;     Load data from the memory (without updating the base register) and
+;;     produce the loaded data. The result is ready at MM. Because the register
+;;     port is 2R1W, two micro-operations are required if the base register
+;;     should be updated. In this case, the base register is updated by the
+;;     second micro-operation, and the updated result is ready at EX.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at MM.  If the base register should be
+;;     updated, an extra micro-operation is appended to the end of the
+;;     sequence, and the result is ready at EX.
+;;   MUL, MAC
+;;     Compute data in the multiply-adder and produce the data. The result
+;;     is ready at MM.
+;;   DIV
+;;     Compute data in the divider and produce the data. The result is ready
+;;     at MM.
+;;
+;; Consumers (RHS)
+;;   ALU, PBSAD, PBSADA_RaRb, MUL, MAC, DIV, MMU
+;;     Require operands at EX.
+;;   ALU_SHIFT_Rb
+;;     An ALU-SHIFT instruction consists of a shift micro-operation followed
+;;     by an arithmetic micro-operation. The operand Rb is used by the first
+;;     micro-operation, and there are some latencies if data dependency occurs.
+;;   MOVD44_E
+;;     A double-word move instruction needs two micro-operations because the
+;;     register port is 2R1W. The first micro-operation writes an even number
+;;     register, and the second micro-operation writes an odd number register.
+;;     Each input operand is required at EX for each micro-operation. MOVD44_E
+;;     stands for the first micro-operation.
+;;   MAC_RaRb, M2R
+;;     MAC instructions do multiplication at EX and do accumulation at MM, but
+;;     MAC instructions which operate on general purpose registers always
+;;     require operands at EX because MM stage cannot be forwarded in 2R1W mode.
+;;   ADDR_IN
+;;     If an instruction requires an address as its input operand, the address
+;;     is required at EX.
+;;   ST_bi
+;;     A post-increment store instruction requires its data at EX because MM
+;;     cannot be forwarded in 2R1W mode.
+;;   ST_!bi_RI
+;;     A store instruction with an immediate offset requires its data at EX
+;;     because MM cannot be forwarded in 2R1W mode. If the offset field is a
+;;     register (ST_!bi_RR), the instruction will be separated into two micro-
+;;     operations, and the second one requires the input operand at EX in order
+;;     to store it to the memory.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at MM.
+;;   BR
+;;     If a branch instruction is conditional, its input data is required at EX.
+
+;; LD_!bi, MUL, MAC
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44_E, MUL, MAC_RaRb, M2R, DIV, ADDR_IN_!bi, ADDR_IN_bi_Ra, ST_bi, ST_!bi_RI, BR, MMU
+(define_bypass 2
+  "nds_n9_2r1w_load,\
+   nds_n9_2r1w_mul_fast, nds_n9_2r1w_mul_slow,\
+   nds_n9_2r1w_mac_fast, nds_n9_2r1w_mac_slow"
+  "nds_n9_2r1w_alu, nds_n9_2r1w_alu_shift,\
+   nds_n9_2r1w_pbsad, nds_n9_2r1w_pbsada,\
+   nds_n9_2r1w_mul_fast, nds_n9_2r1w_mul_slow,\
+   nds_n9_2r1w_mac_fast, nds_n9_2r1w_mac_slow,\
+   nds_n9_2r1w_branch,\
+   nds_n9_2r1w_div,\
+   nds_n9_2r1w_load,nds_n9_2r1w_store,\
+   nds_n9_2r1w_load_multiple_1,nds_n9_2r1w_load_multiple_2, nds_n9_2r1w_load_multiple_3,\
+   nds_n9_2r1w_load_multiple_4,nds_n9_2r1w_load_multiple_5, nds_n9_2r1w_load_multiple_6,\
+   nds_n9_2r1w_load_multiple_7,nds_n9_2r1w_load_multiple_8, nds_n9_2r1w_load_multiple_12,\
+   nds_n9_2r1w_store_multiple_1,nds_n9_2r1w_store_multiple_2, nds_n9_2r1w_store_multiple_3,\
+   nds_n9_2r1w_store_multiple_4,nds_n9_2r1w_store_multiple_5, nds_n9_2r1w_store_multiple_6,\
+   nds_n9_2r1w_store_multiple_7,nds_n9_2r1w_store_multiple_8, nds_n9_2r1w_store_multiple_12,\
+   nds_n9_2r1w_mmu"
+  "nds32_n9_2r1w_mm_to_ex_p"
+)
+
+;; LMW(N, N)
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44_E, MUL, MAC_RaRb, M2R, DIV, ADDR_IN_!bi, ADDR_IN_bi_Ra, ST_bi, ST_!bi_RI, BR, MMU
+(define_bypass 2
+  "nds_n9_2r1w_load_multiple_1,nds_n9_2r1w_load_multiple_2, nds_n9_2r1w_load_multiple_3,\
+   nds_n9_2r1w_load_multiple_4,nds_n9_2r1w_load_multiple_5, nds_n9_2r1w_load_multiple_6,\
+   nds_n9_2r1w_load_multiple_7,nds_n9_2r1w_load_multiple_8, nds_n9_2r1w_load_multiple_12"
+  "nds_n9_2r1w_alu, nds_n9_2r1w_alu_shift,\
+   nds_n9_2r1w_pbsad, nds_n9_2r1w_pbsada,\
+   nds_n9_2r1w_mul_fast, nds_n9_2r1w_mul_slow,\
+   nds_n9_2r1w_mac_fast, nds_n9_2r1w_mac_slow,\
+   nds_n9_2r1w_branch,\
+   nds_n9_2r1w_div,\
+   nds_n9_2r1w_load,nds_n9_2r1w_store,\
+   nds_n9_2r1w_load_multiple_1,nds_n9_2r1w_load_multiple_2, nds_n9_2r1w_load_multiple_3,\
+   nds_n9_2r1w_load_multiple_4,nds_n9_2r1w_load_multiple_5, nds_n9_2r1w_load_multiple_6,\
+   nds_n9_2r1w_load_multiple_7,nds_n9_2r1w_load_multiple_8, nds_n9_2r1w_load_multiple_12,\
+   nds_n9_2r1w_store_multiple_1,nds_n9_2r1w_store_multiple_2, nds_n9_2r1w_store_multiple_3,\
+   nds_n9_2r1w_store_multiple_4,nds_n9_2r1w_store_multiple_5, nds_n9_2r1w_store_multiple_6,\
+   nds_n9_2r1w_store_multiple_7,nds_n9_2r1w_store_multiple_8, nds_n9_2r1w_store_multiple_12,\
+   nds_n9_2r1w_mmu"
+  "nds32_n9_last_load_to_ex_p"
+)
diff --git a/gcc/config/nds32/nds32-n9-3r2w.md b/gcc/config/nds32/nds32-n9-3r2w.md
new file mode 100644
index 0000000..7849c72
--- /dev/null
+++ b/gcc/config/nds32/nds32-n9-3r2w.md
@@ -0,0 +1,357 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; ------------------------------------------------------------------------
+;; Define N9 3R2W pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_n9_3r2w_machine")
+
+;; ------------------------------------------------------------------------
+;; Pipeline Stages
+;; ------------------------------------------------------------------------
+;; IF - Instruction Fetch
+;; II - Instruction Issue / Instruction Decode
+;; EX - Instruction Execution
+;; MM - Memory Execution
+;; WB - Instruction Retire / Result Write-Back
+
+(define_cpu_unit "n9_3r2w_ii" "nds32_n9_3r2w_machine")
+(define_cpu_unit "n9_3r2w_ex" "nds32_n9_3r2w_machine")
+(define_cpu_unit "n9_3r2w_mm" "nds32_n9_3r2w_machine")
+(define_cpu_unit "n9_3r2w_wb" "nds32_n9_3r2w_machine")
+
+(define_insn_reservation "nds_n9_3r2w_unknown" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "unknown")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_misc" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "misc")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mmu" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "mmu")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_alu" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "alu")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_alu_shift" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "alu_shift")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_pbsad" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "pbsad")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*3, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_pbsada" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "pbsada")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*3, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (match_test "nds32::load_single_p (insn)")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (match_test "nds32::store_single_p (insn)")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "1"))))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (ior (and (eq_attr "type" "load_multiple")
+		      (eq_attr "combo" "2"))
+		 (match_test "nds32::load_double_p (insn)"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_3" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "3"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_4" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "4"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_5" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "5"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*2, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_6" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "6"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*3, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_7" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "7"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*4, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_8" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "8"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*5, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_load_multiple_12" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "12"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*9, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "1"))))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (ior (and (eq_attr "type" "store_multiple")
+		      (eq_attr "combo" "2"))
+		 (match_test "nds32::store_double_p (insn)"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_3" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "3"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_4" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "4"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_5" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "5"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*2, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_6" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "6"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*3, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_7" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "7"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*4, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_8" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "8"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*5, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_store_multiple_12" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "pipeline_model" "n9")
+	    (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "12"))))
+  "n9_3r2w_ii, n9_3r2w_ii+n9_3r2w_ex, n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm, (n9_3r2w_ii+n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb)*9, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_mm+n9_3r2w_wb, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mul_fast1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_FAST_1")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mul_fast2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_FAST_2")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*2, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mul_slow" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mul")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*17, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mac_fast1" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_FAST_1")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mac_fast2" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_FAST_2")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*2, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_mac_slow" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W && nds32_mul_config == MUL_TYPE_SLOW")
+       (and (eq_attr "type" "mac")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*17, n9_3r2w_ex+n9_3r2w_mm, n9_3r2w_ex+n9_3r2w_mm+n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_div" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "div")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex*34, n9_3r2w_mm, n9_3r2w_wb")
+
+(define_insn_reservation "nds_n9_3r2w_branch" 1
+  (and (match_test "nds32_register_ports_config == REG_PORT_3R2W")
+       (and (eq_attr "type" "branch")
+	    (eq_attr "pipeline_model" "n9")))
+  "n9_3r2w_ii, n9_3r2w_ex, n9_3r2w_mm, n9_3r2w_wb")
+
+;; ------------------------------------------------------------------------
+;; Comment Notations and Bypass Rules
+;; ------------------------------------------------------------------------
+;; Producers (LHS)
+;;   LD
+;;     Load data from the memory and produce the loaded data. The result is
+;;     ready at MM.
+;;   LMW(N, M)
+;;     There are N micro-operations within an instruction that loads multiple
+;;     words. The result produced by the M-th micro-operation is sent to
+;;     consumers. The result is ready at MM.
+;;   MUL, MAC
+;;     Compute data in the multiply-adder and produce the data. The result
+;;     is ready at MM.
+;;   DIV
+;;     Compute data in the divider and produce the data. The result is ready
+;;     at MM.
+;;
+;; Consumers (RHS)
+;;   ALU, MOVD44, PBSAD, PBSADA_RaRb, MUL, MAC, DIV, MMU
+;;     Require operands at EX.
+;;   ALU_SHIFT_Rb
+;;     An ALU-SHIFT instruction consists of a shift micro-operation followed
+;;     by an arithmetic micro-operation. The operand Rb is used by the first
+;;     micro-operation, and there are some latencies if data dependency occurs.
+;;   MAC_RaRb
+;;     A MAC instruction does multiplication at EX and does accumulation at MM,
+;;     so the operand Rt is required at MM, and operands Ra and Rb are required
+;;     at EX.
+;;   ADDR_IN
+;;     If an instruction requires an address as its input operand, the address
+;;     is required at EX.
+;;   ST
+;;     A store instruction requires its data at MM.
+;;   SMW(N, M)
+;;     There are N micro-operations within an instruction that stores multiple
+;;     words. Each M-th micro-operation requires its data at MM.
+;;   BR
+;;     If a branch instruction is conditional, its input data is required at EX.
+
+;; LD, MUL, MAC, DIV
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU
+(define_bypass 2
+  "nds_n9_3r2w_load,\
+   nds_n9_3r2w_mul_fast1, nds_n9_3r2w_mul_fast2, nds_n9_3r2w_mul_slow,\
+   nds_n9_3r2w_mac_fast1, nds_n9_3r2w_mac_fast2, nds_n9_3r2w_mac_slow,\
+   nds_n9_3r2w_div"
+  "nds_n9_3r2w_alu, nds_n9_3r2w_alu_shift,\
+   nds_n9_3r2w_pbsad, nds_n9_3r2w_pbsada,\
+   nds_n9_3r2w_mul_fast1, nds_n9_3r2w_mul_fast2, nds_n9_3r2w_mul_slow,\
+   nds_n9_3r2w_mac_fast1, nds_n9_3r2w_mac_fast2, nds_n9_3r2w_mac_slow,\
+   nds_n9_3r2w_branch,\
+   nds_n9_3r2w_div,\
+   nds_n9_3r2w_load,nds_n9_3r2w_store,\
+   nds_n9_3r2w_load_multiple_1,nds_n9_3r2w_load_multiple_2, nds_n9_3r2w_load_multiple_3,\
+   nds_n9_3r2w_load_multiple_4,nds_n9_3r2w_load_multiple_5, nds_n9_3r2w_load_multiple_6,\
+   nds_n9_3r2w_load_multiple_7,nds_n9_3r2w_load_multiple_8, nds_n9_3r2w_load_multiple_12,\
+   nds_n9_3r2w_store_multiple_1,nds_n9_3r2w_store_multiple_2, nds_n9_3r2w_store_multiple_3,\
+   nds_n9_3r2w_store_multiple_4,nds_n9_3r2w_store_multiple_5, nds_n9_3r2w_store_multiple_6,\
+   nds_n9_3r2w_store_multiple_7,nds_n9_3r2w_store_multiple_8, nds_n9_3r2w_store_multiple_12,\
+   nds_n9_3r2w_mmu"
+  "nds32_n9_3r2w_mm_to_ex_p"
+)
+
+;; LMW(N, N)
+;;   -> ALU, ALU_SHIFT_Rb, PBSAD, PBSADA_RaRb, MOVD44, MUL, MAC_RaRb, DIV, ADDR_IN, BR, MMU
+(define_bypass 2
+  "nds_n9_3r2w_load_multiple_1,nds_n9_3r2w_load_multiple_2, nds_n9_3r2w_load_multiple_3,\
+   nds_n9_3r2w_load_multiple_4,nds_n9_3r2w_load_multiple_5, nds_n9_3r2w_load_multiple_6,\
+   nds_n9_3r2w_load_multiple_7,nds_n9_3r2w_load_multiple_8, nds_n9_3r2w_load_multiple_12"
+  "nds_n9_3r2w_alu, nds_n9_3r2w_alu_shift,\
+   nds_n9_3r2w_pbsad, nds_n9_3r2w_pbsada,\
+   nds_n9_3r2w_mul_fast1, nds_n9_3r2w_mul_fast2, nds_n9_3r2w_mul_slow,\
+   nds_n9_3r2w_mac_fast1, nds_n9_3r2w_mac_fast2, nds_n9_3r2w_mac_slow,\
+   nds_n9_3r2w_branch,\
+   nds_n9_3r2w_div,\
+   nds_n9_3r2w_load,nds_n9_3r2w_store,\
+   nds_n9_3r2w_load_multiple_1,nds_n9_3r2w_load_multiple_2, nds_n9_3r2w_load_multiple_3,\
+   nds_n9_3r2w_load_multiple_4,nds_n9_3r2w_load_multiple_5, nds_n9_3r2w_load_multiple_6,\
+   nds_n9_3r2w_load_multiple_7,nds_n9_3r2w_load_multiple_8, nds_n9_3r2w_load_multiple_12,\
+   nds_n9_3r2w_store_multiple_1,nds_n9_3r2w_store_multiple_2, nds_n9_3r2w_store_multiple_3,\
+   nds_n9_3r2w_store_multiple_4,nds_n9_3r2w_store_multiple_5, nds_n9_3r2w_store_multiple_6,\
+   nds_n9_3r2w_store_multiple_7,nds_n9_3r2w_store_multiple_8, nds_n9_3r2w_store_multiple_12,\
+   nds_n9_3r2w_mmu"
+  "nds32_n9_last_load_to_ex_p"
+)
diff --git a/gcc/config/nds32/nds32-opts.h b/gcc/config/nds32/nds32-opts.h
index 25c4081..e4017bb 100644
--- a/gcc/config/nds32/nds32-opts.h
+++ b/gcc/config/nds32/nds32-opts.h
@@ -22,14 +22,42 @@
 #define NDS32_OPTS_H

 #define NDS32_DEFAULT_CACHE_BLOCK_SIZE 16
-#define NDS32_DEFAULT_ISR_VECTOR_SIZE (TARGET_ISA_V3 ? 4 : 16)
+#define NDS32_DEFAULT_ISR_VECTOR_SIZE TARGET_DEFAULT_ISR_VECTOR_SIZE

 /* The various ANDES ISA.  */
 enum nds32_arch_type
 {
   ARCH_V2,
+  ARCH_V2J,
   ARCH_V3,
-  ARCH_V3M
+  ARCH_V3J,
+  ARCH_V3M,
+  ARCH_V3M_PLUS,
+  ARCH_V3F,
+  ARCH_V3S
+};
+
+/* The various ANDES CPU.  */
+enum nds32_cpu_type
+{
+  CPU_N6,
+  CPU_N7,
+  CPU_N8,
+  CPU_E8,
+  CPU_N9,
+  CPU_N10,
+  CPU_GRAYWOLF,
+  CPU_N12,
+  CPU_N13,
+  CPU_PANTHER,
+  CPU_SIMPLE
+};
+
+/* The memory model type: slow or fast.  */
+enum nds32_memory_model_type
+{
+  MEMORY_MODEL_SLOW,
+  MEMORY_MODEL_FAST
 };

 /* The code model defines the address generation strategy.  */
@@ -40,4 +68,56 @@ enum nds32_cmodel_type
   CMODEL_LARGE
 };

+/* The ICT model type: small or large.  */
+enum nds32_ict_model_type
+{
+  ICT_MODEL_SMALL,
+  ICT_MODEL_LARGE
+};
+
+
+/* Multiply instruction configuration.  */
+enum nds32_mul_type
+{
+  MUL_TYPE_FAST_1,
+  MUL_TYPE_FAST_2,
+  MUL_TYPE_SLOW
+};
+
+/* Register ports configuration.  */
+enum nds32_register_ports
+{
+  REG_PORT_3R2W,
+  REG_PORT_2R1W
+};
+
+/* Which ABI to use.  */
+enum abi_type
+{
+  NDS32_ABI_V2,
+  NDS32_ABI_V2_FP_PLUS
+};
+
+/* The various FPU register-number configurations.  */
+enum float_reg_number
+{
+  NDS32_CONFIG_FPU_0,
+  NDS32_CONFIG_FPU_1,
+  NDS32_CONFIG_FPU_2,
+  NDS32_CONFIG_FPU_3,
+  NDS32_CONFIG_FPU_4,
+  NDS32_CONFIG_FPU_5,
+  NDS32_CONFIG_FPU_6,
+  NDS32_CONFIG_FPU_7
+};
+
+/* Do lmwsmw opt model.  */
+enum lmwsmw_cost_type
+{
+  LMWSMW_OPT_SIZE,
+  LMWSMW_OPT_SPEED,
+  LMWSMW_OPT_ALL,
+  LMWSMW_OPT_AUTO
+};
+
 #endif
diff --git a/gcc/config/nds32/nds32-panther.md b/gcc/config/nds32/nds32-panther.md
new file mode 100644
index 0000000..d45de1c
--- /dev/null
+++ b/gcc/config/nds32/nds32-panther.md
@@ -0,0 +1,446 @@
+;; Pipeline descriptions of Andes NDS32 cpu for GNU compiler
+;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
+;; Contributed by Andes Technology Corporation.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; ------------------------------------------------------------------------
+;; Define Panther pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_pn_machine")
+
+(define_cpu_unit "pn_i3_0" "nds32_pn_machine")
+(define_cpu_unit "pn_i3_1" "nds32_pn_machine")
+(define_cpu_unit "pn_e1_p0" "nds32_pn_machine")
+(define_cpu_unit "pn_e2_p0" "nds32_pn_machine")
+(define_cpu_unit "pn_e3_p0" "nds32_pn_machine")
+(define_cpu_unit "pn_e4_p0" "nds32_pn_machine")
+(define_cpu_unit "pn_wb_p0" "nds32_pn_machine")
+(define_cpu_unit "pn_e1_p1" "nds32_pn_machine")
+(define_cpu_unit "pn_e2_p1" "nds32_pn_machine")
+(define_cpu_unit "pn_e3_p1" "nds32_pn_machine")
+(define_cpu_unit "pn_e4_p1" "nds32_pn_machine")
+(define_cpu_unit "pn_wb_p1" "nds32_pn_machine")
+(define_cpu_unit "pn_e1_p2" "nds32_pn_machine")
+(define_cpu_unit "pn_e2_p2" "nds32_pn_machine")
+(define_cpu_unit "pn_e3_p2" "nds32_pn_machine")
+(define_cpu_unit "pn_e4_p2" "nds32_pn_machine")
+(define_cpu_unit "pn_wb_p2" "nds32_pn_machine")
+
+(define_reservation "pn_i3" "pn_i3_0 | pn_i3_1")
+(define_reservation "pn_e1" "pn_e1_p0 | pn_e1_p1")
+(define_reservation "pn_e2" "pn_e2_p0 | pn_e2_p1")
+(define_reservation "pn_e3" "pn_e3_p0 | pn_e3_p1")
+(define_reservation "pn_e4" "pn_e4_p0 | pn_e4_p1")
+(define_reservation "pn_wb" "pn_wb_p0 | pn_wb_p1")
+
+(define_insn_reservation "nds_pn_unknown" 1
+  (and (eq_attr "type" "unknown")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_misc" 1
+  (and (eq_attr "type" "misc")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_mmu" 1
+  (and (eq_attr "type" "mmu")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_movd44" 1
+  (and (and (and (eq_attr "type" "alu")
+		 (eq_attr "subtype" "simple"))
+	    (match_test "nds32::movd44_insn_p (insn)"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p1, pn_e2_p1, pn_e3_p1, pn_e4_p1, pn_wb_p1")
+
+(define_insn_reservation "nds_pn_alu" 1
+  (and (and (and (eq_attr "type" "alu")
+		 (eq_attr "subtype" "simple"))
+	    (match_test "!nds32::movd44_insn_p (insn)"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_shift" 1
+  (and (and (eq_attr "type" "alu")
+	    (eq_attr "subtype" "shift"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_alu_shift" 1
+  (and (eq_attr "type" "alu_shift")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_pbsad" 1
+  (and (eq_attr "type" "pbsad")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3*2, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_pbsada" 1
+  (and (eq_attr "type" "pbsada")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1, pn_e2, pn_e3*3, pn_e4, pn_wb")
+
+(define_insn_reservation "nds_pn_load_full_word" 1
+  (and (match_test "nds32::load_full_word_p (insn)")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_partial_word" 1
+  (and (match_test "nds32::load_partial_word_p (insn)")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store" 1
+  (and (match_test "nds32::store_single_p (insn)")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_1" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_2" 1
+  (and (ior (and (eq_attr "type" "load_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::load_double_p (insn)"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_3" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*3, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_4" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*4, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_5" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*5, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_6" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "6")) ;; Matches the *6 E1 occupancy below.
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*6, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_7" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*7, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_8" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*8, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_load_multiple_12" 1
+  (and (and (eq_attr "type" "load_multiple")
+            (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*12, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_1" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "1"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_2" 1
+  (and (ior (and (eq_attr "type" "store_multiple")
+		 (eq_attr "combo" "2"))
+	    (match_test "nds32::store_double_p (insn)"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*2, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_3" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "3"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*3, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_4" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "4"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*4, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_5" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "5"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*5, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_6" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "6")) ;; Matches the *6 E1 occupancy below.
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*6, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_7" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "7"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*7, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_8" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "8"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*8, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_store_multiple_12" 1
+  (and (and (eq_attr "type" "store_multiple")
+            (eq_attr "combo" "12"))
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p2*12, pn_e2_p2, pn_e3_p2, pn_e4_p2, pn_wb_p2")
+
+(define_insn_reservation "nds_pn_mul" 1
+  (and (eq_attr "type" "mul")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p1, pn_e2_p1, pn_e3_p1, pn_e4_p1, pn_wb_p1")
+
+(define_insn_reservation "nds_pn_mac" 1
+  (and (eq_attr "type" "mac")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p1, pn_e2_p1, pn_e3_p1, pn_e4_p1, pn_wb_p1")
+
+;; The number of cycles consumed in the E4 stage is 32 - CLZ(abs(Ra)) + 2,
+;; so the worst case is 34.
+(define_insn_reservation "nds_pn_div" 1
+  (and (eq_attr "type" "div")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p1, pn_e2_p1, pn_e3_p1, pn_e4_p1*34, pn_wb_p1")
+
+(define_insn_reservation "nds_pn_branch" 1
+  (and (eq_attr "type" "branch")
+       (eq_attr "pipeline_model" "panther"))
+  "pn_i3, pn_e1_p0, pn_e2_p0, pn_e3_p0, pn_e4_p0, pn_wb_p0")
+
+;; SHIFT -> ADDR_IN
+(define_bypass 2
+  "nds_pn_shift"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_e2_to_e1_p"
+)
+
+;; ALU, MOVD44 -> ADDR_IN
+(define_bypass 3
+  "nds_pn_alu, nds_pn_movd44"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_e3_to_e1_p"
+)
+
+;; ALU, MOVD44 -> SHIFT, MUL, MAC_RaRb
+(define_bypass 2
+  "nds_pn_alu, nds_pn_movd44"
+  "nds_pn_shift, nds_pn_mul, nds_pn_mac"
+  "nds32_pn_e3_to_e2_p"
+)
+
+;; MUL, MAC, DIV, LW, ADDR_OUT -> ADDR_IN
+(define_bypass 4
+  "nds_pn_mul, nds_pn_mac, nds_pn_div,\
+   nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_e4_to_e1_p"
+)
+
+;; MUL, MAC, DIV, LW, ADDR_OUT -> SHIFT, MUL, MAC_RaRb
+(define_bypass 3
+  "nds_pn_mul, nds_pn_mac, nds_pn_div,\
+   nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds_pn_shift, nds_pn_mul, nds_pn_mac"
+  "nds32_pn_e4_to_e2_p"
+)
+
+;; MUL, MAC, DIV, LW, ADDR_OUT -> ALU, MOVD44, BR_COND, ST, SMW(N, 1)
+(define_bypass 2
+  "nds_pn_mul, nds_pn_mac, nds_pn_div,\
+   nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds_pn_alu, nds_pn_movd44, nds_pn_branch,\
+   nds_pn_store,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_e4_to_e3_p"
+)
+
+;; LH, LB -> ADDR_IN
+(define_bypass 5
+  "nds_pn_load_partial_word"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_wb_to_e1_p"
+)
+
+;; LH, LB -> SHIFT, MUL, MAC_RaRb
+(define_bypass 4
+  "nds_pn_load_partial_word"
+  "nds_pn_shift, nds_pn_mul, nds_pn_mac"
+  "nds32_pn_wb_to_e2_p"
+)
+
+;; LH, LB -> ALU, MOVD44, BR_COND, ST, SMW(N, 1)
+(define_bypass 3
+  "nds_pn_load_partial_word"
+  "nds_pn_alu, nds_pn_movd44, nds_pn_branch,\
+   nds_pn_store,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_wb_to_e3_p"
+)
+
+;; LH, LB -> DIV
+(define_bypass 2
+  "nds_pn_load_partial_word"
+  "nds_pn_div"
+  "nds32_pn_wb_to_e4_p"
+)
+
+;; LMW(N, N) -> ADDR_IN
+(define_bypass 4
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_last_load_to_e1_p"
+)
+
+;; LMW(N, N) -> SHIFT, MUL, MAC_RaRb
+(define_bypass 3
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_shift, nds_pn_mul, nds_pn_mac"
+  "nds32_pn_last_load_to_e2_p"
+)
+
+;; LMW(N, N - 1) -> ADDR_IN
+(define_bypass 3
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_last_two_load_to_e1_p"
+)
+
+;; LMW(N, N - 2) -> ADDR_IN
+(define_bypass 2
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_load_full_word, nds_pn_load_partial_word, nds_pn_store,\
+   nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_last_three_load_to_e1_p"
+)
+
+;; LMW(N, N - 1) -> SHIFT, MUL, MAC_RaRb
+(define_bypass 2
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_shift, nds_pn_mul, nds_pn_mac"
+  "nds32_pn_last_two_load_to_e2_p"
+)
+
+;; LMW(N, N) -> ALU, MOVD44, BR_COND, ST, SMW(N, 1)
+(define_bypass 2
+  "nds_pn_load_multiple_1, nds_pn_load_multiple_2, nds_pn_load_multiple_3,\
+   nds_pn_load_multiple_4, nds_pn_load_multiple_5, nds_pn_load_multiple_6,\
+   nds_pn_load_multiple_7, nds_pn_load_multiple_8, nds_pn_load_multiple_12"
+  "nds_pn_alu, nds_pn_movd44, nds_pn_branch,\
+   nds_pn_store,\
+   nds_pn_store_multiple_1, nds_pn_store_multiple_2, nds_pn_store_multiple_3,\
+   nds_pn_store_multiple_4, nds_pn_store_multiple_5, nds_pn_store_multiple_6,\
+   nds_pn_store_multiple_7, nds_pn_store_multiple_8, nds_pn_store_multiple_12"
+  "nds32_pn_last_load_to_e3_p"
+)
diff --git a/gcc/config/nds32/nds32-peephole2.md b/gcc/config/nds32/nds32-peephole2.md
index 07e3a2b..bb47385 100644
--- a/gcc/config/nds32/nds32-peephole2.md
+++ b/gcc/config/nds32/nds32-peephole2.md
@@ -19,6 +19,197 @@
 ;; <http://www.gnu.org/licenses/>.


-;; Use define_peephole2 to handle possible target-specific optimization.
+;; Use define_split, define_peephole, and define_peephole2 to
+;; handle possible target-specific optimization in this file.

 ;; ------------------------------------------------------------------------
+;; Try to utilize 16-bit instruction by swap operand if possible.
+;; ------------------------------------------------------------------------
+
+;; Try to make add as add45.
+(define_peephole2
+  [(set (match_operand:QIHISI 0 "register_operand"              "")
+	(plus:QIHISI (match_operand:QIHISI 1 "register_operand" "")
+		     (match_operand:QIHISI 2 "register_operand" "")))]
+  "reload_completed
+   && TARGET_16_BIT
+   && REGNO (operands[0]) == REGNO (operands[2])
+   && REGNO (operands[0]) != REGNO (operands[1])
+   && TEST_HARD_REG_BIT (reg_class_contents[MIDDLE_REGS], REGNO (operands[0]))"
+  [(set (match_dup 0) (plus:QIHISI (match_dup 2) (match_dup 1)))])
+
+;; Try to make xor/ior/and/mult as xor33/ior33/and33/mult33.
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand"    "")
+	(match_operator:SI 1 "nds32_have_33_inst_operator"
+	  [(match_operand:SI 2 "register_operand" "")
+	   (match_operand:SI 3 "register_operand" "")]))]
+  "reload_completed
+   && TARGET_16_BIT
+   && REGNO (operands[0]) == REGNO (operands[3])
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && TEST_HARD_REG_BIT (reg_class_contents[LOW_REGS], REGNO (operands[0]))
+   && TEST_HARD_REG_BIT (reg_class_contents[LOW_REGS], REGNO (operands[2]))"
+  [(set (match_dup 0) (match_op_dup 1 [(match_dup 3) (match_dup 2)]))])
+
+(define_peephole
+  [(set (match_operand:SI 0 "register_operand" "")
+	(match_operand:SI 1 "register_operand" ""))
+   (set (match_operand:SI 2 "register_operand" "")
+	(match_operand:SI 3 "register_operand" ""))]
+  "TARGET_16_BIT
+   && !TARGET_ISA_V2
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[0]))
+   && NDS32_IS_GPR_REGNUM (REGNO (operands[1]))
+   && ((REGNO (operands[0]) & 0x1) == 0)
+   && ((REGNO (operands[1]) & 0x1) == 0)
+   && (REGNO (operands[0]) + 1) == REGNO (operands[2])
+   && (REGNO (operands[1]) + 1) == REGNO (operands[3])"
+  "movd44\t%0, %1"
+  [(set_attr "type"   "alu")
+   (set_attr "length" "2")])
+
+;; Merge two fcpyss to fcpysd.
+(define_peephole2
+  [(set (match_operand:SF 0 "float_even_register_operand" "")
+	(match_operand:SF 1 "float_even_register_operand" ""))
+   (set (match_operand:SF 2 "float_odd_register_operand"  "")
+	(match_operand:SF 3 "float_odd_register_operand"  ""))]
+  "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+   && REGNO (operands[0]) == REGNO (operands[2]) - 1
+   && REGNO (operands[1]) == REGNO (operands[3]) - 1"
+  [(set (match_dup 4) (match_dup 5))]
+  {
+    operands[4] = gen_rtx_REG (DFmode, REGNO (operands[0]));
+    operands[5] = gen_rtx_REG (DFmode, REGNO (operands[1]));
+  })
+
+(define_peephole2
+  [(set (match_operand:SF 0 "float_odd_register_operand"  "")
+	(match_operand:SF 1 "float_odd_register_operand"  ""))
+   (set (match_operand:SF 2 "float_even_register_operand" "")
+	(match_operand:SF 3 "float_even_register_operand" ""))]
+  "(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+   && REGNO (operands[2]) == REGNO (operands[0]) - 1
+   && REGNO (operands[3]) == REGNO (operands[1]) - 1"
+  [(set (match_dup 4) (match_dup 5))]
+  {
+    operands[4] = gen_rtx_REG (DFmode, REGNO (operands[2]));
+    operands[5] = gen_rtx_REG (DFmode, REGNO (operands[3]));
+  })
+
+;; Merge two flsi to fldi.
+(define_peephole2
+  [(set (match_operand:SF 0 "float_even_register_operand" "")
+	(match_operand:SF 1 "memory_operand" ""))
+   (set (match_operand:SF 2 "float_odd_register_operand" "")
+	(match_operand:SF 3 "memory_operand" ""))]
+  "REGNO (operands[0]) == REGNO (operands[2]) - 1
+   && nds32_memory_merge_peep_p (operands[3], operands[1])"
+  [(set (match_dup 0) (match_dup 1))]
+{
+    operands[1] = widen_memory_access (operands[3], DFmode, 0);
+    operands[0] = gen_rtx_REG (DFmode, REGNO (operands[0]));
+})
+
+(define_peephole2
+  [(set (match_operand:SF 0 "float_odd_register_operand" "")
+	(match_operand:SF 1 "memory_operand" ""))
+   (set (match_operand:SF 2 "float_even_register_operand" "")
+	(match_operand:SF 3 "memory_operand" ""))]
+  "REGNO (operands[2]) == REGNO (operands[0]) - 1
+   && nds32_memory_merge_peep_p (operands[1], operands[3])"
+  [(set (match_dup 0) (match_dup 1))]
+{
+    operands[1] = widen_memory_access (operands[1], DFmode, 0);
+    operands[0] = gen_rtx_REG (DFmode, REGNO (operands[2]));
+})
+
+;; Merge two fssi to fsdi.
+(define_peephole2
+  [(set (match_operand:SF 0 "memory_operand" "")
+	(match_operand:SF 1 "float_even_register_operand" ""))
+   (set (match_operand:SF 2 "memory_operand" "")
+	(match_operand:SF 3 "float_odd_register_operand" ""))]
+  "REGNO (operands[1]) == REGNO (operands[3]) - 1
+   && nds32_memory_merge_peep_p (operands[2], operands[0])"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  operands[0] = widen_memory_access (operands[2], DFmode, 0);
+  operands[1] = gen_rtx_REG (DFmode, REGNO (operands[1]));
+})
+
+(define_peephole2
+  [(set (match_operand:SF 0 "memory_operand" "")
+	(match_operand:SF 1 "float_odd_register_operand" ""))
+   (set (match_operand:SF 2 "memory_operand" "")
+	(match_operand:SF 3 "float_even_register_operand" ""))]
+  "REGNO (operands[3]) == REGNO (operands[1]) - 1
+   && nds32_memory_merge_peep_p (operands[0], operands[2])"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  operands[0] = widen_memory_access (operands[0], DFmode, 0);
+  operands[1] = gen_rtx_REG (DFmode, REGNO (operands[3]));
+})
+
+;; ------------------------------------------------------------------------
+;; GCC prefers [u]divmodsi3 over [u]divsi3 even when the remainder is unused,
+;; so we use a split to drop the mod operation and lower register pressure.
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(div:SI (match_operand:SI 1 "register_operand")
+		(match_operand:SI 2 "register_operand")))
+   (set (match_operand:SI 3 "register_operand")
+	(mod:SI (match_dup 1) (match_dup 2)))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[3])) != NULL
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0)
+	(div:SI (match_dup 1)
+		(match_dup 2)))])
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(udiv:SI (match_operand:SI 1 "register_operand")
+		 (match_operand:SI 2 "register_operand")))
+   (set (match_operand:SI 3 "register_operand")
+	(umod:SI (match_dup 1) (match_dup 2)))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[3])) != NULL
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0)
+	(udiv:SI (match_dup 1)
+		 (match_dup 2)))])
+
+(define_peephole2
+  [(set (match_operand:DI 0 "register_operand")
+	(mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand"))
+		 (sign_extend:DI (match_operand:SI 2 "register_operand"))))]
+  "NDS32_EXT_DSP_P ()
+   && peep2_regno_dead_p (1, WORDS_BIG_ENDIAN ? REGNO (operands[0]) + 1 : REGNO (operands[0]))"
+  [(const_int 1)]
+{
+  rtx highpart = nds32_di_high_part_subreg (operands[0]);
+  emit_insn (gen_smulsi3_highpart (highpart, operands[1], operands[2]));
+  DONE;
+})
+
+(define_split
+  [(set (match_operand:DI 0 "nds32_general_register_operand" "")
+	(match_operand:DI 1 "nds32_general_register_operand" ""))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[0])) != NULL
+   || find_regno_note (insn, REG_UNUSED, REGNO (operands[0]) + 1) != NULL"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  rtx dead_note = find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0]));
+  HOST_WIDE_INT offset;
+  if (dead_note == NULL_RTX)
+    offset = 0;
+  else
+    offset = 4;
+  operands[0] = simplify_gen_subreg (
+		  SImode, operands[0],
+		  DImode, offset);
+  operands[1] = simplify_gen_subreg (
+		  SImode, operands[1],
+		  DImode, offset);
+})
diff --git a/gcc/config/nds32/nds32-pipelines-auxiliary.c b/gcc/config/nds32/nds32-pipelines-auxiliary.c
index a396fff..903a2ed 100644
--- a/gcc/config/nds32/nds32-pipelines-auxiliary.c
+++ b/gcc/config/nds32/nds32-pipelines-auxiliary.c
@@ -21,14 +21,2638 @@

 /* ------------------------------------------------------------------------ */

+#include <set>
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "tree-pass.h"

 /* ------------------------------------------------------------------------ */

-/* This file is prepared for future implementation of precise
-   pipeline description for nds32 target.  */
+namespace nds32 {
+namespace scheduling {
+
+/* Classify the memory access direction of a load/store insn.  The direction
+   is unknown if the address offset is not a constant value.  */
+enum memory_access_direction
+{
+  MEM_ACCESS_DIR_POS,		/* Post-increment or non-negative offset.  */
+  MEM_ACCESS_DIR_NEG,		/* Post-decrement or negative offset.  */
+  MEM_ACCESS_DIR_UNKNOWN	/* The offset is not a CONST_INT.  */
+};
+
+/* This class provides some wrappers of the DFA scheduler.  Due to the design
+   drawback of the DFA scheduler, creating two instances at the same time is
+   now allowed.  Using the loosest relationship such as 'dependency' instead
+   of 'aggregation' or 'composition' can minimize this issue.  */
+class pipeline_simulator
+{
+public:
+  pipeline_simulator ();
+  ~pipeline_simulator ();
+
+  void advance_cycle (int cycles = 1);
+  int query_latency (rtx_insn *producer, rtx_insn *consumer) const;
+  int issue_insn (rtx_insn *insn);
+  int force_issue_insn (rtx_insn *insn);
+
+private:
+  static int gcc_dfa_initialized_;	/* Number of live instances.  */
+  state_t state_;			/* DFA state owned by this instance.  */
+};
+
+/* Insert pseudo NOPs so that we can see stall cycles caused by structural or
+   data hazards in the assembly code.  The design of this class is similar to
+   the 'template method' pattern, but we don't need to maintain multiple
+   customized algorithms at the same time.  Hence this class has no virtual
+   functions providing further customizations.  */
+class stall_inserter
+{
+private:
+  enum dep_type { RES_DEP, DATA_DEP };
+
+public:
+  void insert_stalls ();
+
+private:
+  static rtx emit_pseudo_nop_before (rtx_insn *insn, int cycles, enum dep_type type);
+
+  void insert_structural_hazard_stalls ();
+  void insert_data_hazard_stalls ();
+  void emit_pseudo_nops_for_data_hazards (rtx_insn *insn,
+					  pipeline_simulator &simulator);
+};
+
+class pass_nds32_print_stalls : public rtl_opt_pass
+{
+public:
+  pass_nds32_print_stalls (gcc::context *ctxt);
+
+  bool gate (function *);
+  unsigned int execute (function *);
+};
+
+int pipeline_simulator::gcc_dfa_initialized_ = 0;
+
+const pass_data pass_data_nds32_print_stalls =
+{
+  RTL_PASS,				/* type */
+  "print_stalls",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  0					/* todo_flags_finish */
+};
+
+rtl_opt_pass *
+make_pass_nds32_print_stalls (gcc::context *ctxt)
+{
+  return new pass_nds32_print_stalls (ctxt);
+}
+
+/* NULL-tolerant variant of reg_overlap_mentioned_p ().  */
+bool
+reg_overlap_p (rtx x, rtx in)
+{
+  if (x != NULL_RTX && in != NULL_RTX)
+    return static_cast <bool> (reg_overlap_mentioned_p (x, in));
+
+  return false;
+}
+
+/* Return the number of cycles separating FROM and TO in pipeline view.
+   Every executable insn in between is treated as one cycle, except that a
+   pseudo NOP is treated as the number of cycles stored in its pattern.
+   TODO: multi-cycle insns should be handled specially, but are not yet.  */
+int
+cycle_distance (rtx_insn *from, rtx_insn *to)
+{
+  int cycles = 1;
+  rtx_insn *cursor = NEXT_INSN (from);
+
+  while (cursor && cursor != to)
+    {
+      if (insn_executable_p (cursor))
+	cycles += (insn_pseudo_nop_p (cursor)
+		   ? INTVAL (XVECEXP (PATTERN (cursor), 0, 0))
+		   : 1);
+
+      cursor = NEXT_INSN (cursor);
+    }
+
+  return cycles;
+}
+
+/* Determine the memory access direction of a load/store insn.  */
+memory_access_direction
+determine_access_direction (rtx_insn *insn)
+{
+  int post_update_rtx_index;
+  rtx plus_rtx;
+  rtx mem_rtx;
+  rtx offset_rtx;
+
+  switch (get_attr_type (insn))
+  {
+  case TYPE_LOAD_MULTIPLE:
+    gcc_assert (parallel_elements (insn) >= 2);
+
+    post_update_rtx_index = find_post_update_rtx (insn);
+    if (post_update_rtx_index != -1)
+      plus_rtx = SET_SRC (parallel_element (insn, post_update_rtx_index));
+    else
+      {
+	/* (parallel
+	     [(set (reg) (mem (reg)))              : index 0
+	      (set (reg) (mem (plus (reg) (...)))) : index 1
+	      ...])  */
+	mem_rtx = SET_SRC (parallel_element (insn, 1));
+	if (GET_CODE (mem_rtx) == UNSPEC)
+	  mem_rtx = XVECEXP (mem_rtx, 0, 0);
+	gcc_assert (MEM_P (mem_rtx));
+	plus_rtx = XEXP (mem_rtx, 0);
+      }
+    break;
+
+  case TYPE_STORE_MULTIPLE:
+    gcc_assert (parallel_elements (insn) >= 2);
+
+    post_update_rtx_index = find_post_update_rtx (insn);
+    if (post_update_rtx_index != -1)
+      plus_rtx = SET_SRC (parallel_element (insn, post_update_rtx_index));
+    else
+      {
+	/* (parallel
+	     [(set (mem (reg))              (reg)) : index 0
+	      (set (mem (plus (reg) (...))) (reg)) : index 1
+	      ...])  */
+	mem_rtx = SET_DEST (parallel_element (insn, 1));
+	if (GET_CODE (mem_rtx) == UNSPEC)
+	  mem_rtx = XVECEXP (mem_rtx, 0, 0);
+	gcc_assert (MEM_P (mem_rtx));
+	plus_rtx = XEXP (mem_rtx, 0);
+      }
+    break;
+
+  case TYPE_LOAD:
+  case TYPE_STORE:
+    mem_rtx = extract_mem_rtx (insn);
+
+    switch (GET_CODE (XEXP (mem_rtx, 0)))
+      {
+      case POST_INC:
+	/* (mem (post_inc (...)))  */
+	return MEM_ACCESS_DIR_POS;
+
+      case POST_DEC:
+	/* (mem (post_dec (...)))  */
+	return MEM_ACCESS_DIR_NEG;
+
+      case PLUS:
+	/* (mem (plus (reg) (...)))  */
+	plus_rtx = XEXP (mem_rtx, 0);
+	break;
+
+      case POST_MODIFY:
+	/* (mem (post_modify (reg) (plus (reg) (...))))  */
+	plus_rtx = XEXP (XEXP (mem_rtx, 0), 1);
+	break;
+
+      default:
+	gcc_unreachable ();
+      }
+    break;
+
+  default:
+    gcc_unreachable ();
+  }
+
+  gcc_assert (GET_CODE (plus_rtx) == PLUS);
+
+  offset_rtx = XEXP (plus_rtx, 1);
+  if (GET_CODE (offset_rtx) == CONST_INT)
+    {
+      if (INTVAL (offset_rtx) < 0)
+	return MEM_ACCESS_DIR_NEG;
+      else
+	return MEM_ACCESS_DIR_POS;
+    }
+
+  return MEM_ACCESS_DIR_UNKNOWN;
+}
+
+/* Return the Nth load/store operation of INSN in the real micro-operation
+   accessing order.  The access direction of INSN must be known.  */
+rtx
+extract_nth_access_rtx (rtx_insn *insn, int n)
+{
+  int n_elems = parallel_elements (insn);
+  int post_update_rtx_index = find_post_update_rtx (insn);
+  memory_access_direction direction = determine_access_direction (insn);
+
+  gcc_assert (direction != MEM_ACCESS_DIR_UNKNOWN);
+
+  /* Reverse the order if the direction is negative: 0 -> -1, 1 -> -2, ...  */
+  if (direction == MEM_ACCESS_DIR_NEG)
+    n = -1 * n - 1;
+
+  if (post_update_rtx_index != -1)
+    {
+      if (n >= 0 && post_update_rtx_index <= n)
+	++n;
+      else if (n < 0 && post_update_rtx_index >= n + n_elems)
+	--n;
+    }
+
+  return parallel_element (insn, n);
+}
+
+/* Return the register transferred by the Nth load/store operation of INSN in
+   the real micro-operation accessing order, or NULL_RTX if that operation
+   cannot be found.  INSN is assumed to be a multiple-word load/store insn.  */
+rtx
+extract_nth_lmsw_access_reg (rtx_insn *insn, int n)
+{
+  rtx access_rtx = extract_nth_access_rtx (insn, n);
+
+  if (access_rtx == NULL_RTX)
+    return NULL_RTX;
+
+  enum attr_type insn_type = get_attr_type (insn);
+
+  /* A load names its register as destination, a store as source.  */
+  if (insn_type == TYPE_LOAD_MULTIPLE)
+    return SET_DEST (access_rtx);
+
+  if (insn_type == TYPE_STORE_MULTIPLE)
+    return SET_SRC (access_rtx);
+
+  /* No other insn type may reach this function.  */
+  gcc_unreachable ();
+}
+
+/* Returns the register operated by the nth load/store operation in the real
+   micro-operation accessing order.  This function assumes INSN must be a
+   double-word load/store insn.  */
+rtx
+extract_nth_ls2_access_reg (rtx_insn *insn, int n)
+{
+  rtx reg;
+  enum machine_mode mode;
+
+  if (post_update_insn_p (insn))
+    {
+      memory_access_direction direction = determine_access_direction (insn);
+      gcc_assert (direction != MEM_ACCESS_DIR_UNKNOWN);
+
+      /* Reverse the order if the direction negative.  */
+      if (direction == MEM_ACCESS_DIR_NEG)
+	n = -1 * n - 1;
+    }
+
+  /* Handle the out-of-range case.  */
+  if (n < -2 || n > 1)
+    return NULL_RTX;
+
+  /* Convert the index to a positive one.  */
+  if (n < 0)
+    n = 2 + n;
+
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+      reg = SET_DEST (PATTERN (insn));
+      break;
+
+    case TYPE_STORE:
+      reg = SET_SRC (PATTERN (insn));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (REG_P (reg) || GET_CODE (reg) == SUBREG);
+
+  switch (GET_MODE (reg))
+    {
+    case DImode:
+      mode = SImode;
+      break;
+
+    case DFmode:
+      mode = SFmode;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (n == 0)
+    return gen_lowpart (mode, reg);
+  else
+    return gen_highpart (mode, reg);
+}
+
+/* Return the register transferred by the INDEXth load/store operation of
+   INSN in the real micro-operation accessing order.  */
+rtx
+extract_nth_access_reg (rtx_insn *insn, int index)
+{
+  const enum rtx_code pattern_code = GET_CODE (PATTERN (insn));
+
+  /* A PARALLEL pattern is handled as a multiple-word load/store.  */
+  if (pattern_code == PARALLEL)
+    return extract_nth_lmsw_access_reg (insn, index);
+
+  /* A plain SET pattern is handled as a double-word load/store.  */
+  if (pattern_code == SET)
+    return extract_nth_ls2_access_reg (insn, index);
+
+  gcc_unreachable ();
+}
+
+/* Determine if latency occurs because the consumer PBSADA_INSN uses the
+   value of DEF_REG in its Ra or Rb field.  */
+bool
+pbsada_insn_ra_rb_dep_reg_p (rtx pbsada_insn, rtx def_reg)
+{
+  rtx src = SET_SRC (PATTERN (pbsada_insn));
+  gcc_assert (GET_CODE (src) == UNSPEC);
+
+  rtx ra = XVECEXP (src, 0, 0);
+  rtx rb = XVECEXP (src, 0, 1);
+
+  /* A match in either of the two fields is enough.  */
+  if (!rtx_equal_p (def_reg, ra) && !rtx_equal_p (def_reg, rb))
+    return false;
+
+  return true;
+}
+
+/* Determine if latency occurs because the consumer PBSADA_INSN uses the
+   value of DEF_REG in its Rt field, i.e. the destination of its pattern.  */
+bool
+pbsada_insn_rt_dep_reg_p (rtx pbsada_insn, rtx def_reg)
+{
+  rtx dest = SET_DEST (PATTERN (pbsada_insn));
+
+  /* Normalize the result of rtx_equal_p () to a strict bool.  */
+  const bool same_reg = rtx_equal_p (def_reg, dest) != 0;
+
+  return same_reg;
+}
+
+/* Return true if INSN is a movd44 insn whose source is DEF_REG itself, or
+   whose source is the register underlying an SImode SUBREG DEF_REG located
+   at byte 4 (big-endian target) or byte 0 (little-endian target).  */
+bool
+movd44_even_dep_p (rtx_insn *insn, rtx def_reg)
+{
+  if (!movd44_insn_p (insn))
+    return false;
+
+  rtx src = SET_SRC (PATTERN (insn));
+
+  /* A plain register must be exactly the source operand.  */
+  if (REG_P (def_reg))
+    return rtx_equal_p (def_reg, src);
+
+  /* Anything else has to be an SImode subword of the source operand.  */
+  if (GET_CODE (def_reg) != SUBREG)
+    return false;
+
+  if (GET_MODE (def_reg) != SImode)
+    return false;
+
+  if (!rtx_equal_p (SUBREG_REG (def_reg), src))
+    return false;
+
+  /* Only the subword at this byte offset creates the dependency.  */
+  return SUBREG_BYTE (def_reg) == (TARGET_BIG_ENDIAN ? 4 : 0);
+}
+
+/* Check if INSN, assumed to be a wext insn, consumes DEF_REG.  A register
+   position operand overlapping DEF_REG always counts.  Otherwise a DImode
+   DEF_REG counts when it overlaps the shifted register, while a narrower
+   DEF_REG counts only for one endian-dependent word of that register.  */
+bool
+wext_odd_dep_p (rtx insn, rtx def_reg)
+{
+  rtx shift_rtx = XEXP (SET_SRC (PATTERN (insn)), 0);
+  rtx shifted_reg = XEXP (shift_rtx, 0);
+  rtx position = XEXP (shift_rtx, 1);
+
+  if (REG_P (position) && reg_overlap_p (def_reg, position))
+    return true;
+
+  if (GET_MODE (def_reg) == DImode)
+    return reg_overlap_p (def_reg, shifted_reg);
+
+  gcc_assert (REG_P (def_reg) || GET_CODE (def_reg) == SUBREG);
+  gcc_assert (REG_P (shifted_reg));
+
+  /* Little-endian targets look at the second word (register number + 1 or
+     subreg byte 4), big-endian targets at the first one.  */
+  const bool little_endian_p = !TARGET_BIG_ENDIAN;
+
+  if (REG_P (def_reg))
+    return (REGNO (def_reg)
+	    == REGNO (shifted_reg) + (little_endian_p ? 1 : 0));
+
+  /* Anything that is neither a REG nor a SUBREG creates no dependency.  */
+  if (GET_CODE (def_reg) != SUBREG)
+    return false;
+
+  /* A SUBREG must overlap the shifted register and select that word.  */
+  if (!reg_overlap_p (def_reg, shifted_reg))
+    return false;
+
+  return SUBREG_BYTE (def_reg) == (little_endian_p ? 4 : 0);
+}
+
+/* Check if INSN is a bpick insn consuming DEF_REG.  */
+bool
+bpick_ra_rb_dep_p (rtx insn, rtx def_reg)
+{
+  rtx ior_rtx = SET_SRC (PATTERN (insn));
+  rtx and1_rtx = XEXP (ior_rtx, 0);
+  rtx and2_rtx = XEXP (ior_rtx, 1);
+  rtx reg1_0 = XEXP (and1_rtx, 0);
+  rtx reg1_1 = XEXP (and1_rtx, 1);
+  rtx reg2_0 = XEXP (and2_rtx, 0);
+  rtx reg2_1 = XEXP (and2_rtx, 1);
+
+  if (GET_CODE (reg1_0) == NOT)
+    {
+      if (rtx_equal_p (reg1_0, reg2_0))
+	return reg_overlap_p (def_reg, reg1_1)
+	       || reg_overlap_p (def_reg, reg2_1);
+
+      if (rtx_equal_p (reg1_0, reg2_1))
+	return reg_overlap_p (def_reg, reg1_1)
+	       || reg_overlap_p (def_reg, reg2_0);
+    }
+
+  if (GET_CODE (reg1_1) == NOT)
+    {
+      if (rtx_equal_p (reg1_1, reg2_0))
+	return reg_overlap_p (def_reg, reg1_0)
+	       || reg_overlap_p (def_reg, reg2_1);
+
+      if (rtx_equal_p (reg1_1, reg2_1))
+	return reg_overlap_p (def_reg, reg1_0)
+	       || reg_overlap_p (def_reg, reg2_0);
+    }
+
+  if (GET_CODE (reg2_0) == NOT)
+    {
+      if (rtx_equal_p (reg2_0, reg1_0))
+	return reg_overlap_p (def_reg, reg2_1)
+	       || reg_overlap_p (def_reg, reg1_1);
+
+      if (rtx_equal_p (reg2_0, reg1_1))
+	return reg_overlap_p (def_reg, reg2_1)
+	       || reg_overlap_p (def_reg, reg1_0);
+    }
+
+  if (GET_CODE (reg2_1) == NOT)
+    {
+      if (rtx_equal_p (reg2_1, reg1_0))
+	return reg_overlap_p (def_reg, reg2_0)
+	       || reg_overlap_p (def_reg, reg1_1);
+
+      if (rtx_equal_p (reg2_1, reg1_1))
+	return reg_overlap_p (def_reg, reg2_0)
+	       || reg_overlap_p (def_reg, reg1_0);
+    }
+
+  gcc_unreachable ();
+}
+
+pipeline_simulator::pipeline_simulator ()
+{
+  /* dfa_start () works on static globals and allocates memory on every call,
+     so it may only run for the first live instance counted here.  */
+  if (gcc_dfa_initialized_ == 0)
+    dfa_start ();
+  ++gcc_dfa_initialized_;
+
+  state_ = xmalloc (state_size ());
+  state_reset (state_);
+}
+
+pipeline_simulator::~pipeline_simulator ()
+{
+  free (state_);
+
+  /* dfa_finish () releases static global data without noticing repeated
+     calls, so it may only run once the last live instance goes away.  */
+  gcc_assert (gcc_dfa_initialized_ > 0);
+  --gcc_dfa_initialized_;
+  if (gcc_dfa_initialized_ == 0)
+    dfa_finish ();
+}
+
+void
+pipeline_simulator::advance_cycle (int cycles)
+{
+  gcc_assert (cycles > 0);
+
+  /* A literal '0' (rather than 'NULL') is passed as the insn because
+     insn-automata.c spells the test exactly this way:
+       if (insn == 0)
+	 insn_code = DFA__ADVANCE_CYCLE;
+     Hence each call below requests a plain cycle advance.  */
+  for (int remaining = cycles; remaining != 0; --remaining)
+    state_transition (state_, 0);
+}
+
+/* A wrapper of insn_latency () provided by the insn-attr.h in the object tree.
+   See that file for more information.  */
+int
+pipeline_simulator::query_latency (rtx_insn *producer, rtx_insn *consumer) const
+{
+  return insn_latency (producer, consumer);
+}
+
+/* Return 0 or a negative value if we can issue INSN at the current cycle.
+   Otherwise, return a positive value indicating how many cycles we have to
+   wait.  The interface is consistent with state_transition () provided by
+   insn-attr.h in the object directory; see that file for details.  */
+int
+pipeline_simulator::issue_insn (rtx_insn *insn)
+{
+  /* A pseudo NOP is not handed to the DFA.  It only burns the number of
+     cycles stored in its pattern, and -1 is reported so that the caller
+     treats it as issued.  */
+  if (insn_pseudo_nop_p (insn))
+    {
+      int nop_cycles = INTVAL (XVECEXP (PATTERN (insn), 0, 0));
+
+      gcc_assert (nop_cycles > 0);
+      advance_cycle (nop_cycles);
+      return -1;
+    }
+
+  /* Every other insn is handed to the DFA.  */
+  int wait_cycles = state_transition (state_, insn);
+
+  /* All targets are single-issue, so once an insn has been issued
+     successfully the current cycle is used up and we move on to the
+     next one.  */
+  if (wait_cycles <= 0)
+    advance_cycle ();
+
+  return wait_cycles;
+}
+
+/* Like issue_insn (), but skip the cycles INSN has to wait and then issue it
+   again.  If INSN can be issued at the current cycle, the return value is 0
+   or negative.  Otherwise the return value is the number of cycles that
+   have been skipped.  */
+int
+pipeline_simulator::force_issue_insn (rtx_insn *insn)
+{
+  const int skipped_cycles = issue_insn (insn);
+
+  /* Nothing to wait for: INSN went through at the first attempt.  */
+  if (skipped_cycles <= 0)
+    return skipped_cycles;
+
+  /* Burn the reported number of cycles, then retry.  The result of the
+     retry is not inspected.  */
+  advance_cycle (skipped_cycles);
+  issue_insn (insn);
+
+  return skipped_cycles;
+}
+
+/* The main flow of the class STALL_INSERTER.  NOPs for structural hazards
+   are inserted before NOPs for data hazards because self-stalled
+   instructions also consume the delay cycles caused by data hazards.  */
+void
+stall_inserter::insert_stalls ()
+{
+  compute_bb_for_insn_safe ();
+
+  insert_structural_hazard_stalls ();
+  insert_data_hazard_stalls ();
+
+  /* The following two functions have to be called again because we have
+     inserted some insns since they were last invoked.  Otherwise, an assert
+     in final () is triggered and causes an internal compiler error.  */
+  init_insn_lengths ();
+  shorten_branches (get_insns ());
+
+  free_bb_for_insn ();
+}
+
+/* A helper function inserting NOPs: emit a pseudo NOP right before INSN and
+   return it.  CYCLES indicates how many cycles the NOP insn consumes.  TYPE
+   indicates what type of NOP insn we want to insert; for now there are two
+   types available: RES_DEP and DATA_DEP.  */
+rtx
+stall_inserter::emit_pseudo_nop_before (
+    rtx_insn *insn, int cycles, enum dep_type type)
+{
+  rtx pattern;
+
+  /* Build the pattern matching the requested dependency type.  Any other
+     value of TYPE is a programming error.  */
+  if (type == RES_DEP)
+    pattern = gen_nop_res_dep (GEN_INT (cycles));
+  else if (type == DATA_DEP)
+    pattern = gen_nop_data_dep (GEN_INT (cycles));
+  else
+    gcc_unreachable ();
+
+  /* Place the NOP directly in front of INSN.  */
+  rtx_insn *nop = emit_insn_before (pattern, insn);
+
+  /* The freshly emitted insn is expected to be recognized right away; the
+     assert below rejects a -1 result from recog_memoized ().  */
+  const int insn_code = recog_memoized (nop);
+  gcc_assert (insn_code != -1);
+
+  return nop;
+}
+
+void
+stall_inserter::insert_structural_hazard_stalls ()
+{
+  pipeline_simulator simulator;
+
+  /* Replay every executable insn on the simulator and materialize each
+     reported wait as a RES_DEP pseudo NOP in front of the insn.  */
+  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    if (insn_executable_p (insn))
+      {
+	const int stalls = simulator.force_issue_insn (insn);
+
+	if (stalls > 0)
+	  emit_pseudo_nop_before (insn, stalls, RES_DEP);
+      }
+}
+
+void
+stall_inserter::insert_data_hazard_stalls ()
+{
+  pipeline_simulator simulator;
+  rtx_insn *insn;
+
+  /* Calling df_insn_rescan_all here is required in order to avoid a crash
+     when users specify some special option combinations, such as
+     -O0 -fschedule-insns2.  */
+  df_chain_add_problem (DF_DU_CHAIN);
+  df_insn_rescan_all ();
+  df_analyze ();
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (!insn_executable_p (insn)) continue;
+
+      simulator.force_issue_insn (insn);
+      emit_pseudo_nops_for_data_hazards (insn, simulator);
+    }
+
+  /* We must call df_finish_pass manually because it should be invoked before
+     BB information is destroyed.  Hence we cannot leave it to the pass
+     manager by setting the TODO_df_finish flag.  */
+  df_insn_rescan_all ();
+  df_finish_pass (false);
+}
+
+/* Traverse all insns using the results produced by INSN and ask SIMULATOR
+   how many delay cycles are required between them.  If some delay cycles
+   are still missing, insert the corresponding NOP insns before the users.  */
+void
+stall_inserter::emit_pseudo_nops_for_data_hazards (
+    rtx_insn *insn, pipeline_simulator &simulator)
+{
+  df_ref def;
+  df_link *link;
+  std::set<rtx> processed_insns;
+
+  FOR_EACH_INSN_DEF (def, insn)
+    {
+      for (link = DF_REF_CHAIN (def); link; link = link->next)
+	{
+	  if (!DF_REF_INSN_INFO (link->ref))
+	    continue;
+
+	  rtx_insn *use_insn = DF_REF_INSN (link->ref);
+
+	  if (!insn_executable_p (use_insn)
+	      || processed_insns.count (use_insn))
+	    continue;
+
+	  int stalls = simulator.query_latency (insn, use_insn);
+	  int distance = cycle_distance (insn, use_insn);
+
+	  if (stalls > distance)
+	    {
+	      stalls -= distance;
+	      emit_pseudo_nop_before (use_insn, stalls, DATA_DEP);
+	      processed_insns.insert (use_insn);
+	    }
+	}
+    }
+}
+
+pass_nds32_print_stalls::pass_nds32_print_stalls (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_print_stalls, ctxt)
+{
+}
+
+bool pass_nds32_print_stalls::gate (function *)
+{
+  return TARGET_PRINT_STALLS;
+}
+
+unsigned int
+pass_nds32_print_stalls::execute (function *)
+{
+  /* The inserter has no data members, so a temporary object is enough.  */
+  stall_inserter ().insert_stalls ();
+
+  return 0;
+}
+
+} // namespace scheduling
+} // namespace nds32
+
+/* ------------------------------------------------------------------------ */
+
+using namespace nds32;
+using namespace nds32::scheduling;
+
+namespace { // anonymous namespace
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at II.  */
+bool
+n7_consumed_by_ii_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    /* MOVD44_E */
+    case TYPE_ALU:
+      if (movd44_even_dep_p (consumer, def_reg))
+	return true;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  It requires two micro-
+      operations in order to write two registers. We have to check the
+      dependency from the producer to the first micro-operation.  */
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_LOAD:
+      /* ADDR_IN_bi_Ra, ADDR_IN_!bi */
+      if (post_update_insn_p (consumer))
+	use_rtx = extract_base_reg (consumer);
+      else
+	use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_STORE:
+      /* ADDR_IN_bi_Ra, ADDR_IN_!bi */
+      if (post_update_insn_p (consumer))
+	use_rtx = extract_base_reg (consumer);
+      else
+	use_rtx = extract_mem_rtx (consumer);
+
+      if (reg_overlap_p (def_reg, use_rtx))
+	return true;
+
+      /* ST_bi, ST_!bi_RI */
+      if (!post_update_insn_p (consumer)
+	  && !immed_offset_p (extract_mem_rtx (consumer)))
+	return false;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_STORE_MULTIPLE:
+      /* ADDR_IN */
+      use_rtx = extract_base_reg (consumer);
+      if (reg_overlap_p (def_reg, use_rtx))
+	return true;
+
+      /* SMW (N, 1) */
+      use_rtx = extract_nth_access_rtx (consumer, 0);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at AG (II).  */
+bool
+n8_consumed_by_addr_in_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_BRANCH:
+      use_rtx = extract_branch_target_rtx (consumer);
+      break;
+
+    case TYPE_LOAD:
+      if (load_single_p (consumer))
+	use_rtx = extract_mem_rtx (consumer);
+      else
+	use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_STORE:
+      if (store_single_p (consumer)
+	  && (!post_update_insn_p (consumer)
+	      || immed_offset_p (extract_mem_rtx (consumer))))
+	use_rtx = extract_mem_rtx (consumer);
+      else
+	use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+n8_consumed_by_ex_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+      if (movd44_even_dep_p (consumer, def_reg))
+	return true;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  It requires two micro-
+      operations in order to write two registers. We have to check the
+      dependency from the producer to the first micro-operation.  */
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = extract_branch_condition_rtx (consumer);
+      break;
+
+    case TYPE_STORE:
+      /* exclude ST_!bi_RR */
+      if (!post_update_insn_p (consumer)
+	  && !immed_offset_p (extract_mem_rtx (consumer)))
+	return false;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_nth_access_rtx (consumer, 0);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at AG (II).  */
+bool
+e8_consumed_by_addr_in_p (rtx_insn *consumer, rtx def_reg)
+{
+  return n8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+e8_consumed_by_ex_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+    case TYPE_STORE:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MUL:
+    case TYPE_MAC:
+    case TYPE_DIV:
+    case TYPE_BRANCH:
+    case TYPE_STORE_MULTIPLE:
+      return n8_consumed_by_ex_p (consumer, def_reg);
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+n9_2r1w_consumed_by_ex_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+      if (movd44_even_dep_p (consumer, def_reg))
+	return true;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_PBSAD:
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_ALU_SHIFT:
+      use_rtx = extract_shift_reg (consumer);
+      break;
+
+    case TYPE_PBSADA:
+      return pbsada_insn_ra_rb_dep_reg_p (consumer, def_reg);
+
+    case TYPE_MAC:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MMU:
+      if (GET_CODE (PATTERN (consumer)) == SET)
+	use_rtx = SET_SRC (PATTERN (consumer));
+      else
+	return true;
+      break;
+
+    case TYPE_LOAD:
+      /* ADDR_IN_bi_Ra, ADDR_IN_!bi */
+      if (post_update_insn_p (consumer))
+	use_rtx = extract_base_reg (consumer);
+      else
+	use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_STORE:
+      /* ADDR_IN_bi_Ra, ADDR_IN_!bi */
+      if (post_update_insn_p (consumer))
+	use_rtx = extract_base_reg (consumer);
+      else
+	use_rtx = extract_mem_rtx (consumer);
+
+      if (reg_overlap_p (def_reg, use_rtx))
+	return true;
+
+      /* exclude ST_!bi_RR */
+      if (!post_update_insn_p (consumer)
+	  && !immed_offset_p (extract_mem_rtx (consumer)))
+	return false;
+
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_STORE_MULTIPLE:
+      /* ADDR_IN */
+      use_rtx = extract_base_reg (consumer);
+      if (reg_overlap_p (def_reg, use_rtx))
+	return true;
+
+      /* SMW (N, 1) */
+      use_rtx = extract_nth_access_rtx (consumer, 0);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+n9_3r2w_consumed_by_ex_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+    case TYPE_PBSAD:
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_ALU_SHIFT:
+      use_rtx = extract_shift_reg (consumer);
+      break;
+
+    case TYPE_PBSADA:
+      return pbsada_insn_ra_rb_dep_reg_p (consumer, def_reg);
+
+    case TYPE_MAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  In 2R1W configuration,
+      it requires two micro-operations in order to write two registers.
+      We have to check the dependency from the producer to the first
+      micro-operation.  */
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MMU:
+      if (GET_CODE (PATTERN (consumer)) == SET)
+	use_rtx = SET_SRC (PATTERN (consumer));
+      else
+	return true;
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+      use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+n10_consumed_by_ex_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+    case TYPE_PBSAD:
+    case TYPE_MUL:
+    case TYPE_DALU:
+    case TYPE_DALU64:
+    case TYPE_DMUL:
+    case TYPE_DPACK:
+    case TYPE_DINSB:
+    case TYPE_DCMP:
+    case TYPE_DCLIP:
+    case TYPE_DALUROUND:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_ALU_SHIFT:
+      use_rtx = extract_shift_reg (consumer);
+      break;
+
+    case TYPE_PBSADA:
+      return pbsada_insn_ra_rb_dep_reg_p (consumer, def_reg);
+
+    case TYPE_MAC:
+    case TYPE_DMAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  */
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_DWEXT:
+      return wext_odd_dep_p (consumer, def_reg);
+
+    case TYPE_DBPICK:
+      return bpick_ra_rb_dep_p (consumer, def_reg);
+
+    case TYPE_MMU:
+      if (GET_CODE (PATTERN (consumer)) == SET)
+	use_rtx = SET_SRC (PATTERN (consumer));
+      else
+	return true;
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+      use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at EX.  */
+bool
+gw_consumed_by_ex_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+    case TYPE_PBSAD:
+    case TYPE_MUL:
+    case TYPE_DALU:
+    case TYPE_DALU64:
+    case TYPE_DMUL:
+    case TYPE_DPACK:
+    case TYPE_DINSB:
+    case TYPE_DCMP:
+    case TYPE_DCLIP:
+    case TYPE_DALUROUND:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_ALU_SHIFT:
+      use_rtx = extract_shift_reg (consumer);
+      break;
+
+    case TYPE_PBSADA:
+      return pbsada_insn_ra_rb_dep_reg_p (consumer, def_reg);
+
+    case TYPE_MAC:
+    case TYPE_DMAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  We have to check the
+      dependency from the producer to the first micro-operation.  */
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_DWEXT:
+      return wext_odd_dep_p (consumer, def_reg);
+
+    case TYPE_DBPICK:
+      return bpick_ra_rb_dep_p (consumer, def_reg);
+
+    case TYPE_MMU:
+      if (GET_CODE (PATTERN (consumer)) == SET)
+	use_rtx = SET_SRC (PATTERN (consumer));
+      else
+	return true;
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+      use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = PATTERN (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check dependencies from any stages to ALU_E1 (E1).  This is a helper
+   function of n13_consumed_by_e1_dep_p ().  */
+bool
+n13_alu_e1_insn_dep_reg_p (rtx_insn *alu_e1_insn, rtx def_reg)
+{
+  rtx unspec_rtx, operand_ra, operand_rb;
+  rtx src_rtx, dst_rtx;
+
+  switch (INSN_CODE (alu_e1_insn))
+    {
+    /* BSP and BSE are supported by built-in functions, the corresponding
+       patterns are formed by UNSPEC RTXs.  We have to handle them
+       individually.  */
+    case CODE_FOR_unspec_bsp:
+    case CODE_FOR_unspec_bse:
+      unspec_rtx = SET_SRC (parallel_element (alu_e1_insn, 0));
+      gcc_assert (GET_CODE (unspec_rtx) == UNSPEC);
+
+      operand_ra = XVECEXP (unspec_rtx, 0, 0);
+      operand_rb = XVECEXP (unspec_rtx, 0, 1);
+
+      if (rtx_equal_p (def_reg, operand_ra)
+	  || rtx_equal_p (def_reg, operand_rb))
+	return true;
+
+      return false;
+
+    /* Unlike general ALU instructions, MOVD44 requires operands at E1.  */
+    case CODE_FOR_move_di:
+    case CODE_FOR_move_df:
+      src_rtx = SET_SRC (PATTERN (alu_e1_insn));
+      dst_rtx = SET_DEST (PATTERN (alu_e1_insn));
+
+      if (REG_P (dst_rtx) && REG_P (src_rtx)
+	  && rtx_equal_p (src_rtx, def_reg))
+	return true;
+
+      return false;
+
+    default:
+      return false;
+    }
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at E1.  Because the address generation unit is
+   at E1, the address input should be ready at E1.  Note that the branch
+   target is also a kind of addresses, so we have to check it.  */
+bool
+n13_consumed_by_e1_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    /* ALU_E1 */
+    case TYPE_ALU:
+      return n13_alu_e1_insn_dep_reg_p (consumer, def_reg);
+
+    case TYPE_PBSADA:
+      return pbsada_insn_ra_rb_dep_reg_p (consumer, def_reg);
+
+    case TYPE_PBSAD:
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MMU:
+      if (GET_CODE (PATTERN (consumer)) == SET)
+	use_rtx = SET_SRC (PATTERN (consumer));
+      else
+	return true;
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = extract_branch_target_rtx (consumer);
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+      use_rtx = extract_mem_rtx (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    default:
+      return false;
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at E2.  */
+bool
+n13_consumed_by_e2_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+    case TYPE_STORE:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_ALU_SHIFT:
+      use_rtx = extract_shift_reg (consumer);
+      break;
+
+    case TYPE_PBSADA:
+      return pbsada_insn_rt_dep_reg_p (consumer, def_reg);
+
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_nth_access_rtx (consumer, 0);
+      break;
+
+    case TYPE_BRANCH:
+      use_rtx = extract_branch_condition_rtx (consumer);
+      break;
+
+    default:
+      gcc_unreachable();
+    }
+
+  if (reg_overlap_p (def_reg, use_rtx))
+    return true;
+
+  return false;
+}
+
+/* Check the dependency between the producer defining DEF_REG and CONSUMER
+   requiring input operand at AG (E1).  */
+bool
+pn_consumed_by_e1_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_LOAD:
+      if (load_single_p (consumer))
+	use_rtx = extract_mem_rtx (consumer);
+      else
+	use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_STORE:
+      if (store_single_p (consumer)
+	  && (!post_update_insn_p (consumer)
+	      || immed_offset_p (extract_mem_rtx (consumer))))
+	use_rtx = extract_mem_rtx (consumer);
+      else
+	use_rtx = extract_base_reg (consumer);
+      break;
+
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_base_reg (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+bool
+pn_consumed_by_e2_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:  /* Only shift-subtype ALU insns fall through below.  */
+      if (get_attr_subtype (consumer) != SUBTYPE_SHIFT)
+	return false;
+    case TYPE_PBSAD:
+    case TYPE_PBSADA:
+    case TYPE_MUL:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_MAC:
+      use_rtx = extract_mac_non_acc_rtx (consumer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+/* Return true if DEF_REG overlaps an operand that CONSUMER requires at E3.
+   Shift-subtype ALU insns are excluded and always report no dependency.  */
+bool
+pn_consumed_by_e3_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_ALU:
+      if (get_attr_subtype (consumer) == SUBTYPE_SHIFT)
+	return false;
+      /* Fall through.  */
+    case TYPE_PBSAD:
+    case TYPE_PBSADA:
+    case TYPE_STORE:
+      use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    case TYPE_BRANCH:
+      /* A branch is checked against both its target and its condition.  */
+      return (reg_overlap_p (def_reg, extract_branch_target_rtx (consumer))
+	      || reg_overlap_p (def_reg,
+				extract_branch_condition_rtx (consumer)));
+
+    case TYPE_STORE_MULTIPLE:
+      use_rtx = extract_nth_access_rtx (consumer, 0);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+bool
+pn_consumed_by_e4_dep_p (rtx_insn *consumer, rtx def_reg)
+{
+  rtx use_rtx;
+
+  switch (get_attr_type (consumer))
+    {
+    case TYPE_MAC:
+      use_rtx = SET_DEST (PATTERN (consumer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (consumer))
+	use_rtx = SET_SRC (parallel_element (consumer, 0));
+      else
+	use_rtx = SET_SRC (PATTERN (consumer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return reg_overlap_p (def_reg, use_rtx);
+}
+
+} // anonymous namespace
+
+/* ------------------------------------------------------------------------ */
+
+/* Guard functions for N7 core.  */
+
+bool
+nds32_n7_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return n7_consumed_by_ii_dep_p (consumer, def_reg);
+}
+
+bool
+nds32_n7_last_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  /* If PRODUCER is a post-update LMW insn, the last micro-operation updates
+     the base register and the result is ready in II stage, so we don't need
+     to handle that case in this guard function and the corresponding bypass
+     rule.  */
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (last_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_def_reg) || GET_CODE (last_def_reg) == SUBREG);
+
+  return n7_consumed_by_ii_dep_p (consumer, last_def_reg);
+}
+
+/* Guard functions for N8 core.  */
+
+bool
+nds32_n8_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return n8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+bool
+nds32_n8_load_bi_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  if (!post_update_insn_p (producer))
+    return false;
+
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return n8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+bool
+nds32_n8_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return n8_consumed_by_ex_p (consumer, def_reg);
+}
+
+bool
+nds32_n8_ex_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_ALU:
+      if (movd44_insn_p (producer))
+	def_reg = extract_movd44_odd_reg (producer);
+      else
+	def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	def_reg = SET_DEST (parallel_element (producer, 1));
+      else
+	def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return n8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+bool
+nds32_n8_last_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  /* If PRODUCER is a post-update LMW insn, the last micro-operation updates
+     the base register and the result is ready in EX stage, so we don't need
+     to handle that case in this guard function and the corresponding bypass
+     rule.  */
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (last_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_def_reg) || GET_CODE (last_def_reg) == SUBREG);
+
+  return n8_consumed_by_addr_in_p (consumer, last_def_reg);
+}
+
+bool
+nds32_n8_last_load_two_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  int index = -2;
+
+  /* If PRODUCER is a post-update insn, there is an additional one micro-
+     operation inserted in the end, so the last memory access operation should
+     be handled by this guard function and the corresponding bypass rule.  */
+  if (post_update_insn_p (producer))
+    index = -1;
+
+  rtx last_two_def_reg = extract_nth_access_reg (producer, index);
+
+  if (last_two_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_two_def_reg)
+	      || GET_CODE (last_two_def_reg) == SUBREG);
+
+  return n8_consumed_by_addr_in_p (consumer, last_two_def_reg);
+}
+
+bool
+nds32_n8_last_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  /* If PRODUCER is a post-update LMW insn, the last micro-operation updates
+     the base register and the result is ready in EX stage, so we don't need
+     to handle that case in this guard function and the corresponding bypass
+     rule.  */
+  if (post_update_insn_p (producer))
+    return false;
+
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (last_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_def_reg) || GET_CODE (last_def_reg) == SUBREG);
+
+  return n8_consumed_by_ex_p (consumer, last_def_reg);
+}
+
+/* Guard functions for E8 cores.  */
+
+bool
+nds32_e8_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return e8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+bool
+nds32_e8_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  return e8_consumed_by_ex_p (consumer, def_reg);
+}
+
+bool
+nds32_e8_ex_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_ALU:
+      /* No data hazards if AGEN's input is produced by MOVI or SETHI.  */
+      if (GET_CODE (PATTERN (producer)) == SET)
+	{
+	  rtx dest = SET_DEST (PATTERN (producer));
+	  rtx src = SET_SRC (PATTERN (producer));
+
+	  if ((REG_P (dest) || GET_CODE (dest) == SUBREG)
+	      && (GET_CODE (src) == CONST_INT || GET_CODE (src) == HIGH))
+	    return false;
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (e8_consumed_by_addr_in_p (consumer, def_reg1)
+		  || e8_consumed_by_addr_in_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_LOAD:
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return e8_consumed_by_addr_in_p (consumer, def_reg);
+}
+
+bool
+nds32_e8_last_load_to_ii_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (last_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_def_reg) || GET_CODE (last_def_reg) == SUBREG);
+
+  return e8_consumed_by_addr_in_p (consumer, last_def_reg);
+}
+
+bool
+nds32_e8_last_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (last_def_reg == NULL_RTX)
+    return false;
+
+  gcc_assert (REG_P (last_def_reg) || GET_CODE (last_def_reg) == SUBREG);
+
+  return e8_consumed_by_ex_p (consumer, last_def_reg);
+}
+
+/* Guard functions for N9 cores.  */
+
+/* Check dependencies from MM to EX.  */
+bool
+nds32_n9_2r1w_mm_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    /* LD_!bi */
+    case TYPE_LOAD:
+      if (post_update_insn_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+    return n9_2r1w_consumed_by_ex_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from MM to EX.  */
+bool
+nds32_n9_3r2w_mm_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  We have to handle them
+      individually.  */
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (n9_3r2w_consumed_by_ex_dep_p (consumer, def_reg1)
+		  || n9_3r2w_consumed_by_ex_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+    return n9_3r2w_consumed_by_ex_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to EX.  */
+bool
+nds32_n9_last_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  if (nds32_register_ports_config == REG_PORT_2R1W)
+    {
+      /* The base-update micro operation occupies the last cycle.  */
+      if (post_update_insn_p (producer))
+	return false;
+
+      /* When the base register is in the list of a load multiple insn and the
+	 access order of the base register is not the last one, we need an
+	 additional micro operation to commit the load result to the base
+	 register -- we can treat the base register as the last defined
+	 register.  */
+      size_t i;
+      size_t n_elems = parallel_elements (producer);
+      rtx base_reg = extract_base_reg (producer);
+
+      for (i = 0; i < n_elems; ++i)
+	{
+	  rtx load_rtx = extract_nth_access_rtx (producer, i);
+	  rtx list_element = SET_DEST (load_rtx);
+
+	  if (rtx_equal_p (base_reg, list_element) && i != n_elems - 1)
+	    {
+	      last_def_reg = base_reg;
+	      break;
+	    }
+	}
+
+      return n9_2r1w_consumed_by_ex_dep_p (consumer, last_def_reg);
+    }
+  else
+    return n9_3r2w_consumed_by_ex_dep_p (consumer, last_def_reg);
+}
+
+/* Guard functions for N10 cores.  */
+
+/* Check dependencies from EX to EX (ADDR_OUT -> ADDR_IN).  */
+bool
+nds32_n10_ex_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  gcc_assert (get_attr_type (producer) == TYPE_FLOAD
+	      || get_attr_type (producer) == TYPE_FSTORE);
+  gcc_assert (get_attr_type (consumer) == TYPE_FLOAD
+	      || get_attr_type (consumer) == TYPE_FSTORE);
+
+  if (!post_update_insn_p (producer))
+    return false;
+
+  return reg_overlap_p (extract_base_reg (producer),
+			extract_mem_rtx (consumer));
+}
+
+/* Check dependencies from MM to EX.  */
+bool
+nds32_n10_mm_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+    case TYPE_MUL:
+    case TYPE_MAC:
+    case TYPE_DALU64:
+    case TYPE_DMUL:
+    case TYPE_DMAC:
+    case TYPE_DALUROUND:
+    case TYPE_DBPICK:
+    case TYPE_DWEXT:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  We have to handle them
+      individually.  */
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (n10_consumed_by_ex_dep_p (consumer, def_reg1)
+		  || n10_consumed_by_ex_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+    return n10_consumed_by_ex_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to EX.  */
+bool
+nds32_n10_last_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return n10_consumed_by_ex_dep_p (consumer, last_def_reg);
+}
+
+/* Guard functions for Graywolf cores.  */
+
+/* Check dependencies from EX to EX (ADDR_OUT -> ADDR_IN).  */
+bool
+nds32_gw_ex_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  return nds32_n10_ex_to_ex_p (producer, consumer);
+}
+
+/* Check dependencies from MM to EX.  */
+bool
+nds32_gw_mm_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+    case TYPE_MUL:
+    case TYPE_MAC:
+    case TYPE_DALU64:
+    case TYPE_DMUL:
+    case TYPE_DMAC:
+    case TYPE_DALUROUND:
+    case TYPE_DBPICK:
+    case TYPE_DWEXT:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+   /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+      results, the quotient and the remainder.  We have to handle them
+      individually.  */
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (gw_consumed_by_ex_dep_p (consumer, def_reg1)
+		  || gw_consumed_by_ex_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+    return gw_consumed_by_ex_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to EX.  */
+bool
+nds32_gw_last_load_to_ex_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return gw_consumed_by_ex_dep_p (consumer, last_def_reg);
+}
+
+/* Guard functions for N12/N13 cores.  */
+
+/* Check dependencies from E2 to E1.  */
+bool
+nds32_n13_e2_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    /* Only post-update load/store instructions are considered.  These
+       instructions produce address output at E2.  */
+    case TYPE_LOAD:
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    case TYPE_ALU:
+    case TYPE_ALU_SHIFT:
+    case TYPE_PBSAD:
+    case TYPE_PBSADA:
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_BRANCH:
+      return true;
+
+    case TYPE_DIV:
+      /* Some special instructions, divmodsi4 and udivmodsi4, produce two
+	 results, the quotient and the remainder.  We have to handle them
+	 individually.  */
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (n13_consumed_by_e1_dep_p (consumer, def_reg1)
+		  || n13_consumed_by_e1_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return n13_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from Load-Store Unit (E3) to E1.  */
+bool
+nds32_n13_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  gcc_assert (get_attr_type (producer) == TYPE_LOAD);
+  gcc_assert (REG_P (def_reg) || GET_CODE (def_reg) == SUBREG);
+
+  return n13_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from Load-Store Unit (E3) to E2.  */
+bool
+nds32_n13_load_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg = SET_DEST (PATTERN (producer));
+
+  gcc_assert (get_attr_type (producer) == TYPE_LOAD);
+  gcc_assert (REG_P (def_reg) || GET_CODE (def_reg) == SUBREG);
+
+  return n13_consumed_by_e2_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to E1.  */
+bool
+nds32_n13_last_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return n13_consumed_by_e1_dep_p (consumer, last_def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to E2.  */
+bool
+nds32_n13_last_load_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return n13_consumed_by_e2_dep_p (consumer, last_def_reg);
+}
+
+/* Check dependencies from LMW(N, N-1) to E1.  */
+bool
+nds32_n13_last_two_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_two_def_reg = extract_nth_access_reg (producer, -2);
+
+  if (last_two_def_reg == NULL_RTX)
+    return false;
+
+  return n13_consumed_by_e1_dep_p (consumer, last_two_def_reg);
+}
+
+/* Guard functions for Panther cores.  */
+
+/* Check dependencies from E2 to E1.  */
+bool
+nds32_pn_e2_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_ALU:
+      gcc_assert (get_attr_subtype (producer) == SUBTYPE_SHIFT);
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from E3 to E1.  */
+bool
+nds32_pn_e3_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_ALU:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from E3 to E2.  */
+bool
+nds32_pn_e3_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_ALU:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e2_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from E4 to E1.  */
+bool
+nds32_pn_e4_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (pn_consumed_by_e1_dep_p (consumer, def_reg1)
+		  || pn_consumed_by_e1_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_LOAD:
+      if (post_update_insn_p (producer)
+	  && pn_consumed_by_e1_dep_p (consumer, extract_base_reg (producer)))
+	return true;
+
+      if (!load_full_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from E4 to E2.  */
+bool
+nds32_pn_e4_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (pn_consumed_by_e2_dep_p (consumer, def_reg1)
+		  || pn_consumed_by_e2_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_LOAD:
+      if (post_update_insn_p (producer)
+	  && pn_consumed_by_e2_dep_p (consumer, extract_base_reg (producer)))
+	return true;
+
+      if (!load_full_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e2_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from E4 to E3.  */
+bool
+nds32_pn_e4_to_e3_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_MUL:
+    case TYPE_MAC:
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_DIV:
+      if (divmod_p (producer))
+	{
+	  rtx def_reg1 = SET_DEST (parallel_element (producer, 0));
+	  rtx def_reg2 = SET_DEST (parallel_element (producer, 1));
+
+	  return (pn_consumed_by_e3_dep_p (consumer, def_reg1)
+		  || pn_consumed_by_e3_dep_p (consumer, def_reg2));
+	}
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_LOAD:
+      if (post_update_insn_p (producer)
+	  && pn_consumed_by_e3_dep_p (consumer, extract_base_reg (producer)))
+	return true;
+
+      if (load_partial_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    case TYPE_STORE:
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      if (!post_update_insn_p (producer))
+	return false;
+
+      def_reg = extract_base_reg (producer);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e3_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from WB to E1.  */
+bool
+nds32_pn_wb_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+      if (!load_partial_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e1_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from WB to E2.  */
+bool
+nds32_pn_wb_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+      if (!load_partial_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e2_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from WB to E3.  */
+bool
+nds32_pn_wb_to_e3_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+      if (!load_partial_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e3_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from WB to E4.  */
+bool
+nds32_pn_wb_to_e4_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx def_reg;
+
+  switch (get_attr_type (producer))
+    {
+    case TYPE_LOAD:
+      if (!load_partial_word_p (producer))
+	return false;
+
+      def_reg = SET_DEST (PATTERN (producer));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return pn_consumed_by_e4_dep_p (consumer, def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to E1.  */
+bool
+nds32_pn_last_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return pn_consumed_by_e1_dep_p (consumer, last_def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to E2.  */
+bool
+nds32_pn_last_load_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return pn_consumed_by_e2_dep_p (consumer, last_def_reg);
+}
+
+/* Check dependencies from LMW(N, N) to E3.  */
+bool
+nds32_pn_last_load_to_e3_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_def_reg = extract_nth_access_reg (producer, -1);
+
+  return pn_consumed_by_e3_dep_p (consumer, last_def_reg);
+}
+
+/* Check dependencies from LMW(N, N - 1) to E1.  */
+bool
+nds32_pn_last_two_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_two_def_reg = extract_nth_access_reg (producer, -2);
+
+  if (last_two_def_reg == NULL_RTX)
+    return false;
+
+  return pn_consumed_by_e1_dep_p (consumer, last_two_def_reg);
+}
+
+/* Check dependencies from LMW(N, N - 1) to E2.  */
+bool
+nds32_pn_last_two_load_to_e2_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_two_def_reg = extract_nth_access_reg (producer, -2);
+
+  if (last_two_def_reg == NULL_RTX)
+    return false;
+
+  return pn_consumed_by_e2_dep_p (consumer, last_two_def_reg);
+}
+
+/* Check dependencies from LMW(N, N - 2) to E1.  */
+bool
+nds32_pn_last_three_load_to_e1_p (rtx_insn *producer, rtx_insn *consumer)
+{
+  rtx last_three_def_reg = extract_nth_access_reg (producer, -3);
+
+  if (last_three_def_reg == NULL_RTX)
+    return false;
+
+  return pn_consumed_by_e1_dep_p (consumer, last_three_def_reg);
+}

 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-predicates.c b/gcc/config/nds32/nds32-predicates.c
index 361d001..b45d3e6 100644
--- a/gcc/config/nds32/nds32-predicates.c
+++ b/gcc/config/nds32/nds32-predicates.c
@@ -24,14 +24,41 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
-#include "tm_p.h"
-#include "optabs.h"		/* For GEN_FCN.  */
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
 #include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
 #include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
 #include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"

 /* ------------------------------------------------------------------------ */

@@ -98,21 +125,33 @@ nds32_consecutive_registers_load_store_p (rtx op,
    We have to extract reg and mem of every element and
    check if the information is valid for multiple load/store operation.  */
 bool
-nds32_valid_multiple_load_store (rtx op, bool load_p)
+nds32_valid_multiple_load_store_p (rtx op, bool load_p, bool bim_p)
 {
   int count;
   int first_elt_regno;
+  int update_base_elt_idx;
+  int offset;
   rtx elt;
+  rtx update_base;

-  /* Get the counts of elements in the parallel rtx.  */
-  count = XVECLEN (op, 0);
-  /* Pick up the first element.  */
-  elt = XVECEXP (op, 0, 0);
+  /* Get the counts of elements in the parallel rtx.
+     If bim_p, element 0 is the base register update and is not counted,
+     so the first load/store element is at index 1.  */
+  if (bim_p)
+    {
+      count = XVECLEN (op, 0) - 1;
+      elt = XVECEXP (op, 0, 1);
+    }
+  else
+    {
+      count = XVECLEN (op, 0);
+      elt = XVECEXP (op, 0, 0);
+    }

   /* Perform some quick check for the first element in the parallel rtx.  */
   if (GET_CODE (elt) != SET
       || count <= 1
-      || count > 8)
+      || count > 25)
     return false;

   /* Pick up regno of first element for further detail checking.
@@ -138,11 +177,29 @@ nds32_valid_multiple_load_store (rtx op, bool load_p)
      Refer to nds32-multiple.md for more information
      about following checking.
      The starting element of parallel rtx is index 0.  */
-  if (!nds32_consecutive_registers_load_store_p (op, load_p, 0,
+  if (!nds32_consecutive_registers_load_store_p (op, load_p, bim_p ? 1 : 0,
 						 first_elt_regno,
 						 count))
     return false;

+  if (bim_p)
+    {
+      update_base_elt_idx = 0;
+      update_base = XVECEXP (op, 0, update_base_elt_idx);
+      if (!REG_P (SET_DEST (update_base)))
+	return false;
+      if (GET_CODE (SET_SRC (update_base)) != PLUS)
+	return false;
+      else
+	{
+	  offset = count * UNITS_PER_WORD;
+	  elt = XEXP (SET_SRC (update_base), 1);
+	  if (GET_CODE (elt) != CONST_INT
+	      || (INTVAL (elt) != offset))
+	    return false;
+	}
+    }
+
   /* Pass all test, this is a valid rtx.  */
   return true;
 }
@@ -174,47 +231,47 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
     {
       elt = XVECEXP (op, 0, index);
       if (GET_CODE (elt) != SET)
-        return false;
+	return false;
     }

   /* For push operation, the parallel rtx looks like:
      (parallel [(set (mem (plus (reg:SI SP_REGNUM) (const_int -32)))
-                     (reg:SI Rb))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
-                     (reg:SI Rb+1))
-                ...
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
-                     (reg:SI Re))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
-                     (reg:SI FP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
-                     (reg:SI GP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
-                     (reg:SI LP_REGNUM))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int -32)))])
+		     (reg:SI Rb))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
+		     (reg:SI Rb+1))
+		...
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
+		     (reg:SI Re))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
+		     (reg:SI FP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
+		     (reg:SI GP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
+		     (reg:SI LP_REGNUM))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int -32)))])

      For pop operation, the parallel rtx looks like:
      (parallel [(set (reg:SI Rb)
-                     (mem (reg:SI SP_REGNUM)))
-                (set (reg:SI Rb+1)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
-                ...
-                (set (reg:SI Re)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
-                (set (reg:SI FP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
-                (set (reg:SI GP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
-                (set (reg:SI LP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int 32)))]) */
+		     (mem (reg:SI SP_REGNUM)))
+		(set (reg:SI Rb+1)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
+		...
+		(set (reg:SI Re)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
+		(set (reg:SI FP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
+		(set (reg:SI GP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
+		(set (reg:SI LP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int 32)))]) */

   /* 1. Consecutive registers push/pop operations.
-        We need to calculate how many registers should be consecutive.
-        The $sp adjustment rtx, $fp push rtx, $gp push rtx,
-        and $lp push rtx are excluded.  */
+	We need to calculate how many registers should be consecutive.
+	The $sp adjustment rtx, $fp push rtx, $gp push rtx,
+	and $lp push rtx are excluded.  */

   /* Detect whether we have $fp, $gp, or $lp in the parallel rtx.  */
   save_fp = reg_mentioned_p (gen_rtx_REG (SImode, FP_REGNUM), op);
@@ -238,19 +295,19 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
       first_regno = REGNO (elt_reg);

       /* The 'push' operation is a kind of store operation.
-         The 'pop' operation is a kind of load operation.
-         Pass corresponding false/true as second argument (bool load_p).
-         The par_index is supposed to start with index 0.  */
+	 The 'pop' operation is a kind of load operation.
+	 Pass corresponding false/true as second argument (bool load_p).
+	 The par_index is supposed to start with index 0.  */
       if (!nds32_consecutive_registers_load_store_p (op,
 						     !push_p ? true : false,
 						     0,
 						     first_regno,
 						     rest_count))
-        return false;
+	return false;
     }

   /* 2. Valid $fp/$gp/$lp push/pop operations.
-        Remember to set start index for checking them.  */
+	Remember to set start index for checking them.  */

   /* The rest_count is the start index for checking $fp/$gp/$lp.  */
   index = rest_count;
@@ -269,9 +326,9 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
       index++;

       if (GET_CODE (elt_mem) != MEM
-          || GET_CODE (elt_reg) != REG
-          || REGNO (elt_reg) != FP_REGNUM)
-        return false;
+	  || GET_CODE (elt_reg) != REG
+	  || REGNO (elt_reg) != FP_REGNUM)
+	return false;
     }
   if (save_gp)
     {
@@ -281,9 +338,9 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
       index++;

       if (GET_CODE (elt_mem) != MEM
-          || GET_CODE (elt_reg) != REG
-          || REGNO (elt_reg) != GP_REGNUM)
-        return false;
+	  || GET_CODE (elt_reg) != REG
+	  || REGNO (elt_reg) != GP_REGNUM)
+	return false;
     }
   if (save_lp)
     {
@@ -293,16 +350,16 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
       index++;

       if (GET_CODE (elt_mem) != MEM
-          || GET_CODE (elt_reg) != REG
-          || REGNO (elt_reg) != LP_REGNUM)
-        return false;
+	  || GET_CODE (elt_reg) != REG
+	  || REGNO (elt_reg) != LP_REGNUM)
+	return false;
     }

   /* 3. The last element must be stack adjustment rtx.
-        Its form of rtx should be:
-          (set (reg:SI SP_REGNUM)
-               (plus (reg:SI SP_REGNUM) (const_int X)))
-        The X could be positive or negative value.  */
+	Its form of rtx should be:
+	  (set (reg:SI SP_REGNUM)
+	       (plus (reg:SI SP_REGNUM) (const_int X)))
+	The X could be positive or negative value.  */

   /* Pick up the last element.  */
   elt = XVECEXP (op, 0, total_count - 1);
@@ -322,54 +379,57 @@ nds32_valid_stack_push_pop_p (rtx op, bool push_p)
 }

 /* Function to check if 'bclr' instruction can be used with IVAL.  */
-int
-nds32_can_use_bclr_p (int ival)
+bool
+nds32_can_use_bclr_p (HOST_WIDE_INT ival)
 {
   int one_bit_count;
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (SImode);

   /* Calculate the number of 1-bit of (~ival), if there is only one 1-bit,
      it means the original ival has only one 0-bit,
      So it is ok to perform 'bclr' operation.  */

-  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (~ival));
+  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (~ival) & mask);

   /* 'bclr' is a performance extension instruction.  */
-  return (TARGET_PERF_EXT && (one_bit_count == 1));
+  return (TARGET_EXT_PERF && (one_bit_count == 1));
 }

 /* Function to check if 'bset' instruction can be used with IVAL.  */
-int
-nds32_can_use_bset_p (int ival)
+bool
+nds32_can_use_bset_p (HOST_WIDE_INT ival)
 {
   int one_bit_count;
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (SImode);

   /* Caculate the number of 1-bit of ival, if there is only one 1-bit,
      it is ok to perform 'bset' operation.  */

-  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (ival));
+  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (ival) & mask);

   /* 'bset' is a performance extension instruction.  */
-  return (TARGET_PERF_EXT && (one_bit_count == 1));
+  return (TARGET_EXT_PERF && (one_bit_count == 1));
 }

 /* Function to check if 'btgl' instruction can be used with IVAL.  */
-int
-nds32_can_use_btgl_p (int ival)
+bool
+nds32_can_use_btgl_p (HOST_WIDE_INT ival)
 {
   int one_bit_count;
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (SImode);

   /* Caculate the number of 1-bit of ival, if there is only one 1-bit,
      it is ok to perform 'btgl' operation.  */

-  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (ival));
+  one_bit_count = popcount_hwi ((unsigned HOST_WIDE_INT) (ival) & mask);

   /* 'btgl' is a performance extension instruction.  */
-  return (TARGET_PERF_EXT && (one_bit_count == 1));
+  return (TARGET_EXT_PERF && (one_bit_count == 1));
 }

 /* Function to check if 'bitci' instruction can be used with IVAL.  */
-int
-nds32_can_use_bitci_p (int ival)
+bool
+nds32_can_use_bitci_p (HOST_WIDE_INT ival)
 {
   /* If we are using V3 ISA, we have 'bitci' instruction.
      Try to see if we can present 'andi' semantic with
@@ -381,4 +441,286 @@ nds32_can_use_bitci_p (int ival)
 	  && satisfies_constraint_Iu15 (gen_int_mode (~ival, SImode)));
 }

+/* Return true if INSN is a TYPE_LOAD or TYPE_STORE insn whose memory
+   operand is addressed through a SYMBOL_REF or a LO_SUM.  */
+bool
+nds32_symbol_load_store_p (rtx_insn *insn)
+{
+  rtx mem;
+  enum rtx_code addr_code;
+
+  /* The memory reference is the source of a load and the destination
+     of a store; no other insn type can match.  */
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+      mem = SET_SRC (PATTERN (insn));
+      break;
+    case TYPE_STORE:
+      mem = SET_DEST (PATTERN (insn));
+      break;
+    default:
+      return false;
+    }
+
+  if (mem == NULL_RTX)
+    return false;
+
+  /* Look through a zero or sign extension to reach the memory operand.  */
+  if (GET_CODE (mem) == ZERO_EXTEND || GET_CODE (mem) == SIGN_EXTEND)
+    mem = XEXP (mem, 0);
+
+  addr_code = GET_CODE (XEXP (mem, 0));
+
+  return addr_code == SYMBOL_REF || addr_code == LO_SUM;
+}
+
+/* Return true if memory operand OP has an address usable by
+   floating-point loads and stores.  */
+bool
+nds32_float_mem_operand_p (rtx op)
+{
+  enum machine_mode mode = GET_MODE (op);
+  rtx addr = XEXP (op, 0);
+  rtx offset;
+
+  /* [symbol], [const] and [lo_sum] addresses are not supported.  */
+  if (GET_CODE (addr) == SYMBOL_REF
+      || GET_CODE (addr) == CONST
+      || GET_CODE (addr) == LO_SUM)
+    return false;
+
+  /* Only base-plus-offset addresses need any further checking.  */
+  if (GET_CODE (addr) != PLUS)
+    return true;
+
+  if (GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
+    return false;
+
+  offset = XEXP (addr, 1);
+  if (GET_CODE (offset) != CONST_INT)
+    return true;
+
+  /* A constant offset aligned for MODE must satisfy Is14 (imm12s << 2).
+     NOTE(review): a misaligned offset is accepted here -- confirm.  */
+  bool word_p = ((mode == SImode || mode == SFmode)
+		 && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (offset)));
+  bool dword_p = ((mode == DImode || mode == DFmode)
+		  && NDS32_DOUBLE_WORD_ALIGN_P (INTVAL (offset)));
+
+  return !((word_p || dword_p) && !satisfies_constraint_Is14 (offset));
+}
+
+/* Return nonzero if CMP_RTX is an ORDERED or UNORDERED comparison whose
+   two operands are each in SFmode or DFmode.  */
+int
+nds32_cond_move_p (rtx cmp_rtx)
+{
+  enum machine_mode mode0 = GET_MODE (XEXP (cmp_rtx, 0));
+  enum machine_mode mode1 = GET_MODE (XEXP (cmp_rtx, 1));
+  enum rtx_code code = GET_CODE (cmp_rtx);
+
+  return ((code == ORDERED || code == UNORDERED)
+	  && (mode0 == DFmode || mode0 == SFmode)
+	  && (mode1 == DFmode || mode1 == SFmode));
+}
+
+/* Return true if the addresses in mem1 and mem2 are suitable for use in
+   an fldi or fsdi instruction.
+
+   This holds only when all of the following are true:
+     - neither mem is volatile;
+     - mem1 is known to be aligned on a 64-bit boundary;
+     - both addresses use the same base register;
+     - the offset of mem1 (0 for a bare register) is a multiple of 8;
+     - the offset of mem2 is 4 more than that (addr1 + 4 == addr2).  */
+bool
+nds32_memory_merge_peep_p (rtx mem1, rtx mem2)
+{
+  rtx addr1;
+  rtx addr2;
+  rtx base1;
+  HOST_WIDE_INT offset1;
+
+  /* The mems cannot be volatile.  */
+  if (MEM_VOLATILE_P (mem1) || MEM_VOLATILE_P (mem2))
+    return false;
+
+  /* MEM1 should be aligned on a 64-bit boundary.  */
+  if (MEM_ALIGN (mem1) < 64)
+    return false;
+
+  /* Work on the address expressions from here on.  */
+  addr1 = XEXP (mem1, 0);
+  addr2 = XEXP (mem2, 0);
+
+  /* Split the first address into a base register and a constant offset.
+     It is either (reg) or (plus (reg) (const_int)).  */
+  if (GET_CODE (addr1) == REG)
+    {
+      /* A simple (mem (reg)) expression.  Offset is 0.  */
+      base1 = addr1;
+      offset1 = 0;
+    }
+  else if (GET_CODE (addr1) == PLUS
+	   && GET_CODE (XEXP (addr1, 0)) == REG
+	   && GET_CODE (XEXP (addr1, 1)) == CONST_INT)
+    {
+      /* (plus (reg) (const_int)): take both parts.  */
+      base1 = XEXP (addr1, 0);
+      offset1 = INTVAL (XEXP (addr1, 1));
+    }
+  else
+    return false;
+
+  /* The second address must be (plus (reg) (const_int)).  */
+  if (GET_CODE (addr2) != PLUS)
+    return false;
+
+  if (GET_CODE (XEXP (addr2, 0)) != REG
+      || GET_CODE (XEXP (addr2, 1)) != CONST_INT)
+    return false;
+
+  /* Both addresses must use the same base register.  */
+  if (REGNO (base1) != REGNO (XEXP (addr2, 0)))
+    return false;
+
+  /* The first offset must be evenly divisible by 8 to ensure the
+     address is 64 bit aligned.  */
+  if (offset1 % 8 != 0)
+    return false;
+
+  /* The offset for the second addr must be 4 more than the first addr.  */
+  return INTVAL (XEXP (addr2, 1)) == offset1 + 4;
+}
+
+/* Return true if OP is a CONST_DOUBLE in mode MODE and the value that
+   REAL_VALUE_TO_TARGET_SINGLE produces for it lies in the half-open
+   range [LOWER, UPPER).  */
+bool
+nds32_const_double_range_ok_p (rtx op, enum machine_mode mode,
+			       HOST_WIDE_INT lower, HOST_WIDE_INT upper)
+{
+  long image;
+
+  if (!(GET_CODE (op) == CONST_DOUBLE && GET_MODE (op) == mode))
+    return false;
+
+  REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (op), image);
+
+  return lower <= image && image < upper;
+}
+
+/* Return false if X is a CONST wrapping, directly or as the first operand
+   of a PLUS, one of the GOT/PLT/TLS UNSPECs listed below, or if X (after
+   the same unwrapping) is a SYMBOL_REF with a TLS model.  Return true in
+   every other case, including a CONST wrapping any other UNSPEC.  */
+bool
+nds32_const_unspec_p (rtx x)
+{
+  if (GET_CODE (x) == CONST)
+    {
+      rtx inner = XEXP (x, 0);
+
+      if (GET_CODE (inner) == PLUS)
+	inner = XEXP (inner, 0);
+
+      if (GET_CODE (inner) == UNSPEC)
+	switch (XINT (inner, 1))
+	  {
+	  case UNSPEC_GOTINIT:
+	  case UNSPEC_GOT:
+	  case UNSPEC_GOTOFF:
+	  case UNSPEC_PLT:
+	  case UNSPEC_TLSGD:
+	  case UNSPEC_TLSLD:
+	  case UNSPEC_TLSIE:
+	  case UNSPEC_TLSLE:
+	    return false;
+	  default:
+	    return true;
+	  }
+
+      x = inner;
+    }
+
+  return !(GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x));
+}
+
+/* Pack the elements of the CONST_VECTOR OP, which must be in V2HImode or
+   V4QImode, into a single integer.  Each element is masked to its width
+   (16 or 8 bits) and placed at successive shifts starting from bit 0.
+   On a little-endian target element 0 goes into the low bits; on a
+   big-endian target the last element does.  Any other mode reaches
+   gcc_unreachable.  */
+HOST_WIDE_INT
+const_vector_to_hwint (rtx op)
+{
+  HOST_WIDE_INT hwint = 0;
+  HOST_WIDE_INT mask;
+  int i;
+  int shift_adv;
+  int shift = 0;
+  int nelem;
+
+  switch (GET_MODE (op))
+    {
+      case V2HImode:
+	mask = 0xffff;
+	shift_adv = 16;
+	nelem = 2;
+	break;
+      case V4QImode:
+	mask = 0xff;
+	shift_adv = 8;
+	nelem = 4;
+	break;
+      default:
+	gcc_unreachable ();
+    }
+
+  for (i = 0; i < nelem; ++i)
+    {
+      /* Walk the elements starting from the one that lands in the low bits.  */
+      int idx = TARGET_BIG_ENDIAN ? nelem - i - 1 : i;
+
+      /* The elements are expected to be CONST_INTs, whose HOST_WIDE_INT
+	 value is read with INTVAL; XINT is only for 'i'/'n' operands.  */
+      HOST_WIDE_INT val = INTVAL (XVECEXP (op, 0, idx));
+
+      hwint |= (val & mask) << shift;
+      shift = shift + shift_adv;
+    }
+
+  return hwint;
+}
+
+bool
+nds32_valid_CVp5_p (rtx op)
+{
+  HOST_WIDE_INT ival = const_vector_to_hwint (op);
+  return (ival < ((1 << 5) + 16)) && (ival >= (0 + 16)); /* 16 <= ival < 48 */
+}
+
+bool
+nds32_valid_CVs5_p (rtx op)
+{
+  HOST_WIDE_INT ival = const_vector_to_hwint (op);
+  return (ival < (1 << 4)) && (ival >= -(1 << 4)); /* -16 <= ival < 16 */
+}
+
+bool
+nds32_valid_CVs2_p (rtx op)
+{
+  HOST_WIDE_INT ival = const_vector_to_hwint (op);
+  return (ival < (1 << 19)) && (ival >= -(1 << 19)); /* [-2^19, 2^19) */
+}
+
+bool
+nds32_valid_CVhi_p (rtx op)
+{
+  HOST_WIDE_INT ival = const_vector_to_hwint (op);
+  return (ival != 0) && ((ival & 0xfff) == 0); /* nonzero, bits 0-11 clear */
+}
+
 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-protos.h b/gcc/config/nds32/nds32-protos.h
index d66749d..19e69e3 100644
--- a/gcc/config/nds32/nds32-protos.h
+++ b/gcc/config/nds32/nds32-protos.h
@@ -28,10 +28,14 @@ extern void nds32_init_expanders (void);

 /* Register Usage.  */

+/* -- Order of Allocation of Registers.  */
+extern void nds32_adjust_reg_alloc_order (void);
+
 /* -- How Values Fit in Registers.  */

-extern int nds32_hard_regno_nregs (int, machine_mode);
-extern int nds32_hard_regno_mode_ok (int, machine_mode);
+extern int nds32_hard_regno_nregs (int, enum machine_mode);
+extern int nds32_hard_regno_mode_ok (int, enum machine_mode);
+extern int nds32_modes_tieable_p (enum machine_mode, enum machine_mode);


 /* Register Classes.  */
@@ -43,6 +47,7 @@ extern enum reg_class nds32_regno_reg_class (int);

 /* -- Basic Stack Layout.  */

+extern rtx nds32_dynamic_chain_address (rtx);
 extern rtx nds32_return_addr_rtx (int, rtx);

 /* -- Eliminating Frame Pointer and Arg Pointer.  */
@@ -61,22 +66,88 @@ extern void nds32_expand_prologue (void);
 extern void nds32_expand_epilogue (bool);
 extern void nds32_expand_prologue_v3push (void);
 extern void nds32_expand_epilogue_v3pop (bool);
+extern void nds32_emit_push_fpr_callee_saved (int);
+extern void nds32_emit_pop_fpr_callee_saved (int);
+extern void nds32_emit_v3pop_fpr_callee_saved (int);
+
+/* Controlling Debugging Information Format.  */
+
+extern unsigned int nds32_dbx_register_number (unsigned int);

 /* ------------------------------------------------------------------------ */

-/* Auxiliary functions for auxiliary macros in nds32.h.  */
+/* Auxiliary functions for manipulating DI mode values.  */

-extern bool nds32_ls_333_p (rtx, rtx, rtx, machine_mode);
+extern rtx nds32_di_high_part_subreg(rtx);
+extern rtx nds32_di_low_part_subreg(rtx);

 /* Auxiliary functions for expanding rtl used in nds32-multiple.md.  */

-extern rtx nds32_expand_load_multiple (int, int, rtx, rtx);
-extern rtx nds32_expand_store_multiple (int, int, rtx, rtx);
-extern int nds32_expand_movmemqi (rtx, rtx, rtx, rtx);
+extern rtx nds32_expand_load_multiple (int, int, rtx, rtx, bool, rtx *);
+extern rtx nds32_expand_store_multiple (int, int, rtx, rtx, bool, rtx *);
+extern bool nds32_expand_movmemsi (rtx, rtx, rtx, rtx);
+extern bool nds32_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx);
+extern bool nds32_expand_movstr (rtx, rtx, rtx);
+extern bool nds32_expand_strlen (rtx, rtx, rtx, rtx);

 /* Auxiliary functions for multiple load/store predicate checking.  */

-extern bool nds32_valid_multiple_load_store (rtx, bool);
+extern bool nds32_valid_multiple_load_store_p (rtx, bool, bool);
+
+/* Auxiliary functions for guard function checking in pipelines.md.  */
+
+extern bool nds32_n7_load_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n7_last_load_to_ii_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_n8_load_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_load_bi_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_load_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_ex_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_last_load_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_last_load_two_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n8_last_load_to_ex_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_e8_load_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_e8_load_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_e8_ex_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_e8_last_load_to_ii_p (rtx_insn *, rtx_insn *);
+extern bool nds32_e8_last_load_to_ex_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_n9_2r1w_mm_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n9_3r2w_mm_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n9_last_load_to_ex_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_n10_ex_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n10_mm_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n10_last_load_to_ex_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_gw_ex_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_gw_mm_to_ex_p (rtx_insn *, rtx_insn *);
+extern bool nds32_gw_last_load_to_ex_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_n13_e2_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n13_load_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n13_load_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n13_last_load_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n13_last_load_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_n13_last_two_load_to_e1_p (rtx_insn *, rtx_insn *);
+
+extern bool nds32_pn_e2_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_e3_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_e3_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_e4_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_e4_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_e4_to_e3_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_wb_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_wb_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_wb_to_e3_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_wb_to_e4_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_load_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_load_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_load_to_e3_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_two_load_to_e1_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_two_load_to_e2_p (rtx_insn *, rtx_insn *);
+extern bool nds32_pn_last_three_load_to_e1_p (rtx_insn *, rtx_insn *);

 /* Auxiliary functions for stack operation predicate checking.  */

@@ -84,55 +155,176 @@ extern bool nds32_valid_stack_push_pop_p (rtx, bool);

 /* Auxiliary functions for bit operation detection.  */

-extern int nds32_can_use_bclr_p (int);
-extern int nds32_can_use_bset_p (int);
-extern int nds32_can_use_btgl_p (int);
+extern bool nds32_can_use_bclr_p (HOST_WIDE_INT);
+extern bool nds32_can_use_bset_p (HOST_WIDE_INT);
+extern bool nds32_can_use_btgl_p (HOST_WIDE_INT);

-extern int nds32_can_use_bitci_p (int);
+extern bool nds32_can_use_bitci_p (HOST_WIDE_INT);

-/* Auxiliary function for 'Computing the Length of an Insn'.  */
+extern bool nds32_const_double_range_ok_p (rtx, enum machine_mode,
+					   HOST_WIDE_INT, HOST_WIDE_INT);

-extern int nds32_adjust_insn_length (rtx_insn *, int);
+extern bool nds32_const_unspec_p (rtx x);

 /* Auxiliary functions for FP_AS_GP detection.  */

-extern int nds32_fp_as_gp_check_available (void);
+extern bool nds32_symbol_load_store_p (rtx_insn *);
+extern bool nds32_naked_function_p (tree);

 /* Auxiliary functions for jump table generation.  */

 extern const char *nds32_output_casesi_pc_relative (rtx *);
 extern const char *nds32_output_casesi (rtx *);

+/* Auxiliary functions for conditional branch generation.  */
+
+extern enum nds32_expand_result_type nds32_expand_cbranch (rtx *);
+extern enum nds32_expand_result_type nds32_expand_cstore (rtx *);
+extern void nds32_expand_float_cbranch (rtx *);
+extern void nds32_expand_float_cstore (rtx *);
+
+/* Auxiliary functions for conditional move generation.  */
+
+extern enum nds32_expand_result_type nds32_expand_movcc (rtx *);
+extern void nds32_expand_float_movcc (rtx *);
+
+/* Auxiliary functions for expanding unaligned load instructions.  */
+
+extern void nds32_expand_unaligned_load (rtx *, enum machine_mode);
+
+/* Auxiliary functions for expand extv/insv instruction.  */
+
+extern enum nds32_expand_result_type nds32_expand_extv (rtx *);
+extern enum nds32_expand_result_type nds32_expand_insv (rtx *);
+
+/* Auxiliary functions for expanding unaligned store instructions.  */
+
+extern void nds32_expand_unaligned_store (rtx *, enum machine_mode);
+
+/* Auxiliary functions for expand PIC instruction.  */
+
+extern void nds32_expand_pic_move (rtx *);
+
+/* Auxiliary functions to legitimize PIC address.  */
+
+extern rtx nds32_legitimize_pic_address (rtx);
+
+/* Auxiliary functions for expand TLS instruction.  */
+
+extern void nds32_expand_tls_move (rtx *);
+
+/* Auxiliary functions to legitimize TLS address.  */
+
+extern rtx nds32_legitimize_tls_address (rtx);
+
+/* Auxiliary functions to identify thread-local symbol.  */
+
+extern bool nds32_tls_referenced_p (rtx);
+
+/* Auxiliary functions for expand ICT instruction.  */
+
+extern void nds32_expand_ict_move (rtx *);
+
+/* Auxiliary functions to legitimize address for indirect-call symbol.  */
+
+extern rtx nds32_legitimize_ict_address (rtx);
+
+/* Auxiliary functions to identify indirect-call symbol.  */
+
+extern bool nds32_indirect_call_referenced_p (rtx);
+
+/* Auxiliary functions to identify long-call symbol.  */
+extern bool nds32_long_call_p (rtx);
+
+/* Auxiliary functions to identify SYMBOL_REF and LABEL_REF pattern.  */
+
+extern bool symbolic_reference_mentioned_p (rtx);
+
+/* Auxiliary functions to identify conditional move comparison operand.  */
+
+extern int nds32_cond_move_p (rtx);
+
+/* Auxiliary functions to identify address for peephole2 merge instruction.  */
+
+extern bool nds32_memory_merge_peep_p (rtx, rtx);
+
 /* Auxiliary functions to identify 16 bit addresing mode.  */

 extern enum nds32_16bit_address_type nds32_mem_format (rtx);

+/* Auxiliary functions to identify floating-point addressing mode.  */
+
+extern bool nds32_float_mem_operand_p (rtx);
+
 /* Auxiliary functions to output assembly code.  */

 extern const char *nds32_output_16bit_store (rtx *, int);
 extern const char *nds32_output_16bit_load (rtx *, int);
 extern const char *nds32_output_32bit_store (rtx *, int);
 extern const char *nds32_output_32bit_load (rtx *, int);
-extern const char *nds32_output_32bit_load_s (rtx *, int);
+extern const char *nds32_output_32bit_load_se (rtx *, int);
+extern const char *nds32_output_float_load(rtx *);
+extern const char *nds32_output_float_store(rtx *);
+extern const char *nds32_output_smw_single_word (rtx *);
+extern const char *nds32_output_smw_double_word (rtx *);
+extern const char *nds32_output_lmw_single_word (rtx *);
+extern const char *nds32_output_double (rtx *, bool);
+extern const char *nds32_output_cbranchsi4_equality_zero (rtx_insn *, rtx *);
+extern const char *nds32_output_cbranchsi4_equality_reg (rtx_insn *, rtx *);
+extern const char *nds32_output_cbranchsi4_equality_reg_or_const_int (rtx_insn *,
+								      rtx *);
+extern const char *nds32_output_cbranchsi4_greater_less_zero (rtx_insn *, rtx *);
+
+extern const char *nds32_output_unpkd8 (rtx, rtx, rtx, rtx, bool);
+
+extern const char *nds32_output_call (rtx, rtx *, rtx,
+				      const char *, const char *, bool);
+extern const char *nds32_output_tls_desc (rtx *);
+extern const char *nds32_output_tls_ie (rtx *);

 /* Auxiliary functions to output stack push/pop instruction.  */

 extern const char *nds32_output_stack_push (rtx);
 extern const char *nds32_output_stack_pop (rtx);
+extern const char *nds32_output_return (void);
+
+
+/* Auxiliary functions to split/output sms pattern.  */
+extern bool nds32_need_split_sms_p (rtx, rtx, rtx, rtx);
+extern const char *nds32_output_sms (rtx, rtx, rtx, rtx);
+extern void nds32_split_sms (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+
+/* Auxiliary functions to split double word RTX pattern.  */
+
+extern void nds32_spilt_doubleword (rtx *, bool);
+extern void nds32_split_ashiftdi3 (rtx, rtx, rtx);
+extern void nds32_split_ashiftrtdi3 (rtx, rtx, rtx);
+extern void nds32_split_lshiftrtdi3 (rtx, rtx, rtx);
+extern void nds32_split_rotatertdi3 (rtx, rtx, rtx);
+
+/* Auxiliary functions to split large constant RTX pattern.  */
+
+extern void nds32_expand_constant (enum machine_mode,
+				   HOST_WIDE_INT, rtx, rtx);

 /* Auxiliary functions to check using return with null epilogue.  */

 extern int nds32_can_use_return_insn (void);
+extern enum machine_mode nds32_case_vector_shorten_mode (int, int, rtx);

 /* Auxiliary functions to decide output alignment or not.  */

 extern int nds32_target_alignment (rtx);
+extern unsigned int nds32_data_alignment (tree, unsigned int);
+extern unsigned int nds32_constant_alignment (tree, unsigned int);
+extern unsigned int nds32_local_alignment (tree, unsigned int);

 /* Auxiliary functions to expand builtin functions.  */

 extern void nds32_init_builtins_impl (void);
 extern rtx nds32_expand_builtin_impl (tree, rtx, rtx,
-				      machine_mode, int);
+				      enum machine_mode, int);
+extern tree nds32_builtin_decl_impl (unsigned, bool);

 /* Auxiliary functions for ISR implementation.  */

@@ -141,10 +333,86 @@ extern void nds32_construct_isr_vectors_information (tree, const char *);
 extern void nds32_asm_file_start_for_isr (void);
 extern void nds32_asm_file_end_for_isr (void);
 extern bool nds32_isr_function_p (tree);
+extern bool nds32_isr_function_critical_p (tree);

 /* Auxiliary functions for cost calculation.  */

+extern void nds32_init_rtx_costs (void);
 extern bool nds32_rtx_costs_impl (rtx, machine_mode, int, int, int *, bool);
-extern int nds32_address_cost_impl (rtx, machine_mode, addr_space_t, bool);
+extern int nds32_address_cost_impl (rtx, enum machine_mode, addr_space_t, bool);
+extern struct register_pass_info insert_pass_fp_as_gp;
+
+extern int nds32_adjust_insn_length (rtx_insn *, int);
+
+/* Auxiliary functions for pre-defined macros.  */
+extern void nds32_cpu_cpp_builtins(struct cpp_reader *);
+
+/* Auxiliary functions for const_vector's constraints.  */
+
+extern HOST_WIDE_INT const_vector_to_hwint (rtx);
+extern bool nds32_valid_CVp5_p (rtx);
+extern bool nds32_valid_CVs5_p (rtx);
+extern bool nds32_valid_CVs2_p (rtx);
+extern bool nds32_valid_CVhi_p (rtx);
+
+/* Auxiliary functions for lwm/smw.  */
+
+extern bool nds32_valid_smw_lwm_base_p (rtx);
+
+/* Auxiliary functions for register rename pass.  */
+extern reg_class_t nds32_preferred_rename_class_impl (reg_class_t);
+
+extern bool nds32_split_double_word_load_store_p (rtx *,bool);
+
+namespace nds32 {
+
+extern rtx extract_pattern_from_insn (rtx);
+
+size_t parallel_elements (rtx);
+rtx parallel_element (rtx, int);
+
+bool insn_pseudo_nop_p (rtx_insn *);
+bool insn_executable_p (rtx_insn *);
+rtx_insn *prev_executable_insn (rtx_insn *);
+rtx_insn *next_executable_insn (rtx_insn *);
+rtx_insn *prev_executable_insn_local (rtx_insn *);
+rtx_insn *next_executable_insn_local (rtx_insn *);
+bool insn_deleted_p (rtx_insn *);
+
+bool load_single_p (rtx_insn *);
+bool store_single_p (rtx_insn *);
+bool load_double_p (rtx_insn *);
+bool store_double_p (rtx_insn *);
+bool store_offset_reg_p (rtx_insn *);
+bool load_full_word_p (rtx_insn *);
+bool load_partial_word_p (rtx_insn *);
+bool post_update_insn_p (rtx_insn *);
+bool immed_offset_p (rtx);
+int find_post_update_rtx (rtx_insn *);
+rtx extract_mem_rtx (rtx_insn *);
+rtx extract_base_reg (rtx_insn *);
+rtx extract_offset_rtx (rtx_insn *);
+
+rtx extract_shift_reg (rtx_insn *);
+
+bool movd44_insn_p (rtx_insn *);
+rtx extract_movd44_even_reg (rtx_insn *);
+rtx extract_movd44_odd_reg (rtx_insn *);
+
+rtx extract_mac_acc_rtx (rtx_insn *);
+rtx extract_mac_non_acc_rtx (rtx_insn *);
+
+bool divmod_p (rtx_insn *);
+
+rtx extract_branch_target_rtx (rtx_insn *);
+rtx extract_branch_condition_rtx (rtx_insn *);
+
+void compute_bb_for_insn_safe ();
+
+void exchange_insns (rtx_insn *, rtx_insn *);
+
+} // namespace nds32
+
+extern bool nds32_include_fp_arith;

 /* ------------------------------------------------------------------------ */
diff --git a/gcc/config/nds32/nds32-reg-utils.c b/gcc/config/nds32/nds32-reg-utils.c
new file mode 100644
index 0000000..1fd8a83
--- /dev/null
+++ b/gcc/config/nds32/nds32-reg-utils.c
@@ -0,0 +1,190 @@
+
+/* Register usage utilities of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ------------------------------------------------------------------------ */
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "target-globals.h"
+#include "ira.h"
+#include "ira-int.h"
+#include "nds32-reg-utils.h"
+
+#define NDS32_GPR_NUM 32
+
+static bool debug_live_reg = false;
+
+/* Seed *LIVE with the registers live before FIRST in BB, then add every
+   register defined by the insns from FIRST through LAST inclusive.
+   LAST must satisfy INSN_P.  */
+void
+nds32_live_regs (basic_block bb, rtx_insn *first, rtx_insn *last, bitmap *live)
+{
+  /* Dumping is additionally gated by the file-local debug_live_reg.  */
+  const bool dump_p = dump_file && debug_live_reg;
+  rtx_insn *cur;
+  df_ref def_ref;
+
+  /* Replay the block from its head up to, but not including, FIRST.  */
+  bitmap_copy (*live, DF_LR_IN (bb));
+  df_simulate_initialize_forwards (bb, *live);
+  for (cur = BB_HEAD (bb); cur != first; cur = NEXT_INSN (cur))
+    df_simulate_one_insn_forwards (bb, cur, *live);
+
+  if (dump_p)
+    {
+      fprintf (dump_file, "scan live regs:\nfrom:\n");
+      print_rtl_single (dump_file, first);
+
+      fprintf (dump_file, "to:\n");
+      print_rtl_single (dump_file, last);
+
+      fprintf (dump_file, "bb lr in:\n");
+      dump_bitmap (dump_file, DF_LR_IN (bb));
+
+      fprintf (dump_file, "init:\n");
+      dump_bitmap (dump_file, *live);
+    }
+
+  /* Walk FIRST..LAST.  This walk only ever sets bits for defined
+     registers; it never clears any.  */
+  for (cur = first; ; cur = NEXT_INSN (cur))
+    {
+      if (cur == last)
+	gcc_assert (INSN_P (cur));
+      else if (!INSN_P (cur))
+	continue;
+
+      FOR_EACH_INSN_DEF (def_ref, cur)
+	bitmap_set_bit (*live, DF_REF_REGNO (def_ref));
+
+      if (dump_p)
+	{
+	  fprintf (dump_file, "scaning:\n");
+	  print_rtl_single (dump_file, cur);
+	  dump_bitmap (dump_file, *live);
+	}
+
+      if (cur == last)
+	break;
+    }
+}
+
+/* Print PREFIX to FILE, followed by the names of the hard registers in
+   SET as a comma-separated list in braces, and a newline.  */
+void
+print_hard_reg_set (FILE *file, const char *prefix, HARD_REG_SET set)
+{
+  bool seen_any = false;
+
+  fprintf (file, "%s{ ", prefix);
+
+  for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    {
+      if (!TEST_HARD_REG_BIT (set, regno))
+	continue;
+
+      /* Every name after the first one is preceded by a comma and a
+	 space.  */
+      fprintf (file, seen_any ? ", %s" : "%s", reg_names[regno]);
+      seen_any = true;
+    }
+
+  fprintf (file, "}\n");
+}
+
+/* Store in *AVAILABLE_REGSET the GENERAL_REGS registers that are neither
+   fixed GPRs nor reported live by nds32_live_regs for FIRST..LAST in BB.  */
+void
+nds32_get_available_reg_set (basic_block bb,
+			     rtx_insn *first,
+			     rtx_insn *last,
+			     HARD_REG_SET *available_regset)
+{
+  HARD_REG_SET busy;
+  bitmap live_bits = BITMAP_ALLOC (&reg_obstack);
+
+  /* Collect the live registers, then release the scratch bitmap as soon
+     as its contents have been copied into a hard register set.  */
+  nds32_live_regs (bb, first, last, &live_bits);
+  REG_SET_TO_HARD_REG_SET (busy, live_bits);
+  BITMAP_FREE (live_bits);
+
+  /* Whatever is not live is a candidate ...  */
+  COMPL_HARD_REG_SET (*available_regset, busy);
+
+  /* ... as long as it belongs to GENERAL_REGS ...  */
+  AND_HARD_REG_SET (*available_regset, reg_class_contents[GENERAL_REGS]);
+
+  /* ... and is not a fixed register.  */
+  for (unsigned regno = NDS32_FIRST_GPR_REGNUM;
+       regno <= NDS32_LAST_GPR_REGNUM;
+       ++regno)
+    if (fixed_regs[regno])
+      CLEAR_HARD_REG_BIT (*available_regset, regno);
+}
diff --git a/gcc/config/nds32/nds32-reg-utils.h b/gcc/config/nds32/nds32-reg-utils.h
new file mode 100644
index 0000000..16c23a3
--- /dev/null
+++ b/gcc/config/nds32/nds32-reg-utils.h
@@ -0,0 +1,61 @@
+/* Prototypes for register usage utilities of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NDS32_REG_UTILS_OPT_H
+#define NDS32_REG_UTILS_OPT_H
+
+/* Auxiliary functions for register usage analysis.  */
+extern void nds32_live_regs (basic_block, rtx_insn *, rtx_insn *, bitmap *);
+extern void print_hard_reg_set (FILE *, const char *, HARD_REG_SET);
+extern void nds32_get_available_reg_set (basic_block, rtx_insn *,
+					 rtx_insn *, HARD_REG_SET *);
+
+static inline bool
+in_reg_class_p (unsigned regno, enum reg_class clazz)
+{ /* True iff hard register REGNO is a member of register class CLAZZ.  */
+  return TEST_HARD_REG_BIT (reg_class_contents[clazz], regno);
+}
+
+static inline bool
+in_reg_class_p (rtx reg, enum reg_class clazz)
+{ /* Overload for a register rtx: test REGNO (REG) against CLAZZ.  */
+  gcc_assert (REG_P (reg));
+  return in_reg_class_p (REGNO (reg), clazz);
+}
+
+/* Return the first register of CLAZZ that is in *AVAILABLE_REGSET and is
+   either call-used or already marked ever-live; INVALID_REGNUM if there
+   is no such register.  */
+static inline unsigned
+find_available_reg (HARD_REG_SET *available_regset, enum reg_class clazz)
+{
+  unsigned r;
+  hard_reg_set_iterator it;
+
+  EXECUTE_IF_SET_IN_HARD_REG_SET (reg_class_contents[clazz], 0, r, it)
+    if (TEST_HARD_REG_BIT (*available_regset, r)
+	&& (call_used_regs[r] || df_regs_ever_live_p (r)))
+      return r;
+  return INVALID_REGNUM;
+}
+
+
+
+#endif
diff --git a/gcc/config/nds32/nds32-regrename.c b/gcc/config/nds32/nds32-regrename.c
new file mode 100644
index 0000000..0875722
--- /dev/null
+++ b/gcc/config/nds32/nds32-regrename.c
@@ -0,0 +1,389 @@
+/* Register rename pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "regrename.h"
+
+static reg_class_t current_preferred_rename_class = NO_REGS;
+
+/* Return the file-local current_preferred_rename_class when RCLASS is
+   GENERAL_REGS, and NO_REGS for every other class.  */
+reg_class_t
+nds32_preferred_rename_class_impl (reg_class_t rclass)
+{
+  const bool general_p = (rclass == GENERAL_REGS);
+  return general_p ? current_preferred_rename_class : (reg_class_t) NO_REGS;
+}
+
+/* Print to FILE the numbers of the hard registers in SET, each followed
+   by a space, enclosed in braces and terminated by a newline.  */
+static void
+print_hard_reg_set (FILE *file, HARD_REG_SET set)
+{
+  fprintf (file, "{ ");
+
+  for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (TEST_HARD_REG_BIT (set, regno))
+      fprintf (file, "%d ", regno);
+
+  fprintf (file, "}\n");
+}
+
+void
+dump_hard_reg_set (FILE *file, HARD_REG_SET set)
+{ /* Externally visible entry point for the static printer above.  */
+  print_hard_reg_set (file, set);
+}
+
+static bool
+in_reg_class_p (unsigned regno, enum reg_class clazz)
+{ /* True iff hard register REGNO is a member of register class CLAZZ.  */
+  return TEST_HARD_REG_BIT (reg_class_contents[clazz], regno);
+}
+
+/* Ask find_rename_reg for a register to replace OP_CHAIN's, with every
+   register outside PREFERRED_CLASS except the current one excluded.  */
+static unsigned
+try_find_best_rename_reg (du_head_p op_chain, reg_class_t preferred_class)
+{
+  const unsigned old_reg = op_chain->regno;
+  HARD_REG_SET excluded;
+  COMPL_HARD_REG_SET (excluded, reg_class_contents[preferred_class]);
+  CLEAR_HARD_REG_BIT (excluded, old_reg);
+  /* Publish the preferred class only for the duration of the search.  */
+  current_preferred_rename_class = preferred_class;
+  unsigned picked = find_rename_reg (op_chain, GENERAL_REGS,
+				     &excluded, old_reg, false);
+  current_preferred_rename_class = NO_REGS;
+  return picked;
+}
+
+/* Try to move operand OP_POS of INSN into PREFERRED_RENAME_CLASS by
+   renaming the first register chain recorded for that operand.
+   Return true if a rename was requested, false if nothing was done.  */
+static bool
+try_rename_operand_to (rtx insn, unsigned op_pos,
+		       reg_class_t preferred_rename_class)
+{
+  insn_rr_info *rr = &insn_rr[INSN_UID (insn)];
+
+  /* Nothing was recorded for this insn or for this operand.  */
+  if (rr->op_info == NULL || rr->op_info[op_pos].n_chains == 0)
+    return false;
+
+  du_head_p chain
+    = regrename_chain_from_id (rr->op_info[op_pos].heads[0]->id);
+
+  if (chain->cannot_rename)
+    return false;
+
+  /* Already use preferred class, so do nothing.  */
+  const unsigned from_reg = chain->regno;
+  if (TEST_HARD_REG_BIT (reg_class_contents[preferred_rename_class],
+			 from_reg))
+    return false;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Try to rename operand %d to %s:\n",
+	       op_pos, reg_class_names[preferred_rename_class]);
+      print_rtl_single (dump_file, insn);
+    }
+
+  /* An unchanged register number is treated as "nothing to do".  */
+  const unsigned to_reg
+    = try_find_best_rename_reg (chain, preferred_rename_class);
+  if (to_reg == from_reg)
+    return false;
+
+  /* NOTE(review): the result of regrename_do_replace, if any, is not
+     checked here -- confirm whether the replacement can fail.  */
+  regrename_do_replace (chain, to_reg);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Rename operand %d to %s is Done:\n",
+	       op_pos, reg_class_names[preferred_rename_class]);
+      print_rtl_single (dump_file, insn);
+    }
+  return true;
+}
+
+/* Return true if renaming operand 0 of INSN to $r15 looks profitable.
+   Assumes INSN's pattern is a single SET whose source has two operands;
+   the caller dispatches on the insn code.  */
+static bool
+rename_slt_profitlable (rtx insn)
+{
+  rtx src = SET_SRC (PATTERN (insn));
+  rtx op0 = XEXP (src, 0);
+  /* The second source operand lives at index 1, not 0.  */
+  rtx op1 = XEXP (src, 1);
+
+  int op_pos = 0;
+  insn_rr_info *info = &insn_rr[INSN_UID (insn)];
+
+  /* No rename information for this insn or for operand 0.  */
+  if (info->op_info == NULL || info->op_info[op_pos].n_chains == 0)
+    return false;
+
+  du_head_p op_chain
+    = regrename_chain_from_id (info->op_info[op_pos].heads[0]->id);
+
+  /* Operand 0 is already in R15_TA_REG.  */
+  if (in_reg_class_p (op_chain->regno, R15_TA_REG))
+    return false;
+
+  /* slt[s]45 needs its first source operand in MIDDLE_REGS.  */
+  if (!REG_P (op0) || !in_reg_class_p (REGNO (op0), MIDDLE_REGS))
+    return false;
+
+  /* The other source operand may be any register or, for slt[s]i45, an
+     unsigned 5-bit immediate.  */
+  if (REG_P (op1)
+      || (CONST_INT_P (op1) && satisfies_constraint_Iu05 (op1)))
+    return true;
+
+  return false;
+}
+
+/* Return true if operand 1 of INSN has a recorded register chain whose
+   register is not in LOW_REGS, i.e. it is not yet where a rename into
+   LOW_REGS would put it.  */
+static bool
+rename_cbranch_eq0_low_reg_profitlable (rtx insn)
+{
+  const int cmp_operand = 1;
+  insn_rr_info *rr = &insn_rr[INSN_UID (insn)];
+
+  /* No rename information for the insn at all.  */
+  if (rr->op_info == NULL)
+    return false;
+
+  /* No register chain recorded for the operand.  */
+  if (rr->op_info[cmp_operand].n_chains == 0)
+    return false;
+
+  du_head_p chain
+    = regrename_chain_from_id (rr->op_info[cmp_operand].heads[0]->id);
+
+  return !in_reg_class_p (chain->regno, LOW_REGS);
+}
+
+
+/* Return true if renaming operand 1 of the branch INSN to $r15 looks
+   profitable: the operand has a recorded chain, is not already in
+   R15_TA_REG, and the register being compared is not in LOW_REGS.
+   Assumes the pattern is a SET whose source's operand 0 is the
+   comparison; the caller dispatches on the insn code.  */
+static bool
+rename_cbranch_eq0_r15_profitlable (rtx insn)
+{
+  rtx cond = XEXP (SET_SRC (PATTERN (insn)), 0);
+  rtx op0 = XEXP (cond, 0);
+
+  const int cmp_operand = 1;
+  insn_rr_info *rr = &insn_rr[INSN_UID (insn)];
+
+  /* No rename information for the insn at all.  */
+  if (rr->op_info == NULL)
+    return false;
+
+  /* No register chain recorded for the operand.  */
+  if (rr->op_info[cmp_operand].n_chains == 0)
+    return false;
+
+  du_head_p chain
+    = regrename_chain_from_id (rr->op_info[cmp_operand].heads[0]->id);
+
+  /* Already in R15_TA_REG.  */
+  if (in_reg_class_p (chain->regno, R15_TA_REG))
+    return false;
+
+  /* LOW_REGS or R15_TA_REG both are 2-byte instruction.  */
+  return !(REG_P (op0) && in_reg_class_p (REGNO (op0), LOW_REGS));
+}
+
+/* Return true if renaming operand 1 of the branch INSN to $r5 looks
+   profitable: the operand has a recorded chain, is not already in
+   R5_REG, and the second comparison operand is a LOW_REGS register.
+   Assumes the pattern is a SET whose source's operand 0 is the
+   comparison; the caller dispatches on the insn code.  */
+static bool
+rename_cbranch_eq_reg_profitlable (rtx insn)
+{
+  rtx cond = XEXP (SET_SRC (PATTERN (insn)), 0);
+  rtx op1 = XEXP (cond, 1);
+
+  const int cmp_operand = 1;
+  insn_rr_info *rr = &insn_rr[INSN_UID (insn)];
+
+  /* No rename information for the insn at all.  */
+  if (rr->op_info == NULL)
+    return false;
+
+  /* No register chain recorded for the operand.  */
+  if (rr->op_info[cmp_operand].n_chains == 0)
+    return false;
+
+  du_head_p chain
+    = regrename_chain_from_id (rr->op_info[cmp_operand].heads[0]->id);
+
+  /* Already in R5_REG.  */
+  if (in_reg_class_p (chain->regno, R5_REG))
+    return false;
+
+  return REG_P (op1) && in_reg_class_p (REGNO (op1), LOW_REGS);
+}
+
+/* Walk every insn of the current function and, for the compare and
+   branch patterns handled below, try to rename one operand into a
+   preferred register class when the matching check says it pays off.  */
+static void
+do_regrename ()
+{
+  basic_block bb;
+  rtx_insn *insn;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    FOR_BB_INSNS (bb, insn)
+      {
+	if (!INSN_P (insn))
+	  continue;
+
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_slts_compare_impl:
+	  case CODE_FOR_slt_compare_impl:
+	  case CODE_FOR_slt_eq0:
+	    /* Try to rename operand 0 to $r15 if profitable.  */
+	    if (rename_slt_profitlable (insn))
+	      try_rename_operand_to (insn, 0, R15_TA_REG);
+	    break;
+	  case CODE_FOR_cbranchsi4_equality_zero:
+	    /* Try $r15 for operand 1 first, then fall back to LOW_REGS.  */
+	    if (rename_cbranch_eq0_r15_profitlable (insn)
+		&& !try_rename_operand_to (insn, 1, R15_TA_REG)
+		&& rename_cbranch_eq0_low_reg_profitlable (insn))
+	      try_rename_operand_to (insn, 1, LOW_REGS);
+	    break;
+	  case CODE_FOR_cbranchsi4_equality_reg:
+	  case CODE_FOR_cbranchsi4_equality_reg_or_const_int:
+	    /* Try to rename operand 1 to $r5.  */
+	    if (rename_cbranch_eq_reg_profitlable (insn))
+	      try_rename_operand_to (insn, 1, R5_REG);
+	    break;
+	  default:
+	    /* Every other insn is left alone.  */
+	    break;
+	  }
+      }
+}
+
+/* Body of the pass: refresh dataflow, run the generic regrename
+   analysis, apply the target renames, and release the analysis data.  */
+static unsigned int
+nds32_regrename (void)
+{
+  df_set_flags (DF_LR_RUN_DCE);
+  df_note_add_problem ();
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  regrename_init (true);
+  regrename_analyze (NULL);
+  do_regrename ();
+  regrename_finish ();
+  /* NOTE(review): the value is handed back from execute (); confirm 1 is
+     the intended TODO flag set.  */
+  return 1;
+}
+
+const pass_data pass_data_nds32_regrename =
+{ /* Fields are positional; each one is named in its trailing comment.  */
+  RTL_PASS,				/* type */
+  "nds32-regrename",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,			/* todo_flags_finish */
+};
+
+class pass_nds32_regrename_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_regrename_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_regrename, ctxt)
+  {}
+  /* opt_pass methods.  The pass is gated on both TARGET_16_BIT and
+     TARGET_REGRENAME_OPT; execute () simply runs nds32_regrename ().  */
+  bool gate (function *) { return TARGET_16_BIT && TARGET_REGRENAME_OPT; }
+  unsigned int execute (function *) { return nds32_regrename (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_regrename_opt (gcc::context *ctxt)
+{ /* Allocate a new instance of the pass for context CTXT.  */
+  return new pass_nds32_regrename_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-relax-opt.c b/gcc/config/nds32/nds32-relax-opt.c
new file mode 100644
index 0000000..0919af6
--- /dev/null
+++ b/gcc/config/nds32/nds32-relax-opt.c
@@ -0,0 +1,612 @@
+/* relax-opt pass of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"   /* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"     /* For DFA state_t.  */
+#include "insn-codes.h"    /* For CODE_FOR_xxx.  */
+#include "reload.h"     /* For push_reload ().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "emit-rtl.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"     /* For add_builtin_function ().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "target-globals.h"
+using namespace nds32;
+
+/* This is used to create unique relax hint id value.
+   The initial value is 0.  */
+static int relax_group_id = 0;
+
+/* Group the following pattern as relax candidates:
+
+   1. sethi	$ra, hi20(sym)
+      ori	$ra, $ra, lo12(sym)
+    ==>
+      addi.gp	$ra, sym
+
+   2. sethi	$ra, hi20(sym)
+      lwi	$rb, [$ra + lo12(sym)]
+    ==>
+      lwi.gp	$rb, [(sym)]
+
+   3. sethi	$ra, hi20(sym)
+      ori	$ra, $ra, lo12(sym)
+      lwi	$rb, [$ra]
+      swi	$rc, [$ra]
+    ==>
+      lwi37	$rb, [(sym)]
+      swi37	$rc, [(sym)] */
+
+/* Return true if INSN is a load or store (going by its "type"
+   attribute) whose memory operand has a plain REG as its address
+   (operand 0).  A ZERO_EXTEND or SIGN_EXTEND around the memory operand
+   is looked through.  The mode of the access is not examined.  */
+static bool
+nds32_reg_base_load_store_p (rtx_insn *insn)
+{
+  rtx operand;
+
+  /* A load has its memory operand in the SET source, a store in the
+     SET destination; no other insn type qualifies.  */
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+      operand = SET_SRC (PATTERN (insn));
+      break;
+    case TYPE_STORE:
+      operand = SET_DEST (PATTERN (insn));
+      break;
+    default:
+      return false;
+    }
+
+  if (operand == NULL_RTX)
+    return false;
+
+  const enum rtx_code outer = GET_CODE (operand);
+  if (outer == ZERO_EXTEND || outer == SIGN_EXTEND)
+    operand = XEXP (operand, 0);
+
+  return GET_CODE (XEXP (operand, 0)) == REG;
+}
+
+/* Return true if INSN is a load or store (going by its "type"
+   attribute) whose address is based on $sp, or on $fp when a frame
+   pointer is needed: either that register alone or a PLUS with it as
+   first operand.  */
+static bool
+nds32_sp_base_or_plus_load_store_p (rtx_insn *insn)
+{
+  rtx operand;
+
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+      operand = SET_SRC (PATTERN (insn));
+      break;
+    case TYPE_STORE:
+      operand = SET_DEST (PATTERN (insn));
+      break;
+    default:
+      return false;
+    }
+
+  if (operand == NULL_RTX)
+    return false;
+
+  if (GET_CODE (operand) == ZERO_EXTEND || GET_CODE (operand) == SIGN_EXTEND)
+    operand = XEXP (operand, 0);
+
+  /* Step into a PLUS address so that its first operand is the base.  */
+  if (GET_CODE (XEXP (operand, 0)) == PLUS)
+    operand = XEXP (operand, 0);
+
+  rtx base = XEXP (operand, 0);
+  if (!REG_P (base))
+    return false;
+  return (REGNO (base) == SP_REGNUM
+	  || (frame_pointer_needed && REGNO (base) == FP_REGNUM));
+}
+
+/* Return true if INSN is a load or store (going by its "type"
+   attribute) whose address is a PLUS with a REG as first operand.
+   The second operand of the PLUS is not examined.  */
+static bool
+nds32_plus_reg_load_store_p (rtx_insn *insn)
+{
+  rtx operand;
+
+  /* A load has its memory operand in the SET source, a store in the
+     SET destination; no other insn type qualifies.  */
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+      operand = SET_SRC (PATTERN (insn));
+      break;
+    case TYPE_STORE:
+      operand = SET_DEST (PATTERN (insn));
+      break;
+    default:
+      return false;
+    }
+
+  if (operand == NULL_RTX)
+    return false;
+
+  /* Look through an extension wrapped around the memory operand.  */
+  if (GET_CODE (operand) == ZERO_EXTEND || GET_CODE (operand) == SIGN_EXTEND)
+    operand = XEXP (operand, 0);
+
+  /* Anything but a PLUS address is rejected outright.  */
+  rtx addr = XEXP (operand, 0);
+  if (GET_CODE (addr) != PLUS)
+    return false;
+
+  /* Only the first operand of the PLUS has to be a register.  */
+  return GET_CODE (XEXP (addr, 0)) == REG;
+}
+
+/* Return true if INSN is recognized as the hwloop_last_insn pattern,
+   i.e. recog_memoized yields CODE_FOR_hwloop_last_insn.  */
+static bool
+nds32_hwloop_last_insn_p (rtx_insn *insn)
+{
+  const int icode = recog_memoized (insn);
+
+  return icode == CODE_FOR_hwloop_last_insn;
+}
+
+/* Return true if X is a CONST whose operand is accepted by
+   nds32_indirect_call_referenced_p, i.e. refers to an ict symbol.  */
+static bool
+nds32_ict_const_p (rtx x)
+{
+  if (GET_CODE (x) != CONST)
+    return FALSE;
+
+  /* Test what the CONST wraps.  */
+  return nds32_indirect_call_referenced_p (XEXP (x, 0));
+}
+
+/* Group the following pattern as relax candidates:
+
+   GOT:
+      sethi	$ra, hi20(sym)
+      ori	$ra, $ra, lo12(sym)
+      lw	$rb, [$ra + $gp]
+
+   GOTOFF, TLSLE:
+      sethi	$ra, hi20(sym)
+      ori	$ra, $ra, lo12(sym)
+      LS	$rb, [$ra + $gp]
+
+   GOTOFF, TLSLE:
+      sethi	$ra, hi20(sym)
+      ori	$ra, $ra, lo12(sym)
+      add	$rb, $ra, $gp($tp)
+
+   Initial GOT table:
+      sethi	$gp,hi20(sym)
+      ori	$gp, $gp, lo12(sym)
+      add5.pc	$gp  */
+
+static auto_vec<rtx_insn *, 32> nds32_group_infos;
+/* Group the PIC and TLS relax candidate instructions for linker.
+   Every use reached from DEF_INSN's first def is checked and, if
+   acceptable, pushed onto nds32_group_infos.  Return FALSE as soon as
+   one use cannot be grouped (earlier pushes are kept), else TRUE.  */
+static bool
+nds32_pic_tls_group (rtx_insn *def_insn,
+		     enum nds32_relax_insn_type relax_type,
+		     int sym_type)
+{
+  df_ref def_record;
+  df_link *link;
+  rtx_insn *use_insn = NULL;
+  rtx pat, new_pat;
+  def_record = DF_INSN_DEFS (def_insn);
+  for (link = DF_REF_CHAIN (def_record); link; link = link->next)
+    {
+      if (!DF_REF_INSN_INFO (link->ref))
+	continue;
+      use_insn = DF_REF_INSN (link->ref);
+
+      /* Give up unless DEF_INSN's block dominates the use's block.  */
+      if (!dominated_by_p (CDI_DOMINATORS,
+			   BLOCK_FOR_INSN (use_insn),
+			   BLOCK_FOR_INSN (def_insn)))
+	return FALSE;
+
+      /* Likewise give up if the use is not an active insn.  */
+      if (!active_insn_p (use_insn))
+	return FALSE;
+
+      switch (relax_type)
+	{
+	case RELAX_ORI:
+	  /* GOTOFF, TLSLE:
+	     sethi	$ra, hi20(sym)
+	     ori	$ra, $ra, lo12(sym)
+	     add	$rb, $ra, $gp($tp)  */
+	  if ((sym_type == UNSPEC_TLSLE
+	       || sym_type == UNSPEC_GOTOFF)
+	      && (recog_memoized (use_insn) == CODE_FOR_addsi3))
+	    {
+	      pat = XEXP (PATTERN (use_insn), 1);
+	      new_pat =
+		gen_rtx_UNSPEC (SImode,
+				gen_rtvec (2, XEXP (pat, 0), XEXP (pat, 1)),
+				UNSPEC_ADD32);
+	      validate_replace_rtx (pat, new_pat, use_insn);
+	      nds32_group_infos.safe_push (use_insn);
+	    }
+	  else if (nds32_plus_reg_load_store_p (use_insn)
+		   && !nds32_sp_base_or_plus_load_store_p (use_insn))
+	    nds32_group_infos.safe_push (use_insn);
+	  else
+	    return FALSE;
+	  break;
+
+	default:
+	  return FALSE;
+	}
+    }
+  return TRUE;
+}
+
+/* Return integer field 1 (presumably the UNSPEC number) of operand 1 of
+   the source of insn X's pattern, after looking through an optional
+   CONST and, inside it, the first operand of a PLUS.  */
+static int
+nds32_pic_tls_symbol_type (rtx x)
+{
+  rtx sym = XEXP (SET_SRC (PATTERN (x)), 1);
+
+  if (GET_CODE (sym) == CONST)
+    {
+      sym = XEXP (sym, 0);
+      if (GET_CODE (sym) == PLUS)
+	sym = XEXP (sym, 0);
+    }
+
+  return XINT (sym, 1);
+}
+
+/* Group the relax candidates with group id.  SETHI is the high-part
+   insn; if every use of its first def is acceptable, a relax_group
+   insn carrying the current relax_group_id is emitted before SETHI and
+   before its uses, and the id is then incremented.  */
+static void
+nds32_group_insns (rtx sethi)
+{
+  df_ref def_record, use_record;
+  df_link *link;
+  rtx_insn *use_insn = NULL;
+  rtx group_id;
+  bool valid;
+
+  def_record = DF_INSN_DEFS (sethi);
+  for (link = DF_REF_CHAIN (def_record); link; link = link->next)
+    {
+      if (!DF_REF_INSN_INFO (link->ref))
+	continue;
+      use_insn = DF_REF_INSN (link->ref);
+
+      /* Give up unless SETHI's block dominates the use's block.  */
+      if (!dominated_by_p (CDI_DOMINATORS,
+			   BLOCK_FOR_INSN (use_insn),
+			   BLOCK_FOR_INSN (sethi)))
+	return;
+
+      /* Give up if the low-part used register is from different high-part
+	 instructions.  */
+      use_record = DF_INSN_USES (use_insn);
+      if (DF_REF_CHAIN (use_record) && DF_REF_CHAIN (use_record)->next)
+	return;
+
+      /* Give up if the use is not an active insn.  */
+      if (!active_insn_p (use_insn))
+	return;
+
+      /* Give up unless the use is one of the supported kinds.  */
+      if (!(recog_memoized (use_insn) == CODE_FOR_lo_sum
+	    || nds32_symbol_load_store_p (use_insn)
+	    || (nds32_reg_base_load_store_p (use_insn)
+		&&!nds32_sp_base_or_plus_load_store_p (use_insn))))
+	return;
+    }
+
+  group_id = GEN_INT (relax_group_id);
+  /* Insert .relax_* directive for sethi.  */
+  emit_insn_before (gen_relax_group (group_id), sethi);
+
+  /* Scan the use insns and insert the directive.  */
+  for (link = DF_REF_CHAIN (def_record); link; link = link->next)
+    {
+      if (!DF_REF_INSN_INFO (link->ref))
+	continue;
+      use_insn = DF_REF_INSN (link->ref);
+
+      /* Insert .relax_* directive.  */
+      if (active_insn_p (use_insn))
+	emit_insn_before (gen_relax_group (group_id), use_insn);
+
+      /* Find ori ra, ra, unspec(symbol) instruction.  */
+      if (use_insn != NULL
+	  && recog_memoized (use_insn) == CODE_FOR_lo_sum
+	  && !nds32_const_unspec_p (XEXP (SET_SRC (PATTERN (use_insn)), 1)))
+	{
+	  int sym_type = nds32_pic_tls_symbol_type (use_insn);
+	  valid = nds32_pic_tls_group (use_insn, RELAX_ORI, sym_type);
+
+	  /* Drain the collected insns; tag them only if grouping held.  */
+	  while (!nds32_group_infos.is_empty ())
+	    {
+	      use_insn = nds32_group_infos.pop ();
+	      if (valid)
+		emit_insn_before (gen_relax_group (group_id), use_insn);
+	    }
+	}
+    }
+
+  relax_group_id++;
+}
+
+/* Store the current relax group id into the TLS insn INSN and advance
+   the id.  Element 1 of INSN's pattern vector must have, as operand 0,
+   the relax-group marker: an UNSPEC numbered
+   UNSPEC_VOLATILE_RELAX_GROUP, whose vector element 0 receives the id.
+   If the marker is something else the insn is left alone, but the id
+   is still consumed.  */
+
+static void
+nds32_group_tls_insn (rtx insn)
+{
+  rtx relax_marker = XEXP (XVECEXP (PATTERN (insn), 0, 1), 0);
+
+  /* NOTE(review): the test is for rtx code UNSPEC although the number is
+     named UNSPEC_VOLATILE_...; confirm against the insn pattern.  */
+  if (GET_CODE (relax_marker) == UNSPEC
+      && XINT (relax_marker, 1) == UNSPEC_VOLATILE_RELAX_GROUP)
+    XVECEXP (relax_marker, 0, 0) = GEN_INT (relax_group_id);
+
+  /* Every TLS insn gets a group id of its own.  */
+  relax_group_id++;
+}
+
+/* Return true if INSN is a TYPE_FLOAD insn that sets an SFmode or DFmode
+   destination from memory addressed as [reg] or [reg + const_int].
+   Despite the name, only the load type is accepted.  */
+static bool
+nds32_float_reg_load_store_p (rtx_insn *insn)
+{
+  rtx pat = PATTERN (insn);
+
+  if (get_attr_type (insn) != TYPE_FLOAD || GET_CODE (pat) != SET)
+    return false;
+  rtx dest = SET_DEST (pat);
+  rtx src = SET_SRC (pat);
+  if (GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
+    return false;
+  if (!MEM_P (src))
+    return false;
+
+  /* Accept [$ra] and [$ra + offset].  */
+  rtx addr = XEXP (src, 0);
+  return (REG_P (addr)
+	  || (GET_CODE (addr) == PLUS
+	      && REG_P (XEXP (addr, 0))
+	      && CONST_INT_P (XEXP (addr, 1))));
+}
+
+
+/* Group float load-store instructions:
+   la $ra, symbol
+   flsi $rt, [$ra + offset] */
+
+static void
+nds32_group_float_insns (rtx insn)
+{
+  df_ref def = DF_INSN_DEFS (insn);
+  df_link *chain;
+
+  /* Pass 1: check every use reached by this definition.  One unsuitable
+     use abandons the whole group, and nothing has been emitted by
+     then.  */
+  for (chain = DF_REF_CHAIN (def); chain; chain = chain->next)
+    {
+      if (!DF_REF_INSN_INFO (chain->ref))
+	continue;
+
+      rtx_insn *user = DF_REF_INSN (chain->ref);
+
+      /* Give up unless the block of INSN dominates the block of the use.  */
+      if (!dominated_by_p (CDI_DOMINATORS, BLOCK_FOR_INSN (user),
+			   BLOCK_FOR_INSN (insn)))
+	return;
+
+      /* Give up if the use record examined here has more than one entry
+	 in its chain, i.e. more than one instruction may feed it.  */
+      df_ref first_use = DF_INSN_USES (user);
+      if (DF_REF_CHAIN (first_use) && DF_REF_CHAIN (first_use)->next)
+	return;
+
+      /* Give up if the use is not an active insn.  */
+      if (!active_insn_p (user))
+	return;
+
+      /* Give up unless the use is a float load through a register
+	 address that has no post-update form.  */
+      if (!nds32_float_reg_load_store_p (user)
+	  || find_post_update_rtx (user) != -1)
+	return;
+    }
+
+  /* Pass 2: tag INSN and each of its uses with the same group id.  */
+  rtx group_id = GEN_INT (relax_group_id);
+
+  /* Insert .relax_* directive for insn.  */
+  emit_insn_before (gen_relax_group (group_id), insn);
+
+  /* Insert .relax_* directive for every use insn.  */
+  for (chain = DF_REF_CHAIN (def); chain; chain = chain->next)
+    {
+      if (!DF_REF_INSN_INFO (chain->ref))
+	continue;
+
+      emit_insn_before (gen_relax_group (group_id),
+			DF_REF_INSN (chain->ref));
+    }
+
+  relax_group_id++;
+}
+
+/* Group the relaxation candidate instructions for the linker.  */
+static void
+nds32_relax_group (void)
+{
+  rtx_insn *insn;
+
+  compute_bb_for_insn ();
+  /* Build DU/UD chains and dominance info before scanning the insns.  */
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_insn_rescan_all ();
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  calculate_dominance_info (CDI_DOMINATORS);
+
+  insn = get_insns ();
+  gcc_assert (NOTE_P (insn));
+  /* Dispatch each active insn to the grouping helper for its pattern.  */
+  for (insn = next_active_insn (insn); insn; insn = next_active_insn (insn))
+    {
+      if (NONJUMP_INSN_P (insn))
+	{
+	  /* Find the "sethi ra, symbol" instruction.  */
+	  if (recog_memoized (insn) == CODE_FOR_sethi
+	      && nds32_symbolic_operand (XEXP (SET_SRC (PATTERN (insn)), 0),
+					 SImode)
+	      && !nds32_ict_const_p (XEXP (SET_SRC (PATTERN (insn)), 0))
+	      && !nds32_hwloop_last_insn_p (next_active_insn (insn)))
+
+	    nds32_group_insns (insn);
+	  else if (recog_memoized (insn) == CODE_FOR_tls_ie)
+	    nds32_group_tls_insn (insn);
+	  else if (TARGET_FPU_SINGLE
+		   && recog_memoized (insn) == CODE_FOR_move_addr
+		   && !nds32_ict_const_p (XEXP (SET_SRC (PATTERN (insn)), 0))
+		   && !nds32_hwloop_last_insn_p (next_active_insn (insn)))
+	    {
+	      nds32_group_float_insns (insn);
+	    }
+	}
+      else if (CALL_P (insn) && recog_memoized (insn) == CODE_FOR_tls_desc)
+	{
+	  nds32_group_tls_insn (insn);
+	}
+    }
+
+  /* We must call df_finish_pass manually because it should be invoked before
+     BB information is destroyed.  NOTE(review): pass_data_nds32_relax_opt
+     nevertheless sets TODO_df_finish as well -- confirm that is intended.  */
+  df_insn_rescan_all ();
+  df_finish_pass (false);
+  free_dominance_info (CDI_DOMINATORS);
+}
+
+static unsigned int
+nds32_relax_opt (void)
+{
+  if (TARGET_RELAX_HINT)
+    nds32_relax_group ();
+  return 1;
+}
+
+const pass_data pass_data_nds32_relax_opt =
+{
+  RTL_PASS,				/* type */
+  "relax_opt",				/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_df_finish,			/* todo_flags_finish */
+};
+
+class pass_nds32_relax_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_relax_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_relax_opt, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) { return TARGET_RELAX_HINT; }
+  unsigned int execute (function *) { return nds32_relax_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_relax_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_relax_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-scalbn-transform.c b/gcc/config/nds32/nds32-scalbn-transform.c
new file mode 100644
index 0000000..fba7c6f
--- /dev/null
+++ b/gcc/config/nds32/nds32-scalbn-transform.c
@@ -0,0 +1,364 @@
+/* A Gimple-level pass of Andes NDS32 cpu for GNU compiler.
+   This pass transforms the multiplications whose multiplier is a
+   power of 2.
+
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"   /* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"     /* For DFA state_t.  */
+#include "insn-codes.h"    /* For CODE_FOR_xxx.  */
+#include "reload.h"     /* For push_reload ().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"     /* For add_builtin_function ().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "tree-ssa-alias.h"
+#include "fold-const.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-pass.h"
+#include "gimple-pretty-print.h"
+#include "gimple-fold.h"
+
+
+/* Return true if the current function name is scalbn/scalbnf, or its alias
+   includes scalbn/scalbnf, otherwise return false.  */
+
+static bool
+nds32_is_scalbn_alias_func_p (void)
+{
+  const char *self_name = function_name (cfun);
+  if (strcmp (self_name, "scalbn") == 0
+      || strcmp (self_name, "scalbnf") == 0)
+    return true;
+
+  /* Otherwise look for an alias of this function carrying either name.  */
+  struct cgraph_node *node = cgraph_node::get (current_function_decl);
+  if (node == NULL)
+    return false;
+
+  struct ipa_ref *ref;
+  for (int idx = 0; node->iterate_referring (idx, ref); idx++)
+    {
+      if (ref->use != IPA_REF_ALIAS)
+	continue;
+      struct cgraph_node *alias = dyn_cast <cgraph_node *> (ref->referring);
+      const char *alias_name = alias->asm_name ();
+      if (strcmp (alias_name, "scalbn") == 0
+	  || strcmp (alias_name, "scalbnf") == 0)
+	return true;
+    }
+
+  return false;
+}
+
+/* Return true if value of tree node RT is power of 2.  */
+
+static bool
+nds32_real_ispow2_p (tree rt)
+{
+  /* Only a real constant of class rvc_normal can qualify.  */
+  if (TREE_CODE (rt) != REAL_CST
+      || TREE_REAL_CST_PTR (rt)->cl != rvc_normal)
+    return false;
+
+  const REAL_VALUE_TYPE *val = TREE_REAL_CST_PTR (rt);
+
+  /* Every significand word below the top one must be zero ...  */
+  for (int word = 0; word < SIGSZ - 1; word++)
+    if (val->sig[word] != 0)
+      return false;
+
+  /* ... and the top word must hold exactly SIG_MSB.  */
+  return val->sig[SIGSZ - 1] == SIG_MSB;
+}
+
+/* Return the exponent of tree node RT in base 2.  */
+
+static int
+nds32_real_pow2exp (tree rt)
+{
+  return REAL_EXP (TREE_REAL_CST_PTR (rt)) - 1;
+}
+
+/* Return true if GS is the target of scalbn transform.  */
+
+static bool
+nds32_scalbn_transform_target_p (gimple *gs)
+{
+  /* Wanted: an assignment that multiplies a REAL_TYPE operand by a
+     constant accepted by nds32_real_ispow2_p.  */
+  return (is_gimple_assign (gs)
+	  && gimple_assign_rhs_code (gs) == MULT_EXPR
+	  && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (gs))) == REAL_TYPE
+	  && nds32_real_ispow2_p (gimple_assign_rhs2 (gs)));
+}
+
+/* Do scalbn transform for a GIMPLE statement GS.
+
+   When the multiplier of GIMPLE statement GS is a positive number,
+   GS will be transformed into one gimple_call statement and one
+   gimple_assign statement as follows:
+   A = B * 128.0	-> temp = BUILT_IN_SCALBN (B, 7)
+			   A = temp
+
+   When the multiplier is a negative number, its sign is flipped
+   first, since BUILT_IN_SCALBN cannot express a negative
+   multiplier, and the result is negated afterwards:
+   A = B * -128.0	-> temp = BUILT_IN_SCALBN (B, 7)
+			   A = -temp
+*/
+
+static void
+nds32_do_scalbn_transform (gimple *gs)
+{
+  tree mult_cand = gimple_assign_rhs1 (gs);	/* Multiplicand  */
+  tree mult_er = gimple_assign_rhs2 (gs);	/* Multiplier  */
+  bool is_neg = false;
+
+  /* Choose the function by type of arg.  */
+  enum built_in_function fn_name;
+  tree type = TREE_TYPE (mult_cand);
+  if (TYPE_MAIN_VARIANT (type) == double_type_node)
+    fn_name = BUILT_IN_SCALBN;
+  else if (TYPE_MAIN_VARIANT (type) == float_type_node)
+    fn_name = BUILT_IN_SCALBNF;
+  /* Do not transform long double to scalbnl since some c library don't provide
+     it if target don't have real long double type
+  else if (TYPE_MAIN_VARIANT (type) == long_double_type_node)
+    fn_name = BUILT_IN_SCALBNL;
+  */
+  else
+    return;
+
+  /* Converse the sign of negative number.  */
+  if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (mult_er)))
+    {
+      is_neg = true;
+      mult_er = build_real (TREE_TYPE (mult_er),
+			    real_value_negate (&TREE_REAL_CST (mult_er)));
+    }
+
+  /* Set function name for building gimple_call.  */
+  tree fndecl = builtin_decl_explicit (fn_name);
+
+  /* Set last arg for building gimple_call.  */
+  tree exp = build_int_cst (integer_type_node,
+			    nds32_real_pow2exp (mult_er));
+
+  /* Build a new temp ssa.  */
+  tree temp_call_ssa = make_ssa_name (TREE_TYPE (gimple_assign_lhs (gs)), NULL);
+
+  /* Build gimple_call stmt to replace GS.  */
+  gimple *call_stmt = gimple_build_call (fndecl,
+					 2,
+					 mult_cand,
+					 exp);
+  gimple_call_set_lhs (call_stmt, temp_call_ssa);
+
+  enum tree_code subcode = NOP_EXPR;
+  /* Handle negative value.  */
+  if (is_neg)
+    subcode = NEGATE_EXPR;
+
+  /* Build gimple_assign for return value or change the sign.  */
+  gimple *assign_stmt =
+    gimple_build_assign (gimple_assign_lhs (gs),
+			 subcode,
+			 gimple_call_lhs (call_stmt));
+
+  /* Replace gimple_assign GS by new gimple_call.  */
+  gimple_stmt_iterator gsi = gsi_for_stmt (gs);
+  update_stmt (call_stmt);
+  gsi_insert_before (&gsi, call_stmt, GSI_NEW_STMT);
+
+  /* Insert the gimple_assign after the scalbn call.  */
+  update_stmt (assign_stmt);
+  gsi_next (&gsi);
+  gsi_replace (&gsi, assign_stmt, false);
+}
+
+/* Do scalbn transform for each basic block BB.  */
+
+static int
+nds32_scalbn_transform_basic_block (basic_block bb)
+{
+  int rewritten = 0;
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "\n;; Transforming the multiplication for basic block %d\n",
+	     bb->index);
+
+  for (gimple_stmt_iterator it = gsi_start_bb (bb); !gsi_end_p (it);
+       gsi_next (&it))
+    {
+      gimple *cur = gsi_stmt (it);
+      if (!nds32_scalbn_transform_target_p (cur))
+	continue;
+
+      /* Show the statement in the dump file before it is rewritten.  */
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "* The multiplier of stmt %d is transforming.\n",
+		   gimple_uid (cur));
+	  print_gimple_stmt (dump_file, cur, 0, TDF_SLIM | TDF_RAW);
+	}
+      nds32_do_scalbn_transform (cur);
+      rewritten++;
+    }
+
+  return rewritten;
+}
+
+/* This function is the entry of scalbn transform pass.  */
+
+static int
+nds32_scalbn_transform_opt (void)
+{
+  /* Never rewrite scalbn/scalbnf themselves, nor a function aliased to
+     them: the multiplication would become a call to the very function
+     being compiled.  */
+  if (nds32_is_scalbn_alias_func_p ())
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "* Ignore function %s. "
+		 "Transform it will cause infinite loop.\n",
+		 function_name (cfun));
+      return 0;
+    }
+
+  /* Run the per-block rewrite and add up how many statements changed.  */
+  int rewritten = 0;
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, cfun)
+    rewritten += nds32_scalbn_transform_basic_block (bb);
+
+  if (!dump_file)
+    return 1;
+
+  if (rewritten > 0)
+    fprintf (dump_file,
+	     "\n;; Transform %d multiplication stmt in function %s\n",
+	     rewritten,
+	     current_function_name ());
+  else
+    fprintf (dump_file,
+	     "\n;; No multiplication stmt is transformed in function %s\n",
+	     current_function_name ());
+
+  return 1;
+}
+
+static bool
+gate_nds32_scalbn_transform (void)
+{
+  return flag_nds32_scalbn_transform
+    && !TARGET_FPU_SINGLE
+    && !flag_no_builtin;
+}
+
+const pass_data pass_data_nds32_scalbn_transform_opt =
+{
+  GIMPLE_PASS,				/* type */
+  "scalbn_transform",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  ( PROP_cfg | PROP_ssa ),		/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_update_ssa,			/* todo_flags_finish */
+};
+
+class pass_nds32_scalbn_transform_opt : public gimple_opt_pass
+{
+public:
+  pass_nds32_scalbn_transform_opt (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_nds32_scalbn_transform_opt, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) { return gate_nds32_scalbn_transform (); }
+  unsigned int execute (function *) { return nds32_scalbn_transform_opt (); }
+};
+
+gimple_opt_pass *
+make_pass_nds32_scalbn_transform_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_scalbn_transform_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-sign-conversion.c b/gcc/config/nds32/nds32-sign-conversion.c
new file mode 100644
index 0000000..74eefba
--- /dev/null
+++ b/gcc/config/nds32/nds32-sign-conversion.c
@@ -0,0 +1,218 @@
+/* A Gimple-level pass of Andes NDS32 cpu for GNU compiler that
+   converse the sign of constant operand when the FPU do not be
+   accessed.
+
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"   /* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"     /* For DFA state_t.  */
+#include "insn-codes.h"    /* For CODE_FOR_xxx.  */
+#include "reload.h"     /* For push_reload ().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "bitmap.h"
+#include "df.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"     /* For add_builtin_function ().  */
+#include "ggc.h"
+#include "tree-pass.h"
+#include "tree-ssa-alias.h"
+#include "fold-const.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "tree-cfg.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-pass.h"
+#include "gimple-pretty-print.h"
+#include "gimple-fold.h"
+
+/* Return true if GS is the target of sign conversion.  */
+
+static bool
+nds32_sign_conversion_target_p (gimple *gs)
+{
+  /* Wanted: an assignment that subtracts a REAL_CST second operand.  */
+  if (!is_gimple_assign (gs) || gimple_assign_rhs_code (gs) != MINUS_EXPR)
+    return false;
+
+  return TREE_CODE (gimple_assign_rhs2 (gs)) == REAL_CST;
+}
+
+/* Do sign conversion for a GIMPLE statement GS.  */
+
+static void
+nds32_do_sign_conversion (gimple *gs)
+{
+  /* Rewrite "A - C" as "A + (-C)": switch the operation to addition ...  */
+  gimple_assign_set_rhs_code (gs, PLUS_EXPR);
+
+  /* ... then replace the constant by its negation, keeping its type.  */
+  tree old_cst = gimple_assign_rhs2 (gs);
+  tree neg_cst = build_real (TREE_TYPE (old_cst),
+			     real_value_negate (&TREE_REAL_CST (old_cst)));
+  gimple_assign_set_rhs2 (gs, neg_cst);
+
+  /* The statement was edited in place, so mark it as modified.  */
+  update_stmt (gs);
+}
+
+/* Do sign conversion for each basic block BB.  */
+
+static int
+nds32_sign_conversion_basic_block (basic_block bb)
+{
+  int converted = 0;
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "\n;; Conversing the sign of gimple stmts for basic block %d\n",
+	     bb->index);
+
+  for (gimple_stmt_iterator it = gsi_start_bb (bb); !gsi_end_p (it);
+       gsi_next (&it))
+    {
+      gimple *cur = gsi_stmt (it);
+      if (!nds32_sign_conversion_target_p (cur))
+	continue;
+
+      /* Show the statement in the dump file before it is rewritten.  */
+      if (dump_file)
+	{
+	  fprintf (dump_file, "* The sign of stmt %d is conversing.\n",
+		   gimple_uid (cur));
+	  print_gimple_stmt (dump_file, cur, 0, TDF_SLIM | TDF_RAW);
+	}
+      nds32_do_sign_conversion (cur);
+      converted++;
+    }
+
+  return converted;
+}
+
+/* This function is the entry of sign conversion pass.  */
+
+static int
+nds32_sign_conversion_opt (void)
+{
+  int converted = 0;
+  basic_block bb;
+
+  /* Run the per-block conversion and add up how many statements changed.  */
+  FOR_EACH_BB_FN (bb, cfun)
+    converted += nds32_sign_conversion_basic_block (bb);
+
+  /* Without a dump file there is nothing left to report.  */
+  if (!dump_file)
+    return 1;
+
+  if (converted > 0)
+    fprintf (dump_file, "\n;; Converse %d stmts in function %s\n",
+	     converted,
+	     current_function_name ());
+  else
+    fprintf (dump_file,
+	     "\n;; No sign of stmt is conversed in function %s\n",
+	     current_function_name ());
+
+  return 1;
+}
+
+const pass_data pass_data_nds32_sign_conversion_opt =
+{
+  GIMPLE_PASS,				/* type */
+  "sign_conversion",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  ( PROP_cfg | PROP_ssa ),		/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_update_ssa,			/* todo_flags_finish */
+};
+
+class pass_nds32_sign_conversion_opt : public gimple_opt_pass
+{
+public:
+  pass_nds32_sign_conversion_opt (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_nds32_sign_conversion_opt, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *)
+  {
+    return flag_nds32_sign_conversion && !TARGET_FPU_SINGLE;
+  }
+  unsigned int execute (function *) { return nds32_sign_conversion_opt (); }
+};
+
+gimple_opt_pass *
+make_pass_nds32_sign_conversion_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_sign_conversion_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-soft-fp-comm.c b/gcc/config/nds32/nds32-soft-fp-comm.c
new file mode 100644
index 0000000..98ba3d5
--- /dev/null
+++ b/gcc/config/nds32/nds32-soft-fp-comm.c
@@ -0,0 +1,205 @@
+/* Operand commutative for soft floating point arithmetic pass
+   of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "rtl.h"
+#include "df.h"
+#include "alias.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
+#include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
+#include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+
+#define SF_ARG0_REGNO 0
+#define SF_ARG1_REGNO 1
+
+#define DF_ARG0_REGNO 0
+#define DF_ARG1_REGNO 2
+/* Swap the two argument-register sets of soft-float add/mul libcalls.  */
+static int
+nds32_soft_fp_arith_comm_opt (void)
+{
+  basic_block bb;
+  rtx_insn *insn;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!CALL_P (insn))
+	    continue;
+	  /* Find the callee: element 0 of the pattern, or its SET_SRC.  */
+	  rtx pat = PATTERN (insn);
+	  rtx call_rtx = XVECEXP (pat, 0, 0);
+	  /* NOTE(review): the XVECEXP above assumes a PARALLEL pattern.  */
+	  if (GET_CODE (call_rtx) == SET)
+	    call_rtx = SET_SRC (call_rtx);
+
+	  rtx func_mem = XEXP (call_rtx, 0);
+	  rtx symbol = XEXP (func_mem, 0);
+
+	  if (GET_CODE (symbol) != SYMBOL_REF)
+	    continue;
+	  /* Only the SF/DF add and multiply libcalls are of interest.  */
+	  const char *func_name = XSTR (symbol, 0);
+	  bool df_p;
+	  if (((strcmp("__mulsf3", func_name) == 0)
+	       || (strcmp("__addsf3", func_name) == 0)))
+	    df_p = false;
+	  else if (((strcmp("__muldf3", func_name) == 0)
+		   || (strcmp("__adddf3", func_name) == 0)))
+	    df_p = true;
+	  else
+	    continue;
+	  /* Walk backwards looking for the sets of the two argument regs.  */
+	  rtx_insn *prev_insn = insn;
+	  rtx_insn *arg0_insn = NULL;
+	  rtx_insn *arg1_insn = NULL;
+	  unsigned arg0_regno = df_p ? DF_ARG0_REGNO : SF_ARG0_REGNO;
+	  unsigned arg1_regno = df_p ? DF_ARG1_REGNO : SF_ARG1_REGNO;
+	  enum machine_mode mode = df_p ? DFmode : SFmode;
+	  while ((prev_insn = PREV_INSN (prev_insn)) && prev_insn)
+	    {
+	      if (arg0_insn != NULL && arg1_insn != NULL)
+		break;
+
+	      if (BLOCK_FOR_INSN (prev_insn) != BLOCK_FOR_INSN (insn))
+		break;
+
+	      if (!NONJUMP_INSN_P (prev_insn))
+		break;
+	      /* NOTE(review): looks unreachable after the check above.  */
+	      if (!INSN_P (prev_insn))
+		continue;
+
+	      rtx set = PATTERN (prev_insn);
+
+	      if (GET_CODE (set) != SET)
+		continue;
+
+	      rtx dst_reg = SET_DEST (set);
+
+	      if (!REG_P (dst_reg))
+		break;
+
+	      unsigned regno = REGNO (dst_reg);
+	      /* NOTE(review): repeated sets overwrite the earlier match.  */
+	      if (regno == arg0_regno)
+		{
+		  arg0_insn = prev_insn;
+		  continue;
+		}
+	      else if (regno == arg1_regno)
+		{
+		  arg1_insn = prev_insn;
+		  continue;
+		}
+	      break;
+	    }
+	  if (arg0_insn == NULL || arg1_insn == NULL)
+	   continue;
+	  /* Swap only if one argument copies the other argument reg.  */
+	  rtx arg0_src = SET_SRC (PATTERN (arg0_insn));
+	  rtx arg1_src = SET_SRC (PATTERN (arg1_insn));
+	  /* NOTE(review): relative order of the two sets is not checked.  */
+	  if ((REG_P (arg0_src)
+	       && GET_MODE (arg0_src) == mode
+	       && REGNO (arg0_src) == arg1_regno)
+	      || (REG_P (arg1_src)
+		  && GET_MODE (arg1_src) == mode
+		  && REGNO (arg1_src) == arg0_regno))
+	    {
+	      /* Exchange the destinations of the two sets.  */
+	      rtx tmp = SET_DEST (PATTERN (arg0_insn));
+	      SET_DEST (PATTERN (arg0_insn)) = SET_DEST (PATTERN (arg1_insn));
+	      SET_DEST (PATTERN (arg1_insn)) = tmp;
+	    }
+	}
+    }
+  return 1;
+}
+
+const pass_data pass_data_nds32_soft_fp_arith_comm_opt =
+{
+  RTL_PASS,				/* type */
+  "soft_fp_arith_comm",			/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  TV_MACH_DEP,				/* tv_id */
+  0,					/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  0,					/* todo_flags_finish */
+};
+
+class pass_nds32_soft_fp_arith_comm_opt : public rtl_opt_pass
+{
+public:
+  pass_nds32_soft_fp_arith_comm_opt (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_nds32_soft_fp_arith_comm_opt, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) {
+    return TARGET_SOFT_FP_ARITH_COMM && !TARGET_FPU_SINGLE;
+  }
+  unsigned int execute (function *) { return nds32_soft_fp_arith_comm_opt (); }
+};
+
+rtl_opt_pass *
+make_pass_nds32_soft_fp_arith_comm_opt (gcc::context *ctxt)
+{
+  return new pass_nds32_soft_fp_arith_comm_opt (ctxt);
+}
diff --git a/gcc/config/nds32/nds32-utils.c b/gcc/config/nds32/nds32-utils.c
new file mode 100644
index 0000000..3b16738
--- /dev/null
+++ b/gcc/config/nds32/nds32-utils.c
@@ -0,0 +1,923 @@
+/* Auxiliary functions for pipeline descriptions pattern of Andes
+   NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* ------------------------------------------------------------------------ */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "calls.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
+#include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "input.h"
+#include "function.h"
+#include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "nds32-protos.h"
+
+namespace nds32 {
+
+/* Get the rtx in the PATTERN field of an insn.  If INSN is not an insn,
+   the funciton doesn't change anything and returns it directly.  */
+rtx
+extract_pattern_from_insn (rtx insn)
+{
+  if (INSN_P (insn))
+    return PATTERN (insn);
+
+  return insn;
+}
+
+/* Get the number of elements in a parallel rtx.  */
+size_t
+parallel_elements (rtx parallel_rtx)
+{
+  parallel_rtx = extract_pattern_from_insn (parallel_rtx);
+  gcc_assert (GET_CODE (parallel_rtx) == PARALLEL);
+
+  return XVECLEN (parallel_rtx, 0);
+}
+
+/* Extract an rtx from a parallel rtx with index NTH.  If NTH is a negative
+   value, the function returns the last NTH rtx.  */
+rtx
+parallel_element (rtx parallel_rtx, int nth)
+{
+  /* Accept either an insn or a bare pattern; both must yield a
+     PARALLEL.  */
+  parallel_rtx = extract_pattern_from_insn (parallel_rtx);
+  gcc_assert (GET_CODE (parallel_rtx) == PARALLEL);
+
+  int len = parallel_elements (parallel_rtx);
+
+  /* A negative NTH is an offset from the end of the vector; a
+     non-negative NTH is already a plain index.  */
+  int pos = nth;
+  if (nth < 0)
+    pos = len + nth;
+
+  /* Positions that fall outside the vector, on either side, yield
+     no element.  */
+  if (pos < 0 || pos >= len)
+    return NULL_RTX;
+
+  return XVECEXP (parallel_rtx, 0, pos);
+}
+
+/* Return true if an insn is a pseudo NOP that is not a real instruction
+   occupying a real cycle and space of the text section.  */
+bool
+insn_pseudo_nop_p (rtx_insn *insn)
+{
+  /* A pseudo NOP is identified purely by its insn code.  */
+  int icode = INSN_CODE (insn);
+
+  return (icode == CODE_FOR_nop_data_dep
+	  || icode == CODE_FOR_nop_res_dep);
+}
+
+/* Indicate whether an insn is a real insn which occupy at least one cycle
+   or not.  The determination cannot be target-independent because some targets
+   use UNSPEC or UNSPEC_VOLATILE insns to represent real instructions.  */
+bool
+insn_executable_p (rtx_insn *insn)
+{
+  if (!INSN_P (insn))
+    return false;
+
+  if (insn_pseudo_nop_p (insn))
+    return true;
+
+  if (get_attr_length (insn) == 0)
+    return false;
+
+  switch (GET_CODE (PATTERN (insn)))
+    {
+    case CONST_INT:
+    case USE:
+    case CLOBBER:
+    case ADDR_VEC:
+    case ADDR_DIFF_VEC:
+    case UNSPEC:
+    case UNSPEC_VOLATILE:
+      return false;
+
+    default:
+      return true;
+    }
+
+  return true;
+}
+
+/* Find the previous executable insn.  */
+rtx_insn *
+prev_executable_insn (rtx_insn *insn)
+{
+  /* Step backwards until an executable insn shows up or none is left.  */
+  for (insn = PREV_INSN (insn); insn; insn = PREV_INSN (insn))
+    if (insn_executable_p (insn))
+      break;
+  return insn;
+}
+
+/* Find the next executable insn.  */
+rtx_insn *
+next_executable_insn (rtx_insn *insn)
+{
+  /* Step forwards until an executable insn shows up or none is left.  */
+  for (insn = NEXT_INSN (insn); insn; insn = NEXT_INSN (insn))
+    if (insn_executable_p (insn))
+      break;
+  return insn;
+}
+
+/* Find the previous executable insn in the current basic block.  */
+rtx_insn *
+prev_executable_insn_local (rtx_insn *insn)
+{
+  for (insn = PREV_INSN (insn); insn; insn = PREV_INSN (insn))
+    {
+      if (insn_executable_p (insn))
+	return insn;
+
+      /* A label, jump or call among the skipped insns ends the search.  */
+      if (LABEL_P (insn) || JUMP_P (insn) || CALL_P (insn))
+	return NULL;
+    }
+  return insn;
+}
+
+/* Find the next executable insn in the current basic block.  */
+rtx_insn *
+next_executable_insn_local (rtx_insn *insn)
+{
+  for (insn = NEXT_INSN (insn); insn; insn = NEXT_INSN (insn))
+    {
+      if (insn_executable_p (insn))
+	return insn;
+
+      /* A label, jump or call among the skipped insns ends the search.  */
+      if (LABEL_P (insn) || JUMP_P (insn) || CALL_P (insn))
+	return NULL;
+    }
+  return insn;
+}
+
+/* Return true if an insn is marked as deleted.  */
+bool
+insn_deleted_p (rtx_insn *insn)
+{
+  /* Two things count as deleted here:
+     - an insn whose deleted flag is set, and
+     - a note whose kind is NOTE_INSN_DELETED.  */
+  if (insn->deleted ())
+    return true;
+
+  return NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED;
+}
+
+/* Functions to determine whether INSN is single-word, double-word
+   or partial-word load/store insn.  */
+
+bool
+load_single_p (rtx_insn *insn)
+{
+  /* Only insns of type LOAD qualify.  */
+  if (get_attr_type (insn) != TYPE_LOAD)
+    return false;
+
+  /* The move_di and move_df patterns are the double-word forms.  */
+  const int icode = INSN_CODE (insn);
+
+  return icode != CODE_FOR_move_di && icode != CODE_FOR_move_df;
+}
+
+bool
+store_single_p (rtx_insn *insn)
+{
+  /* Only insns of type STORE qualify.  */
+  if (get_attr_type (insn) != TYPE_STORE)
+    return false;
+
+  /* The move_di and move_df patterns are the double-word forms.  */
+  const int icode = INSN_CODE (insn);
+
+  return icode != CODE_FOR_move_di && icode != CODE_FOR_move_df;
+}
+
+bool
+load_double_p (rtx_insn *insn)
+{
+  /* Only insns of type LOAD qualify.  */
+  if (get_attr_type (insn) != TYPE_LOAD)
+    return false;
+
+  /* A double-word access uses the move_di or move_df pattern.  */
+  const int icode = INSN_CODE (insn);
+
+  return icode == CODE_FOR_move_di || icode == CODE_FOR_move_df;
+}
+
+bool
+store_double_p (rtx_insn *insn)
+{
+  /* Only insns of type STORE qualify.  */
+  if (get_attr_type (insn) != TYPE_STORE)
+    return false;
+
+  /* A double-word access uses the move_di or move_df pattern.  */
+  const int icode = INSN_CODE (insn);
+
+  return icode == CODE_FOR_move_di || icode == CODE_FOR_move_df;
+}
+
+bool
+store_offset_reg_p (rtx_insn *insn)
+{
+  /* Return true if INSN is a store whose address offset, as reported
+     by extract_offset_rtx, is a register.  */
+  rtx offset = NULL_RTX;
+
+  /* Only type STORE is examined; extract_offset_rtx is not called
+     for anything else.  */
+  if (get_attr_type (insn) == TYPE_STORE)
+    offset = extract_offset_rtx (insn);
+
+  /* A missing offset (NULL_RTX) or a non-register offset both
+     yield false.  */
+  return offset != NULL_RTX && REG_P (offset);
+}
+
+bool
+load_full_word_p (rtx_insn *insn)
+{
+  /* Double-word loads and non-loads are rejected first.  */
+  if (!nds32::load_single_p (insn))
+    return false;
+
+  /* A full-word load has an SImode source.  */
+  rtx src = SET_SRC (PATTERN (insn));
+  return GET_MODE (src) == SImode;
+}
+
+bool
+load_partial_word_p (rtx_insn *insn)
+{
+  /* Double-word loads and non-loads are rejected first.  */
+  if (!nds32::load_single_p (insn))
+    return false;
+
+  /* A partial-word load has an HImode or QImode source.  */
+  const enum machine_mode src_mode = GET_MODE (SET_SRC (PATTERN (insn)));
+
+  return src_mode == HImode || src_mode == QImode;
+}
+
+/* Determine if INSN is a post update insn, that is, whether
+   find_post_update_rtx locates a post update rtx in it.  */
+bool
+post_update_insn_p (rtx_insn *insn)
+{
+  /* find_post_update_rtx reports -1 when nothing is found.  */
+  const int update_index = find_post_update_rtx (insn);
+  return update_index != -1;
+}
+
+/* Return true if the address of MEM_RTX is a base register plus an
+   immediate offset.  MEM_RTX must be a MEM.  */
+bool
+immed_offset_p (rtx mem_rtx)
+{
+  gcc_assert (MEM_P (mem_rtx));
+
+  rtx address = XEXP (mem_rtx, 0);
+  switch (GET_CODE (address))
+    {
+    case REG:
+      /* (mem (reg)) is equivalent to (mem (plus (reg) (const_int 0))).  */
+      return true;
+    case PLUS:
+      /* (mem (plus (reg) (const_int)))  */
+      return GET_CODE (XEXP (address, 1)) == CONST_INT;
+    default:
+      return false;
+    }
+}
+
+/* Locate the post update rtx in INSN.  For a load/store multiple insn the
+   result is the vector index of the updating element of its parallel; for
+   a single load/store insn with a post update address it is 0; -1 means
+   INSN is not a post update insn.  Other insn types are not expected
+   and hit gcc_unreachable.  */
+int
+find_post_update_rtx (rtx_insn *insn)
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      {
+	/* Look for the first element of the parallel shaped like
+	   (set (reg) (plus ...)).  */
+	const int count = parallel_elements (insn);
+	for (int index = 0; index < count; index++)
+	  {
+	    rtx element = parallel_element (insn, index);
+
+	    if (GET_CODE (element) != SET)
+	      continue;
+
+	    if (REG_P (SET_DEST (element))
+		&& GET_CODE (SET_SRC (element)) == PLUS)
+	      return index;
+	  }
+	return -1;
+      }
+
+    case TYPE_LOAD:
+    case TYPE_FLOAD:
+    case TYPE_STORE:
+    case TYPE_FSTORE:
+      {
+	/* (mem (post_inc (reg))), (mem (post_dec (reg))) or
+	   (mem (post_modify ...)).  */
+	const enum rtx_code addr_code
+	  = GET_CODE (XEXP (extract_mem_rtx (insn), 0));
+
+	return (addr_code == POST_INC
+		|| addr_code == POST_DEC
+		|| addr_code == POST_MODIFY) ? 0 : -1;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Extract the MEM rtx from a load/store insn.  For a load it is looked
+   up in the source of the pattern, for a store in its destination.
+   Any other insn type is not expected and hits gcc_unreachable.  */
+rtx
+extract_mem_rtx (rtx_insn *insn)
+{
+  rtx body = PATTERN (insn);
+  rtx operand;
+
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD:
+    case TYPE_FLOAD:
+      operand = SET_SRC (body);
+      break;
+
+    case TYPE_STORE:
+    case TYPE_FSTORE:
+      operand = SET_DEST (body);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Plain access: the operand is the MEM itself.  */
+  if (MEM_P (operand))
+    return operand;
+
+  /* Unaligned address: (unspec [(mem)]).  The MEM is the first element
+     of the vector.  */
+  if (GET_CODE (operand) == UNSPEC)
+    {
+      rtx inner = XVECEXP (operand, 0, 0);
+      gcc_assert (MEM_P (inner));
+      return inner;
+    }
+
+  /* Otherwise the MEM is expected to be operand 0 of a wrapping rtx
+     such as (sign_extend (mem)).  */
+  rtx wrapped = XEXP (operand, 0);
+  gcc_assert (MEM_P (wrapped));
+  return wrapped;
+}
+
+/* Extract the base register from load/store insns.  The function returns
+   NULL_RTX if the address does not consist of any registers.  */
+rtx
+extract_base_reg (rtx_insn *insn)
+{
+  const enum attr_type type = get_attr_type (insn);
+  rtx mem_rtx;
+
+  /* Find the MEM rtx.  A load/store multiple insn that updates its base
+     register yields that register directly.  */
+  switch (type)
+    {
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      {
+	const int update_index = find_post_update_rtx (insn);
+
+	if (update_index != -1)
+	  return SET_DEST (parallel_element (insn, update_index));
+
+	/* Otherwise use the memory operand of the first element: the
+	   source of a load, the destination of a store.  */
+	rtx first = parallel_element (insn, 0);
+	mem_rtx = (type == TYPE_LOAD_MULTIPLE
+		   ? SET_SRC (first) : SET_DEST (first));
+	break;
+      }
+
+    case TYPE_LOAD:
+    case TYPE_FLOAD:
+    case TYPE_STORE:
+    case TYPE_FSTORE:
+      mem_rtx = extract_mem_rtx (insn);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (MEM_P (mem_rtx));
+
+  rtx addr = XEXP (mem_rtx, 0);
+  const enum rtx_code addr_code = GET_CODE (addr);
+
+  /* (mem (reg))  */
+  if (addr_code == REG)
+    return addr;
+
+  /* (mem (lo_sum (reg) (symbol_ref)))  */
+  if (addr_code == LO_SUM)
+    return XEXP (addr, 0);
+
+  /* A bare symbol or constant address involves no register.  */
+  if (addr_code == SYMBOL_REF || addr_code == CONST)
+    return NULL_RTX;
+
+  /* (mem (plus (reg) (const_int))) or
+     (mem (plus (mult (reg) (const_int 4)) (reg))) or
+     (mem (post_inc (reg))) or
+     (mem (post_dec (reg))) or
+     (mem (post_modify (reg) (plus (reg) (reg))))  */
+  gcc_assert (addr_code == PLUS
+	      || addr_code == POST_INC
+	      || addr_code == POST_DEC
+	      || addr_code == POST_MODIFY);
+
+  /* The base is operand 0 when that is a register, else operand 1.  */
+  rtx operand0 = XEXP (addr, 0);
+  if (REG_P (operand0))
+    return operand0;
+
+  gcc_assert (REG_P (XEXP (addr, 1)));
+  return XEXP (addr, 1);
+}
+
+/* Extract the offset rtx from load/store insns: the const_int, index
+   register, LO_SUM second operand or POST_MODIFY increment that is
+   combined with the base register.  The function returns NULL_RTX if
+   offset is absent.  */
+rtx
+extract_offset_rtx (rtx_insn *insn)
+{
+  rtx mem_rtx, offset_rtx;
+
+  /* Find the MEM rtx.  The multiple load/store insns don't have
+     the offset field so we can return NULL_RTX here.  */
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LOAD_MULTIPLE:
+    case TYPE_STORE_MULTIPLE:
+      return NULL_RTX;
+
+    case TYPE_LOAD:
+    case TYPE_FLOAD:
+    case TYPE_STORE:
+    case TYPE_FSTORE:
+      mem_rtx = extract_mem_rtx (insn);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (MEM_P (mem_rtx));
+
+  rtx plus_rtx = XEXP (mem_rtx, 0);
+  /* (mem (reg))  */
+  if (REG_P (plus_rtx))
+    return NULL_RTX;
+
+  switch (GET_CODE (plus_rtx))
+    {
+    case SYMBOL_REF:
+    case CONST:
+    case POST_INC:
+    case POST_DEC:
+      return NULL_RTX;
+
+    case PLUS:
+      /* (mem (plus (reg) (const_int))) or
+	 (mem (plus (mult (reg) (const_int 4)) (reg)))  */
+      if (REG_P (XEXP (plus_rtx, 0)))
+	offset_rtx = XEXP (plus_rtx, 1);
+      else
+	{
+	  gcc_assert (REG_P (XEXP (plus_rtx, 1)));
+	  offset_rtx = XEXP (plus_rtx, 0);
+	}
+
+      if (ARITHMETIC_P (offset_rtx))
+	{
+	  gcc_assert (GET_CODE (offset_rtx) == MULT);
+	  gcc_assert (REG_P (XEXP (offset_rtx, 0)));
+	  offset_rtx = XEXP (offset_rtx, 0);
+	}
+      break;
+
+    case LO_SUM:
+      /* (mem (lo_sum (reg) (symbol_ref)))  */
+      offset_rtx = XEXP (plus_rtx, 1);
+      break;
+
+    case POST_MODIFY:
+      /* (mem (post_modify (reg) (plus (reg) (reg / const_int)))):
+	 operand 0 of the inner PLUS is the base, operand 1 the offset.  */
+      gcc_assert (REG_P (XEXP (plus_rtx, 0)));
+      plus_rtx = XEXP (plus_rtx, 1);
+      gcc_assert (GET_CODE (plus_rtx) == PLUS);
+      offset_rtx = XEXP (plus_rtx, 1);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return offset_rtx;
+}
+
+/* Extract the register of the shift operand from an ALU_SHIFT rtx.  */
+rtx
+extract_shift_reg (rtx_insn *insn)
+{
+  rtx pattern = extract_pattern_from_insn (insn);
+
+  rtx alu_op = SET_SRC (pattern);
+
+  /* Various forms of ALU_SHIFT can be made by the combiner, so the shift
+     may be either operand of the ALU rtx: when operand 0 is a plain
+     register the shift is operand 1, otherwise it is operand 0.
+     See the difference between add_slli and sub_slli in nds32.md.  */
+  const int shift_operand = REG_P (XEXP (alu_op, 0)) ? 1 : 0;
+  rtx shift_op = XEXP (alu_op, shift_operand);
+
+  /* The result is operand 0 of the shift rtx.  */
+  return XEXP (shift_op, 0);
+}
+
+/* Check if INSN is a movd44 insn: an ALU-type move_di or move_df whose
+   source and destination are both a REG or a SUBREG.  */
+bool
+movd44_insn_p (rtx_insn *insn)
+{
+  if (get_attr_type (insn) != TYPE_ALU)
+    return false;
+
+  const int icode = INSN_CODE (insn);
+  if (icode != CODE_FOR_move_di && icode != CODE_FOR_move_df)
+    return false;
+
+  rtx body = PATTERN (insn);
+  gcc_assert (GET_CODE (body) == SET);
+
+  rtx src = SET_SRC (body);
+  rtx dest = SET_DEST (body);
+
+  const bool src_ok_p = REG_P (src) || GET_CODE (src) == SUBREG;
+  const bool dest_ok_p = REG_P (dest) || GET_CODE (dest) == SUBREG;
+
+  return src_ok_p && dest_ok_p;
+}
+
+/* Extract the first result (even reg) of a movd44 insn, i.e. the low
+   part of its destination: SImode for DImode, SFmode for DFmode.  */
+rtx
+extract_movd44_even_reg (rtx_insn *insn)
+{
+  gcc_assert (movd44_insn_p (insn));
+
+  rtx dest = SET_DEST (PATTERN (insn));
+  gcc_assert (REG_P (dest) || GET_CODE (dest) == SUBREG);
+
+  /* Pick the mode of one half of the double-word destination.  */
+  const enum machine_mode whole_mode = GET_MODE (dest);
+  enum machine_mode half_mode;
+
+  if (whole_mode == DImode)
+    half_mode = SImode;
+  else if (whole_mode == DFmode)
+    half_mode = SFmode;
+  else
+    {
+      /* No other destination mode is expected here.  */
+      gcc_unreachable ();
+    }
+
+  return gen_lowpart (half_mode, dest);
+}
+
+/* Extract the second result (odd reg) of a movd44 insn, i.e. the high
+   part of its destination: SImode for DImode, SFmode for DFmode.  */
+rtx
+extract_movd44_odd_reg (rtx_insn *insn)
+{
+  gcc_assert (movd44_insn_p (insn));
+
+  rtx dest = SET_DEST (PATTERN (insn));
+  gcc_assert (REG_P (dest) || GET_CODE (dest) == SUBREG);
+
+  /* Pick the mode of one half of the double-word destination.  */
+  const enum machine_mode whole_mode = GET_MODE (dest);
+  enum machine_mode half_mode;
+
+  if (whole_mode == DImode)
+    half_mode = SImode;
+  else if (whole_mode == DFmode)
+    half_mode = SFmode;
+  else
+    {
+      /* No other destination mode is expected here.  */
+      gcc_unreachable ();
+    }
+
+  return gen_highpart (half_mode, dest);
+}
+
+/* The accumulation operand of a MAC insn is the destination of its SET.  */
+rtx
+extract_mac_acc_rtx (rtx_insn *insn)
+{
+  return SET_DEST (PATTERN (insn));
+}
+
+/* Extract the rtx representing non-accumulation operands of a MAC insn.
+   INSN must be of type MAC or DMAC.  */
+rtx
+extract_mac_non_acc_rtx (rtx_insn *insn)
+{
+  rtx mac_src = SET_SRC (PATTERN (insn));
+
+  switch (get_attr_type (insn))
+    {
+    case TYPE_MAC:
+    case TYPE_DMAC:
+      /* When operand 0 is a plain register, operand 1 is returned;
+	 otherwise operand 0 is.  */
+      return XEXP (mac_src, REG_P (XEXP (mac_src, 0)) ? 1 : 0);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Check if the DIV insn needs two write ports, which is the case for
+   the divmodsi4 and udivmodsi4 patterns.  INSN must be of type DIV.  */
+bool
+divmod_p (rtx_insn *insn)
+{
+  gcc_assert (get_attr_type (insn) == TYPE_DIV);
+
+  const int icode = INSN_CODE (insn);
+
+  return (icode == CODE_FOR_divmodsi4
+	  || icode == CODE_FOR_udivmodsi4);
+}
+
+/* Extract the rtx representing the branch target to help recognize
+   data hazards.  INSN must be a call or jump insn.  NULL_RTX is returned
+   for a conditional branch expressed with IF_THEN_ELSE.  */
+rtx
+extract_branch_target_rtx (rtx_insn *insn)
+{
+  gcc_assert (CALL_P (insn) || JUMP_P (insn));
+
+  rtx body = PATTERN (insn);
+  rtx inner;
+
+  switch (GET_CODE (body))
+    {
+    case SET:
+      /* RTXs in IF_THEN_ELSE are branch conditions.  */
+      if (GET_CODE (SET_SRC (body)) == IF_THEN_ELSE)
+	return NULL_RTX;
+      return SET_SRC (body);
+
+    case CALL:
+      return XEXP (body, 0);
+
+    case PARALLEL:
+      /* Only the first element of the parallel is examined.  */
+      inner = parallel_element (body, 0);
+      if (GET_CODE (inner) == SET)
+	return SET_SRC (inner);
+      if (GET_CODE (inner) == CALL)
+	return XEXP (inner, 0);
+      break;
+
+    case COND_EXEC:
+      /* Handle special cases of bltzal, bgezal and jralnez.  */
+      inner = XEXP (body, 1);
+      if (GET_CODE (inner) == SET)
+	return SET_SRC (inner);
+
+      if (GET_CODE (inner) == PARALLEL)
+	{
+	  inner = parallel_element (inner, 0);
+	  if (GET_CODE (inner) == SET)
+	    {
+	      rtx call_rtx = SET_SRC (inner);
+	      gcc_assert (GET_CODE (call_rtx) == CALL);
+	      return XEXP (call_rtx, 0);
+	    }
+
+	  if (GET_CODE (inner) == CALL)
+	    return XEXP (inner, 0);
+	}
+      break;
+
+    default:
+      break;
+    }
+
+  /* Every supported pattern shape has returned above.  */
+  gcc_unreachable ();
+}
+
+/* Extract the rtx representing the branch condition to help recognize
+   data hazards.  INSN must be a call or jump insn.  NULL_RTX is returned
+   when the pattern carries no condition.  */
+rtx
+extract_branch_condition_rtx (rtx_insn *insn)
+{
+  gcc_assert (CALL_P (insn) || JUMP_P (insn));
+
+  rtx body = PATTERN (insn);
+
+  switch (GET_CODE (body))
+    {
+    case SET:
+      /* (set ... (if_then_else (cond) ...)): the condition is operand 0.  */
+      if (GET_CODE (SET_SRC (body)) == IF_THEN_ELSE)
+	return XEXP (SET_SRC (body), 0);
+      return NULL_RTX;
+    case COND_EXEC:
+      /* (cond_exec (cond) ...): the condition is operand 0.  */
+      return XEXP (body, 0);
+    default:
+      return NULL_RTX;
+    }
+}
+
+/* Later back end passes cannot call compute_bb_for_insn () directly to
+   rebuild the insn-to-block mapping, because using BLOCK_FOR_INSN (insn)
+   when some insns have been deleted can cause a segmentation fault.  Use
+   this function instead; it also adjusts BB_END of every basic block.  */
+void
+compute_bb_for_insn_safe ()
+{
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn, *next_insn, *last_insn;
+      bool after_last_insn = false;
+
+      /* Find the last non-deleted insn, scanning back from BB_END (bb).  */
+      for (last_insn = BB_END (bb);
+	   PREV_INSN (last_insn) && insn_deleted_p (last_insn);
+	   last_insn = PREV_INSN (last_insn));
+
+      /* Bind each insn to its BB and stop at the new BB_END (bb).  */
+      for (insn = BB_HEAD (bb); insn; insn = NEXT_INSN (insn))
+	{
+	  BLOCK_FOR_INSN (insn) = bb;
+
+	  if (insn == last_insn)
+	    after_last_insn = true;
+
+	  next_insn = NEXT_INSN (insn);
+
+	  if (after_last_insn
+	      && (!next_insn
+		  || LABEL_P (next_insn)
+		  || NOTE_INSN_BASIC_BLOCK_P (next_insn)))
+	    {
+	      BB_END (bb) = insn;
+	      break;
+	    }
+	}
+    }
+}
+
+/* Exchange the positions of INSN1 and INSN2 in the insn chain.  A
+   relax_group insn directly preceding either of them is moved along
+   with it, and a TImode/VOIDmode marking is swapped so that it stays
+   with the position rather than with the insn.  */
+void
+exchange_insns (rtx_insn *insn1, rtx_insn *insn2)
+{
+  if (INSN_UID (insn1) == INSN_UID (insn2))
+    return;
+
+  /* Each insn is moved as the range [first, insn].  */
+  rtx_insn *first1 = insn1;
+  rtx_insn *first2 = insn2;
+  rtx_insn *before;
+
+  before = PREV_INSN (insn1);
+  if (before && INSN_CODE (before) == CODE_FOR_relax_group)
+    first1 = before;
+
+  before = PREV_INSN (insn2);
+  if (before && INSN_CODE (before) == CODE_FOR_relax_group)
+    first2 = before;
+
+  /* Swap the modes when one is TImode and the other VOIDmode.  */
+  const enum machine_mode mode1 = GET_MODE (insn1);
+  const enum machine_mode mode2 = GET_MODE (insn2);
+
+  if ((mode1 == TImode && mode2 == VOIDmode)
+      || (mode1 == VOIDmode && mode2 == TImode))
+    {
+      PUT_MODE (insn1, mode2);
+      PUT_MODE (insn2, mode1);
+    }
+
+  before = PREV_INSN (first1);
+  if (before)
+    {
+      /* Move range 1 after INSN2, then range 2 to where range 1 was.  */
+      reorder_insns (first1, insn1, insn2);
+      reorder_insns (first2, insn2, before);
+      return;
+    }
+
+  /* Range 1 starts the chain, so anchor on the insn before range 2.  */
+  before = PREV_INSN (first2);
+  gcc_assert (before);
+
+  reorder_insns (first2, insn2, insn1);
+  reorder_insns (first1, insn1, before);
+}
+
+} // namespace nds32
diff --git a/gcc/config/nds32/nds32.c b/gcc/config/nds32/nds32.c
index c47c122..5f5e668 100644
--- a/gcc/config/nds32/nds32.c
+++ b/gcc/config/nds32/nds32.c
@@ -24,48 +24,103 @@
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
-#include "target.h"
-#include "rtl.h"
 #include "tree.h"
+#include "rtl.h"
 #include "df.h"
-#include "tm_p.h"
-#include "optabs.h"		/* For GEN_FCN.  */
-#include "regs.h"
-#include "emit-rtl.h"
-#include "recog.h"
-#include "diagnostic-core.h"
+#include "alias.h"
+#include "stringpool.h"
 #include "stor-layout.h"
 #include "varasm.h"
 #include "calls.h"
+#include "regs.h"
+#include "insn-config.h"	/* Required by recog.h.  */
+#include "conditions.h"
 #include "output.h"
+#include "insn-attr.h"		/* For DFA state_t.  */
+#include "insn-codes.h"		/* For CODE_FOR_xxx.  */
+#include "reload.h"		/* For push_reload().  */
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
 #include "explow.h"
+#include "emit-rtl.h"
+#include "stmt.h"
 #include "expr.h"
+#include "recog.h"
+#include "diagnostic-core.h"
+#include "cfgrtl.h"
+#include "cfganal.h"
+#include "lcm.h"
+#include "cfgbuild.h"
+#include "cfgcleanup.h"
+#include "tm_p.h"
 #include "tm-constrs.h"
+#include "optabs.h"		/* For GEN_FCN.  */
+#include "target.h"
+#include "langhooks.h"		/* For add_builtin_function().  */
 #include "builtins.h"
+#include "cpplib.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "cfgloop.h"
+#include "cfghooks.h"
+#include "hw-doloop.h"
+#include "context.h"
+#include "sched-int.h"

 /* This file should be included last.  */
 #include "target-def.h"

 /* ------------------------------------------------------------------------ */

-/* This file is divided into five parts:
+/* This file is divided into six parts:

-     PART 1: Auxiliary static variable definitions and
-             target hook static variable definitions.
+     PART 1: Auxiliary external function and variable declarations.

-     PART 2: Auxiliary static function definitions.
+     PART 2: Auxiliary static variable definitions and
+	     target hook static variable definitions.

-     PART 3: Implement target hook stuff definitions.
+     PART 3: Auxiliary static function definitions.

-     PART 4: Implemet extern function definitions,
-             the prototype is in nds32-protos.h.
+     PART 4: Implement target hook stuff definitions.

-     PART 5: Initialize target hook structure and definitions.  */
+     PART 5: Implement extern function definitions,
+	     the prototype is in nds32-protos.h.
+
+     PART 6: Initialize target hook structure and definitions.  */
+
+/* ------------------------------------------------------------------------ */
+
+/* PART 1: Auxiliary external function and variable declarations.  */
+
+namespace nds32 {
+namespace scheduling {
+
+rtl_opt_pass *make_pass_nds32_print_stalls (gcc::context *);
+
+} // namespace scheduling
+} // namespace nds32
+
+rtl_opt_pass *make_pass_nds32_fp_as_gp (gcc::context *);
+rtl_opt_pass *make_pass_nds32_load_store_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_soft_fp_arith_comm_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_regrename_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_gcse_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_relax_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_rename_lmwsmw_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_gen_lmwsmw_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_const_remater_opt (gcc::context *);
+rtl_opt_pass *make_pass_nds32_cprop_acc_opt (gcc::context *);
+
+gimple_opt_pass *make_pass_nds32_sign_conversion_opt (gcc::context *);
+gimple_opt_pass *make_pass_nds32_scalbn_transform_opt (gcc::context *);
+gimple_opt_pass *make_pass_nds32_abi_compatible (gcc::context *);

 /* ------------------------------------------------------------------------ */

-/* PART 1: Auxiliary static variable definitions and
-           target hook static variable definitions.  */
+/* PART 2: Auxiliary static variable definitions and
+	   target hook static variable definitions.  */

 /* Define intrinsic register names.
    Please refer to nds32_intrinsic.h file, the index is corresponding to
@@ -73,14 +128,217 @@
    NOTE that the base value starting from 1024.  */
 static const char * const nds32_intrinsic_register_names[] =
 {
-  "$PSW", "$IPSW", "$ITYPE", "$IPC"
+  "$CPU_VER",
+  "$ICM_CFG",
+  "$DCM_CFG",
+  "$MMU_CFG",
+  "$MSC_CFG",
+  "$MSC_CFG2",
+  "$CORE_ID",
+  "$FUCOP_EXIST",
+
+  "$PSW",
+  "$IPSW",
+  "$P_IPSW",
+  "$IVB",
+  "$EVA",
+  "$P_EVA",
+  "$ITYPE",
+  "$P_ITYPE",
+
+  "$MERR",
+  "$IPC",
+  "$P_IPC",
+  "$OIPC",
+  "$P_P0",
+  "$P_P1",
+
+  "$INT_MASK",
+  "$INT_MASK2",
+  "$INT_MASK3",
+  "$INT_PEND",
+  "$INT_PEND2",
+  "$INT_PEND3",
+  "$SP_USR",
+  "$SP_PRIV",
+  "$INT_PRI",
+  "$INT_PRI2",
+  "$INT_PRI3",
+  "$INT_PRI4",
+  "$INT_CTRL",
+  "$INT_TRIGGER",
+  "$INT_TRIGGER2",
+  "$INT_GPR_PUSH_DIS",
+
+  "$MMU_CTL",
+  "$L1_PPTB",
+  "$TLB_VPN",
+  "$TLB_DATA",
+  "$TLB_MISC",
+  "$VLPT_IDX",
+  "$ILMB",
+  "$DLMB",
+
+  "$CACHE_CTL",
+  "$HSMP_SADDR",
+  "$HSMP_EADDR",
+  "$SDZ_CTL",
+  "$N12MISC_CTL",
+  "$MISC_CTL",
+  "$ECC_MISC",
+
+  "$BPC0",
+  "$BPC1",
+  "$BPC2",
+  "$BPC3",
+  "$BPC4",
+  "$BPC5",
+  "$BPC6",
+  "$BPC7",
+
+  "$BPA0",
+  "$BPA1",
+  "$BPA2",
+  "$BPA3",
+  "$BPA4",
+  "$BPA5",
+  "$BPA6",
+  "$BPA7",
+
+  "$BPAM0",
+  "$BPAM1",
+  "$BPAM2",
+  "$BPAM3",
+  "$BPAM4",
+  "$BPAM5",
+  "$BPAM6",
+  "$BPAM7",
+
+  "$BPV0",
+  "$BPV1",
+  "$BPV2",
+  "$BPV3",
+  "$BPV4",
+  "$BPV5",
+  "$BPV6",
+  "$BPV7",
+
+  "$BPCID0",
+  "$BPCID1",
+  "$BPCID2",
+  "$BPCID3",
+  "$BPCID4",
+  "$BPCID5",
+  "$BPCID6",
+  "$BPCID7",
+
+  "$EDM_CFG",
+  "$EDMSW",
+  "$EDM_CTL",
+  "$EDM_DTR",
+  "$BPMTC",
+  "$DIMBR",
+
+  "$TECR0",
+  "$TECR1",
+  "$PFMC0",
+  "$PFMC1",
+  "$PFMC2",
+  "$PFM_CTL",
+  "$PFT_CTL",
+  "$HSP_CTL",
+  "$SP_BOUND",
+  "$SP_BOUND_PRIV",
+  "$SP_BASE",
+  "$SP_BASE_PRIV",
+  "$FUCOP_CTL",
+  "$PRUSR_ACC_CTL",
+
+  "$DMA_CFG",
+  "$DMA_GCSW",
+  "$DMA_CHNSEL",
+  "$DMA_ACT",
+  "$DMA_SETUP",
+  "$DMA_ISADDR",
+  "$DMA_ESADDR",
+  "$DMA_TCNT",
+  "$DMA_STATUS",
+  "$DMA_2DSET",
+  "$DMA_2DSCTL",
+  "$DMA_RCNT",
+  "$DMA_HSTATUS",
+
+  "$PC",
+  "$SP_USR1",
+  "$SP_USR2",
+  "$SP_USR3",
+  "$SP_PRIV1",
+  "$SP_PRIV2",
+  "$SP_PRIV3",
+  "$BG_REGION",
+  "$SFCR",
+  "$SIGN",
+  "$ISIGN",
+  "$P_ISIGN",
+  "$IFC_LP",
+  "$ITB"
+};
+
+/* Define intrinsic cctl names, followed by the dpref names table.  */
+static const char * const nds32_cctl_names[] =
+{
+  "L1D_VA_FILLCK",
+  "L1D_VA_ULCK",
+  "L1I_VA_FILLCK",
+  "L1I_VA_ULCK",
+
+  "L1D_IX_WBINVAL",
+  "L1D_IX_INVAL",
+  "L1D_IX_WB",
+  "L1I_IX_INVAL",
+
+  "L1D_VA_INVAL",
+  "L1D_VA_WB",
+  "L1D_VA_WBINVAL",
+  "L1I_VA_INVAL",
+
+  "L1D_IX_RTAG",
+  "L1D_IX_RWD",
+  "L1I_IX_RTAG",
+  "L1I_IX_RWD",
+
+  "L1D_IX_WTAG",
+  "L1D_IX_WWD",
+  "L1I_IX_WTAG",
+  "L1I_IX_WWD"
+};
+
+static const char * const nds32_dpref_names[] =
+{
+  "SRD",
+  "MRD",
+  "SWR",
+  "MWR",
+  "PTE",
+  "CLWR"
+};
+
+/* Defining register allocation order for performance.
+   We want to allocate callee-saved registers after others.
+   It may be used by nds32_adjust_reg_alloc_order().  */
+static const int nds32_reg_alloc_order_for_speed[] =
+{
+   0,   1,   2,   3,   4,   5,  16,  17,
+  18,  19,  20,  21,  22,  23,  24,  25,
+  26,  27,   6,   7,   8,   9,  10,  11,
+  12,  13,  14,  15
 };

 /* Defining target-specific uses of __attribute__.  */
 static const struct attribute_spec nds32_attribute_table[] =
 {
   /* Syntax: { name, min_len, max_len, decl_required, type_required,
-               function_type_required, handler, affects_type_identity } */
+	       function_type_required, handler, affects_type_identity } */

   /* The interrupt vid: [0-63]+ (actual vector number starts from 9 to 72).  */
   { "interrupt",    1, 64, false, false, false, NULL, false },
@@ -93,6 +351,7 @@ static const struct attribute_spec nds32_attribute_table[] =
   { "nested",       0,  0, false, false, false, NULL, false },
   { "not_nested",   0,  0, false, false, false, NULL, false },
   { "nested_ready", 0,  0, false, false, false, NULL, false },
+  { "critical",     0,  0, false, false, false, NULL, false },

   /* The attributes describing isr register save scheme.  */
   { "save_all",     0,  0, false, false, false, NULL, false },
@@ -102,17 +361,32 @@ static const struct attribute_spec nds32_attribute_table[] =
   { "nmi",          1,  1, false, false, false, NULL, false },
   { "warm",         1,  1, false, false, false, NULL, false },

+  /* The attributes describing isr security level. */
+  { "secure",       1,  1, false, false, false, NULL, false },
+
   /* The attribute telling no prologue/epilogue.  */
   { "naked",        0,  0, false, false, false, NULL, false },

+  /* The attribute is used to set signature.  */
+  { "signature",    0,  0, false, false, false, NULL, false },
+
+  /* The attribute is used to tell this function to be ROM patch.  */
+  { "indirect_call",0,  0, false, false, false, NULL, false },
+
+  /* FOR BACKWARD COMPATIBILITY,
+     this attribute also tells no prologue/epilogue.  */
+  { "no_prologue",  0,  0, false, false, false, NULL, false },
+
+  /* The attribute turns off hwloop optimization.  */
+  { "no_ext_zol",    0,  0, false,  false, false, NULL, false},
+
   /* The last attribute spec is set to be NULL.  */
   { NULL,           0,  0, false, false, false, NULL, false }
 };

-
 /* ------------------------------------------------------------------------ */

-/* PART 2: Auxiliary static function definitions.  */
+/* PART 3: Auxiliary static function definitions.  */

 /* Function to save and restore machine-specific function data.  */
 static struct machine_function *
@@ -121,12 +395,24 @@ nds32_init_machine_status (void)
   struct machine_function *machine;
   machine = ggc_cleared_alloc<machine_function> ();

+  /* Initially assume this function does not use __builtin_eh_return.  */
+  machine->use_eh_return_p = 0;
+
   /* Initially assume this function needs prologue/epilogue.  */
   machine->naked_p = 0;

   /* Initially assume this function does NOT use fp_as_gp optimization.  */
   machine->fp_as_gp_p = 0;

+  /* Initially this function is not under strictly aligned situation.  */
+  machine->strict_aligned_p = 0;
+
+  /* Initially this function has no naked and no_prologue attributes.  */
+  machine->attr_naked_p = 0;
+  machine->attr_no_prologue_p = 0;
+
+  /* Initially the hwloop group ID number of this function is zero.  */
+  machine->hwloop_group_id = 0;
   return machine;
 }

@@ -137,23 +423,63 @@ nds32_compute_stack_frame (void)
 {
   int r;
   int block_size;
+  bool v3pushpop_p;

   /* Because nds32_compute_stack_frame() will be called from different place,
      everytime we enter this function, we have to assume this function
      needs prologue/epilogue.  */
   cfun->machine->naked_p = 0;

+  /* We need to mark whether this function has the naked and no_prologue
+     attributes so that we can distinguish the difference if users apply
+     the -mret-in-naked-func option.  */
+  cfun->machine->attr_naked_p
+    = lookup_attribute ("naked", DECL_ATTRIBUTES (current_function_decl))
+      ? 1 : 0;
+  cfun->machine->attr_no_prologue_p
+    = lookup_attribute ("no_prologue", DECL_ATTRIBUTES (current_function_decl))
+      ? 1 : 0;
+
+  /* If __builtin_eh_return is used, we better have frame pointer needed
+     so that we can easily locate the stack slot of return address.  */
+  if (crtl->calls_eh_return)
+    {
+      frame_pointer_needed = 1;
+
+      /* We need to mark eh data registers that need to be saved
+	 in the stack.  */
+      cfun->machine->eh_return_data_first_regno = EH_RETURN_DATA_REGNO (0);
+      for (r = 0; EH_RETURN_DATA_REGNO (r) != INVALID_REGNUM; r++)
+	cfun->machine->eh_return_data_last_regno = r;
+
+      cfun->machine->eh_return_data_regs_size
+	= 4 * (cfun->machine->eh_return_data_last_regno
+	       - cfun->machine->eh_return_data_first_regno
+	       + 1);
+      cfun->machine->use_eh_return_p = 1;
+    }
+  else
+    {
+      /* Assigning SP_REGNUM to eh_first_regno and eh_last_regno means we
+	 do not need to handle __builtin_eh_return case in this function.  */
+      cfun->machine->eh_return_data_first_regno = SP_REGNUM;
+      cfun->machine->eh_return_data_last_regno  = SP_REGNUM;
+
+      cfun->machine->eh_return_data_regs_size = 0;
+      cfun->machine->use_eh_return_p = 0;
+    }
+
   /* Get variadic arguments size to prepare pretend arguments and
      we will push them into stack at prologue by ourself.  */
   cfun->machine->va_args_size = crtl->args.pretend_args_size;
   if (cfun->machine->va_args_size != 0)
     {
       cfun->machine->va_args_first_regno
-        = NDS32_GPR_ARG_FIRST_REGNUM
-          + NDS32_MAX_GPR_REGS_FOR_ARGS
-          - (crtl->args.pretend_args_size / UNITS_PER_WORD);
+	= NDS32_GPR_ARG_FIRST_REGNUM
+	  + NDS32_MAX_GPR_REGS_FOR_ARGS
+	  - (crtl->args.pretend_args_size / UNITS_PER_WORD);
       cfun->machine->va_args_last_regno
-        = NDS32_GPR_ARG_FIRST_REGNUM + NDS32_MAX_GPR_REGS_FOR_ARGS - 1;
+	= NDS32_GPR_ARG_FIRST_REGNUM + NDS32_MAX_GPR_REGS_FOR_ARGS - 1;
     }
   else
     {
@@ -183,11 +509,13 @@ nds32_compute_stack_frame (void)

   /* If $gp value is required to be saved on stack, it needs 4 bytes space.
      Check whether we are using PIC code genration.  */
-  cfun->machine->gp_size = (flag_pic) ? 4 : 0;
+  cfun->machine->gp_size =
+    (flag_pic && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM)) ? 4 : 0;

   /* If $lp value is required to be saved on stack, it needs 4 bytes space.
      Check whether $lp is ever live.  */
-  cfun->machine->lp_size = (df_regs_ever_live_p (LP_REGNUM)) ? 4 : 0;
+  cfun->machine->lp_size
+    = (flag_always_save_lp || df_regs_ever_live_p (LP_REGNUM)) ? 4 : 0;

   /* Initially there is no padding bytes.  */
   cfun->machine->callee_saved_area_gpr_padding_bytes = 0;
@@ -196,6 +524,10 @@ nds32_compute_stack_frame (void)
   cfun->machine->callee_saved_gpr_regs_size = 0;
   cfun->machine->callee_saved_first_gpr_regno = SP_REGNUM;
   cfun->machine->callee_saved_last_gpr_regno  = SP_REGNUM;
+  cfun->machine->callee_saved_fpr_regs_size = 0;
+  cfun->machine->callee_saved_first_fpr_regno = SP_REGNUM;
+  cfun->machine->callee_saved_last_fpr_regno  = SP_REGNUM;
+
   /* Currently, there is no need to check $r28~$r31
      because we will save them in another way.  */
   for (r = 0; r < 28; r++)
@@ -213,43 +545,77 @@ nds32_compute_stack_frame (void)
 	}
     }

+  /* Recording fpu callee-saved register.  */
+  if (TARGET_HARD_FLOAT)
+    {
+      for (r = NDS32_FIRST_FPR_REGNUM; r < NDS32_LAST_FPR_REGNUM; r++)
+	{
+	  if (NDS32_REQUIRED_CALLEE_SAVED_P (r))
+	    {
+	      /* Mark the first required callee-saved register.  */
+	      if (cfun->machine->callee_saved_first_fpr_regno == SP_REGNUM)
+		{
+		  /* Make the first callee-saved register number even,
+		     because we use doubleword access, and this way
+		     guarantees 8-byte alignment.  */
+		  if (!NDS32_FPR_REGNO_OK_FOR_DOUBLE (r))
+		    cfun->machine->callee_saved_first_fpr_regno = r - 1;
+		  else
+		    cfun->machine->callee_saved_first_fpr_regno = r;
+		}
+	      cfun->machine->callee_saved_last_fpr_regno = r;
+	    }
+	}
+
+      /* Make the last callee-saved register number odd,
+	 so that the number of callee-saved registers is even.  */
+      int last_fpr = cfun->machine->callee_saved_last_fpr_regno;
+      if (NDS32_FPR_REGNO_OK_FOR_DOUBLE (last_fpr))
+	cfun->machine->callee_saved_last_fpr_regno++;
+    }
+
   /* Check if this function can omit prologue/epilogue code fragment.
-     If there is 'naked' attribute in this function,
+     If there is 'no_prologue'/'naked'/'secure' attribute in this function,
      we can set 'naked_p' flag to indicate that
      we do not have to generate prologue/epilogue.
      Or, if all the following conditions succeed,
      we can set this function 'naked_p' as well:
        condition 1: first_regno == last_regno == SP_REGNUM,
-                    which means we do not have to save
-                    any callee-saved registers.
+		    which means we do not have to save
+		    any callee-saved registers.
        condition 2: Both $lp and $fp are NOT live in this function,
-                    which means we do not need to save them and there
-                    is no outgoing size.
+		    which means we do not need to save them and there
+		    is no outgoing size.
        condition 3: There is no local_size, which means
-                    we do not need to adjust $sp.  */
-  if (lookup_attribute ("naked", DECL_ATTRIBUTES (current_function_decl))
+		    we do not need to adjust $sp.  */
+  if (lookup_attribute ("no_prologue", DECL_ATTRIBUTES (current_function_decl))
+      || lookup_attribute ("naked", DECL_ATTRIBUTES (current_function_decl))
+      || lookup_attribute ("secure", DECL_ATTRIBUTES (current_function_decl))
       || (cfun->machine->callee_saved_first_gpr_regno == SP_REGNUM
 	  && cfun->machine->callee_saved_last_gpr_regno == SP_REGNUM
+	  && cfun->machine->callee_saved_first_fpr_regno == SP_REGNUM
+	  && cfun->machine->callee_saved_last_fpr_regno == SP_REGNUM
 	  && !df_regs_ever_live_p (FP_REGNUM)
 	  && !df_regs_ever_live_p (LP_REGNUM)
-	  && cfun->machine->local_size == 0))
+	  && cfun->machine->local_size == 0
+	  && !flag_pic))
     {
       /* Set this function 'naked_p' and other functions can check this flag.
-         Note that in nds32 port, the 'naked_p = 1' JUST means there is no
-         callee-saved, local size, and outgoing size.
-         The varargs space and ret instruction may still present in
-         the prologue/epilogue expanding.  */
+	 Note that in nds32 port, the 'naked_p = 1' JUST means there is no
+	 callee-saved, local size, and outgoing size.
+	 The varargs space and ret instruction may still present in
+	 the prologue/epilogue expanding.  */
       cfun->machine->naked_p = 1;

       /* No need to save $fp, $gp, and $lp.
-         We should set these value to be zero
-         so that nds32_initial_elimination_offset() can work properly.  */
+	 We should set these value to be zero
+	 so that nds32_initial_elimination_offset() can work properly.  */
       cfun->machine->fp_size = 0;
       cfun->machine->gp_size = 0;
       cfun->machine->lp_size = 0;

       /* If stack usage computation is required,
-         we need to provide the static stack size.  */
+	 we need to provide the static stack size.  */
       if (flag_stack_usage_info)
 	current_function_static_stack_size = 0;

@@ -257,6 +623,8 @@ nds32_compute_stack_frame (void)
       return;
     }

+  v3pushpop_p = NDS32_V3PUSH_AVAILABLE_P;
+
   /* Adjustment for v3push instructions:
      If we are using v3push (push25/pop25) instructions,
      we need to make sure Rb is $r6 and Re is
@@ -264,16 +632,14 @@ nds32_compute_stack_frame (void)
      Some results above will be discarded and recomputed.
      Note that it is only available under V3/V3M ISA and we
      DO NOT setup following stuff for isr or variadic function.  */
-  if (TARGET_V3PUSH
-      && !nds32_isr_function_p (current_function_decl)
-      && (cfun->machine->va_args_size == 0))
+  if (v3pushpop_p)
     {
       /* Recompute:
-           cfun->machine->fp_size
-           cfun->machine->gp_size
-           cfun->machine->lp_size
-           cfun->machine->callee_saved_regs_first_regno
-           cfun->machine->callee_saved_regs_last_regno */
+	   cfun->machine->fp_size
+	   cfun->machine->gp_size
+	   cfun->machine->lp_size
+	   cfun->machine->callee_saved_first_gpr_regno
+	   cfun->machine->callee_saved_last_gpr_regno */

       /* For v3push instructions, $fp, $gp, and $lp are always saved.  */
       cfun->machine->fp_size = 4;
@@ -316,11 +682,46 @@ nds32_compute_stack_frame (void)
 	}
     }

-  /* We have correctly set callee_saved_regs_first_regno
-     and callee_saved_regs_last_regno.
-     Initially, the callee_saved_regs_size is supposed to be 0.
-     As long as callee_saved_regs_last_regno is not SP_REGNUM,
-     we can update callee_saved_regs_size with new size.  */
+  int sp_adjust = cfun->machine->local_size
+		  + cfun->machine->out_args_size
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes
+		  + cfun->machine->callee_saved_fpr_regs_size;
+
+  if (!v3pushpop_p
+      && nds32_memory_model_option == MEMORY_MODEL_FAST
+      && sp_adjust == 0
+      && !frame_pointer_needed)
+    {
+      block_size = cfun->machine->fp_size
+		   + cfun->machine->gp_size
+		   + cfun->machine->lp_size
+		   + (4 * (cfun->machine->callee_saved_last_gpr_regno
+			   - cfun->machine->callee_saved_first_gpr_regno
+			   + 1));
+
+      if (!NDS32_DOUBLE_WORD_ALIGN_P (block_size))
+	{
+	  /* $r14 is last callee save register.  */
+	  if (cfun->machine->callee_saved_last_gpr_regno
+	      < NDS32_LAST_CALLEE_SAVE_GPR_REGNUM)
+	    {
+	      cfun->machine->callee_saved_last_gpr_regno++;
+	    }
+	  else if (cfun->machine->callee_saved_first_gpr_regno == SP_REGNUM)
+	    {
+	      cfun->machine->callee_saved_first_gpr_regno
+		= NDS32_FIRST_CALLEE_SAVE_GPR_REGNUM;
+	      cfun->machine->callee_saved_last_gpr_regno
+		= NDS32_FIRST_CALLEE_SAVE_GPR_REGNUM;
+	    }
+	}
+    }
+
+  /* We have correctly set callee_saved_first_gpr_regno
+     and callee_saved_last_gpr_regno.
+     Initially, the callee_saved_gpr_regs_size is supposed to be 0.
+     As long as callee_saved_last_gpr_regno is not SP_REGNUM,
+     we can update callee_saved_gpr_regs_size with new size.  */
   if (cfun->machine->callee_saved_last_gpr_regno != SP_REGNUM)
     {
       /* Compute pushed size of callee-saved registers.  */
@@ -330,10 +731,22 @@ nds32_compute_stack_frame (void)
 	       + 1);
     }

+  if (TARGET_HARD_FLOAT)
+    {
+      /* Compute size of callee saved floating-point registers.  */
+      if (cfun->machine->callee_saved_last_fpr_regno != SP_REGNUM)
+	{
+	  cfun->machine->callee_saved_fpr_regs_size
+	   = 4 * (cfun->machine->callee_saved_last_fpr_regno
+		  - cfun->machine->callee_saved_first_fpr_regno
+		  + 1);
+	}
+    }
+
   /* Important: We need to make sure that
-                (fp_size + gp_size + lp_size + callee_saved_regs_size)
-                is 8-byte alignment.
-                If it is not, calculate the padding bytes.  */
+		(fp_size + gp_size + lp_size + callee_saved_gpr_regs_size)
+		is 8-byte alignment.
+		If it is not, calculate the padding bytes.  */
   block_size = cfun->machine->fp_size
 	       + cfun->machine->gp_size
 	       + cfun->machine->lp_size
@@ -361,14 +774,15 @@ nds32_compute_stack_frame (void)
      "push registers to memory",
      "adjust stack pointer".  */
 static void
-nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
+nds32_emit_stack_push_multiple (unsigned Rb, unsigned Re,
+				bool save_fp_p, bool save_gp_p, bool save_lp_p,
+				bool vaarg_p)
 {
-  int regno;
+  unsigned regno;
   int extra_count;
   int num_use_regs;
   int par_index;
   int offset;
-  int save_fp, save_gp, save_lp;

   rtx reg;
   rtx mem;
@@ -381,39 +795,34 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
      necessary information for data analysis,
      so we create a parallel rtx like this:
      (parallel [(set (mem (plus (reg:SI SP_REGNUM) (const_int -32)))
-                     (reg:SI Rb))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
-                     (reg:SI Rb+1))
-                ...
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
-                     (reg:SI Re))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
-                     (reg:SI FP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
-                     (reg:SI GP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
-                     (reg:SI LP_REGNUM))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int -32)))]) */
-
-  /* Determine whether we need to save $fp, $gp, or $lp.  */
-  save_fp = INTVAL (En4) & 0x8;
-  save_gp = INTVAL (En4) & 0x4;
-  save_lp = INTVAL (En4) & 0x2;
+		     (reg:SI Rb))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
+		     (reg:SI Rb+1))
+		...
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
+		     (reg:SI Re))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
+		     (reg:SI FP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
+		     (reg:SI GP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
+		     (reg:SI LP_REGNUM))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int -32)))]) */

   /* Calculate the number of registers that will be pushed.  */
   extra_count = 0;
-  if (save_fp)
+  if (save_fp_p)
     extra_count++;
-  if (save_gp)
+  if (save_gp_p)
     extra_count++;
-  if (save_lp)
+  if (save_lp_p)
     extra_count++;
   /* Note that Rb and Re may be SP_REGNUM.  DO NOT count it in.  */
-  if (REGNO (Rb) == SP_REGNUM && REGNO (Re) == SP_REGNUM)
+  if (Rb == SP_REGNUM && Re == SP_REGNUM)
     num_use_regs = extra_count;
   else
-    num_use_regs = REGNO (Re) - REGNO (Rb) + 1 + extra_count;
+    num_use_regs = Re - Rb + 1 + extra_count;

   /* In addition to used registers,
      we need one more space for (set sp sp-x) rtx.  */
@@ -425,10 +834,10 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
   offset = -(num_use_regs * 4);

   /* Create (set mem regX) from Rb, Rb+1 up to Re.  */
-  for (regno = REGNO (Rb); regno <= (int) REGNO (Re); regno++)
+  for (regno = Rb; regno <= Re; regno++)
     {
       /* Rb and Re may be SP_REGNUM.
-         We need to break this loop immediately.  */
+	 We need to break this loop immediately.  */
       if (regno == SP_REGNUM)
 	break;

@@ -444,7 +853,7 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
     }

   /* Create (set mem fp), (set mem gp), and (set mem lp) if necessary.  */
-  if (save_fp)
+  if (save_fp_p)
     {
       reg = gen_rtx_REG (SImode, FP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -456,7 +865,7 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
       offset = offset + 4;
       par_index++;
     }
-  if (save_gp)
+  if (save_gp_p)
     {
       reg = gen_rtx_REG (SImode, GP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -468,7 +877,7 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
       offset = offset + 4;
       par_index++;
     }
-  if (save_lp)
+  if (save_lp_p)
     {
       reg = gen_rtx_REG (SImode, LP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -514,14 +923,14 @@ nds32_emit_stack_push_multiple (rtx Rb, rtx Re, rtx En4, bool vaarg_p)
      "pop registers from memory",
      "adjust stack pointer".  */
 static void
-nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)
+nds32_emit_stack_pop_multiple (unsigned Rb, unsigned Re,
+			       bool save_fp_p, bool save_gp_p, bool save_lp_p)
 {
-  int regno;
+  unsigned regno;
   int extra_count;
   int num_use_regs;
   int par_index;
   int offset;
-  int save_fp, save_gp, save_lp;

   rtx reg;
   rtx mem;
@@ -534,39 +943,34 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)
      necessary information for data analysis,
      so we create a parallel rtx like this:
      (parallel [(set (reg:SI Rb)
-                     (mem (reg:SI SP_REGNUM)))
-                (set (reg:SI Rb+1)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
-                ...
-                (set (reg:SI Re)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
-                (set (reg:SI FP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
-                (set (reg:SI GP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
-                (set (reg:SI LP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int 32)))]) */
-
-  /* Determine whether we need to restore $fp, $gp, or $lp.  */
-  save_fp = INTVAL (En4) & 0x8;
-  save_gp = INTVAL (En4) & 0x4;
-  save_lp = INTVAL (En4) & 0x2;
+		     (mem (reg:SI SP_REGNUM)))
+		(set (reg:SI Rb+1)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
+		...
+		(set (reg:SI Re)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
+		(set (reg:SI FP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
+		(set (reg:SI GP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
+		(set (reg:SI LP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int 32)))]) */

   /* Calculate the number of registers that will be poped.  */
   extra_count = 0;
-  if (save_fp)
+  if (save_fp_p)
     extra_count++;
-  if (save_gp)
+  if (save_gp_p)
     extra_count++;
-  if (save_lp)
+  if (save_lp_p)
     extra_count++;
   /* Note that Rb and Re may be SP_REGNUM.  DO NOT count it in.  */
-  if (REGNO (Rb) == SP_REGNUM && REGNO (Re) == SP_REGNUM)
+  if (Rb == SP_REGNUM && Re == SP_REGNUM)
     num_use_regs = extra_count;
   else
-    num_use_regs = REGNO (Re) - REGNO (Rb) + 1 + extra_count;
+    num_use_regs = Re - Rb + 1 + extra_count;

   /* In addition to used registers,
      we need one more space for (set sp sp+x) rtx.  */
@@ -578,10 +982,10 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)
   offset = 0;

   /* Create (set regX mem) from Rb, Rb+1 up to Re.  */
-  for (regno = REGNO (Rb); regno <= (int) REGNO (Re); regno++)
+  for (regno = Rb; regno <= Re; regno++)
     {
       /* Rb and Re may be SP_REGNUM.
-         We need to break this loop immediately.  */
+	 We need to break this loop immediately.  */
       if (regno == SP_REGNUM)
 	break;

@@ -599,7 +1003,7 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)
     }

   /* Create (set fp mem), (set gp mem), and (set lp mem) if necessary.  */
-  if (save_fp)
+  if (save_fp_p)
     {
       reg = gen_rtx_REG (SImode, FP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -613,7 +1017,7 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)

       dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, dwarf);
     }
-  if (save_gp)
+  if (save_gp_p)
     {
       reg = gen_rtx_REG (SImode, GP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -627,7 +1031,7 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)

       dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, dwarf);
     }
-  if (save_lp)
+  if (save_lp_p)
     {
       reg = gen_rtx_REG (SImode, LP_REGNUM);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -670,12 +1074,11 @@ nds32_emit_stack_pop_multiple (rtx Rb, rtx Re, rtx En4)
      "push registers to memory",
      "adjust stack pointer".  */
 static void
-nds32_emit_stack_v3push (rtx Rb,
-			 rtx Re,
-			 rtx En4 ATTRIBUTE_UNUSED,
-			 rtx imm8u)
+nds32_emit_stack_v3push (unsigned Rb,
+			 unsigned Re,
+			 unsigned imm8u)
 {
-  int regno;
+  unsigned regno;
   int num_use_regs;
   int par_index;
   int offset;
@@ -690,27 +1093,27 @@ nds32_emit_stack_v3push (rtx Rb,
      necessary information for data analysis,
      so we create a parallel rtx like this:
      (parallel [(set (mem (plus (reg:SI SP_REGNUM) (const_int -32)))
-                     (reg:SI Rb))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
-                     (reg:SI Rb+1))
-                ...
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
-                     (reg:SI Re))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
-                     (reg:SI FP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
-                     (reg:SI GP_REGNUM))
-                (set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
-                     (reg:SI LP_REGNUM))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int -32-imm8u)))]) */
+		     (reg:SI Rb))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -28)))
+		     (reg:SI Rb+1))
+		...
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -16)))
+		     (reg:SI Re))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -12)))
+		     (reg:SI FP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -8)))
+		     (reg:SI GP_REGNUM))
+		(set (mem (plus (reg:SI SP_REGNUM) (const_int -4)))
+		     (reg:SI LP_REGNUM))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int -32-imm8u)))]) */

   /* Calculate the number of registers that will be pushed.
      Since $fp, $gp, and $lp is always pushed with v3push instruction,
      we need to count these three registers.
      Under v3push, Rb is $r6, while Re is $r6, $r8, $r10, or $r14.
      So there is no need to worry about Rb=Re=SP_REGNUM case.  */
-  num_use_regs = REGNO (Re) - REGNO (Rb) + 1 + 3;
+  num_use_regs = Re - Rb + 1 + 3;

   /* In addition to used registers,
      we need one more space for (set sp sp-x-imm8u) rtx.  */
@@ -724,7 +1127,7 @@ nds32_emit_stack_v3push (rtx Rb,
   /* Create (set mem regX) from Rb, Rb+1 up to Re.
      Under v3push, Rb is $r6, while Re is $r6, $r8, $r10, or $r14.
      So there is no need to worry about Rb=Re=SP_REGNUM case.  */
-  for (regno = REGNO (Rb); regno <= (int) REGNO (Re); regno++)
+  for (regno = Rb; regno <= Re; regno++)
     {
       reg = gen_rtx_REG (SImode, regno);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -776,7 +1179,7 @@ nds32_emit_stack_v3push (rtx Rb,
     = gen_rtx_SET (stack_pointer_rtx,
 		   plus_constant (Pmode,
 				  stack_pointer_rtx,
-				  offset - INTVAL (imm8u)));
+				  offset - imm8u));
   XVECEXP (parallel_insn, 0, par_index) = adjust_sp_rtx;
   RTX_FRAME_RELATED_P (adjust_sp_rtx) = 1;

@@ -794,12 +1197,11 @@ nds32_emit_stack_v3push (rtx Rb,
      "pop registers from memory",
      "adjust stack pointer".  */
 static void
-nds32_emit_stack_v3pop (rtx Rb,
-			rtx Re,
-			rtx En4 ATTRIBUTE_UNUSED,
-			rtx imm8u)
+nds32_emit_stack_v3pop (unsigned Rb,
+			unsigned Re,
+			unsigned imm8u)
 {
-  int regno;
+  unsigned regno;
   int num_use_regs;
   int par_index;
   int offset;
@@ -815,27 +1217,27 @@ nds32_emit_stack_v3pop (rtx Rb,
      necessary information for data analysis,
      so we create a parallel rtx like this:
      (parallel [(set (reg:SI Rb)
-                     (mem (reg:SI SP_REGNUM)))
-                (set (reg:SI Rb+1)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
-                ...
-                (set (reg:SI Re)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
-                (set (reg:SI FP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
-                (set (reg:SI GP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
-                (set (reg:SI LP_REGNUM)
-                     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
-                (set (reg:SI SP_REGNUM)
-                     (plus (reg:SI SP_REGNUM) (const_int 32+imm8u)))]) */
+		     (mem (reg:SI SP_REGNUM)))
+		(set (reg:SI Rb+1)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 4))))
+		...
+		(set (reg:SI Re)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 16))))
+		(set (reg:SI FP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 20))))
+		(set (reg:SI GP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 24))))
+		(set (reg:SI LP_REGNUM)
+		     (mem (plus (reg:SI SP_REGNUM) (const_int 28))))
+		(set (reg:SI SP_REGNUM)
+		     (plus (reg:SI SP_REGNUM) (const_int 32+imm8u)))]) */

   /* Calculate the number of registers that will be poped.
      Since $fp, $gp, and $lp is always poped with v3pop instruction,
      we need to count these three registers.
      Under v3push, Rb is $r6, while Re is $r6, $r8, $r10, or $r14.
      So there is no need to worry about Rb=Re=SP_REGNUM case.  */
-  num_use_regs = REGNO (Re) - REGNO (Rb) + 1 + 3;
+  num_use_regs = Re - Rb + 1 + 3;

   /* In addition to used registers,
      we need one more space for (set sp sp+x+imm8u) rtx.  */
@@ -849,7 +1251,7 @@ nds32_emit_stack_v3pop (rtx Rb,
   /* Create (set regX mem) from Rb, Rb+1 up to Re.
      Under v3pop, Rb is $r6, while Re is $r6, $r8, $r10, or $r14.
      So there is no need to worry about Rb=Re=SP_REGNUM case.  */
-  for (regno = REGNO (Rb); regno <= (int) REGNO (Re); regno++)
+  for (regno = Rb; regno <= Re; regno++)
     {
       reg = gen_rtx_REG (SImode, regno);
       mem = gen_frame_mem (SImode, plus_constant (Pmode,
@@ -907,11 +1309,24 @@ nds32_emit_stack_v3pop (rtx Rb,
     = gen_rtx_SET (stack_pointer_rtx,
 		   plus_constant (Pmode,
 				  stack_pointer_rtx,
-				  offset + INTVAL (imm8u)));
+				  offset + imm8u));
   XVECEXP (parallel_insn, 0, par_index) = adjust_sp_rtx;

-  /* Tell gcc we adjust SP in this insn.  */
-  dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, copy_rtx (adjust_sp_rtx), dwarf);
+  if (frame_pointer_needed)
+    {
+      /* (expr_list:REG_CFA_DEF_CFA (plus:SI (reg/f:SI $sp)
+					     (const_int 0))
+	 means the CFA is redefined as $sp with offset 0.  */
+      rtx cfa_adjust_rtx = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+					 const0_rtx);
+      dwarf = alloc_reg_note (REG_CFA_DEF_CFA, cfa_adjust_rtx, dwarf);
+    }
+  else
+    {
+      /* Tell gcc we adjust SP in this insn.  */
+      dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA,
+			      copy_rtx (adjust_sp_rtx), dwarf);
+    }

   parallel_insn = emit_insn (parallel_insn);

@@ -924,6 +1339,32 @@ nds32_emit_stack_v3pop (rtx Rb,
   REG_NOTES (parallel_insn) = dwarf;
 }

+static void
+nds32_emit_load_gp (void)
+{
+  rtx got_symbol, pat;
+
+  /* Set up $gp with the GOT address; fence it off from the scheduler.  */
+  emit_insn (gen_blockage ());
+
+  got_symbol = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
+  /* High part: sethi $gp, _GLOBAL_OFFSET_TABLE_ - 8.  */
+  pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, got_symbol), UNSPEC_GOTINIT);
+  pat = gen_rtx_CONST (SImode, gen_rtx_PLUS (Pmode, pat, GEN_INT (-8)));
+  emit_insn (gen_sethi (pic_offset_table_rtx,pat));
+
+  /* Low part: ori $gp, $gp, _GLOBAL_OFFSET_TABLE_ - 4.  */
+  pat = gen_rtx_UNSPEC (SImode, gen_rtvec (1, got_symbol), UNSPEC_GOTINIT);
+  pat = gen_rtx_CONST (SImode, gen_rtx_PLUS (Pmode, pat, GEN_INT (-4)));
+  emit_insn (gen_lo_sum (pic_offset_table_rtx, pic_offset_table_rtx, pat));
+
+  /* add5.pc $gp */
+  emit_insn (gen_add_pc (pic_offset_table_rtx, pic_offset_table_rtx));
+
+  /* Keep later insns from being scheduled into the $gp setup sequence.  */
+  emit_insn (gen_blockage ());
+}
+
 /* Function that may creates more instructions
    for large value on adjusting stack pointer.

@@ -933,79 +1374,70 @@ nds32_emit_stack_v3pop (rtx Rb,
    the adjustment value is not able to be fit in the 'addi' instruction.
    One solution is to move value into a register
    and then use 'add' instruction.
-   In practice, we use TA_REGNUM ($r15) to accomplish this purpose.
-   Also, we need to return zero for sp adjustment so that
-   proglogue/epilogue knows there is no need to create 'addi' instruction.  */
-static int
-nds32_force_addi_stack_int (int full_value)
+   In practice, we use TA_REGNUM ($r15) to accomplish this purpose.  */
+static void
+nds32_emit_adjust_frame (rtx to_reg, rtx from_reg, int adjust_value)
 {
-  int adjust_value;
-
   rtx tmp_reg;
-  rtx sp_adjust_insn;
+  rtx frame_adjust_insn;
+  rtx adjust_value_rtx = GEN_INT (adjust_value);

-  if (!satisfies_constraint_Is15 (GEN_INT (full_value)))
+  if (adjust_value == 0)
+    return;
+
+  if (!satisfies_constraint_Is15 (adjust_value_rtx))
     {
       /* The value is not able to fit in single addi instruction.
-         Create more instructions of moving value into a register
-         and then add stack pointer with it.  */
+	 Create more instructions of moving value into a register
+	 and then add stack pointer with it.  */

       /* $r15 is going to be temporary register to hold the value.  */
       tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);

       /* Create one more instruction to move value
-         into the temporary register.  */
-      emit_move_insn (tmp_reg, GEN_INT (full_value));
+	 into the temporary register.  */
+      emit_move_insn (tmp_reg, adjust_value_rtx);

       /* Create new 'add' rtx.  */
-      sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				   stack_pointer_rtx,
-				   tmp_reg);
+      frame_adjust_insn = gen_addsi3 (to_reg,
+				      from_reg,
+				      tmp_reg);
       /* Emit rtx into insn list and receive its transformed insn rtx.  */
-      sp_adjust_insn = emit_insn (sp_adjust_insn);
-
-      /* At prologue, we need to tell GCC that this is frame related insn,
-         so that we can consider this instruction to output debug information.
-         If full_value is NEGATIVE, it means this function
-         is invoked by expand_prologue.  */
-      if (full_value < 0)
-	{
-	  /* Because (tmp_reg <- full_value) may be split into two
-	     rtl patterns, we can not set its RTX_FRAME_RELATED_P.
-	     We need to construct another (sp <- sp + full_value)
-	     and then insert it into sp_adjust_insn's reg note to
-	     represent a frame related expression.
-	     GCC knows how to refer it and output debug information.  */
-
-	  rtx plus_rtx;
-	  rtx set_rtx;
+      frame_adjust_insn = emit_insn (frame_adjust_insn);

-	  plus_rtx = plus_constant (Pmode, stack_pointer_rtx, full_value);
-	  set_rtx = gen_rtx_SET (stack_pointer_rtx, plus_rtx);
-	  add_reg_note (sp_adjust_insn, REG_FRAME_RELATED_EXPR, set_rtx);
+      /* Because (tmp_reg <- adjust_value) may be split into two
+	 rtl patterns, we can not set its RTX_FRAME_RELATED_P.
+	 We need to construct another (to_reg <- from_reg + adjust_value)
+	 and then insert it into frame_adjust_insn's reg note to
+	 represent a frame related expression.
+	 GCC knows how to refer to it and output debug information.  */

-	  RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
-	}
+      rtx plus_rtx;
+      rtx set_rtx;

-      /* We have used alternative way to adjust stack pointer value.
-         Return zero so that prologue/epilogue
-         will not generate other instructions.  */
-      return 0;
+      plus_rtx = plus_constant (Pmode, from_reg, adjust_value);
+      set_rtx = gen_rtx_SET (to_reg, plus_rtx);
+      add_reg_note (frame_adjust_insn, REG_FRAME_RELATED_EXPR, set_rtx);
     }
   else
     {
-      /* The value is able to fit in addi instruction.
-         However, remember to make it to be positive value
-         because we want to return 'adjustment' result.  */
-      adjust_value = (full_value < 0) ? (-full_value) : (full_value);
-
-      return adjust_value;
+      /* The value fits in a single 'addi'; emit the adjustment directly.  */
+      frame_adjust_insn = gen_addsi3 (to_reg,
+				      from_reg,
+				      adjust_value_rtx);
+      /* Emit rtx into instructions list and receive INSN rtx form.  */
+      frame_adjust_insn = emit_insn (frame_adjust_insn);
     }
+
+    /* The insn rtx 'frame_adjust_insn' will change frame layout.
+       We need to use RTX_FRAME_RELATED_P so that GCC is able to
+       generate CFI (Call Frame Information) stuff.  */
+    RTX_FRAME_RELATED_P (frame_adjust_insn) = 1;
 }

 /* Return true if MODE/TYPE need double word alignment.  */
 static bool
-nds32_needs_double_word_align (machine_mode mode, const_tree type)
+nds32_needs_double_word_align (enum machine_mode mode, const_tree type)
 {
   unsigned int align;

@@ -1015,18 +1447,25 @@ nds32_needs_double_word_align (machine_mode mode, const_tree type)
   return (align > PARM_BOUNDARY);
 }

-/* Return true if FUNC is a naked function.  */
-static bool
+bool
 nds32_naked_function_p (tree func)
 {
-  tree t;
+  /* FOR BACKWARD COMPATIBILITY,
+     we need to support 'no_prologue' attribute as well.  */
+  tree t_naked;
+  tree t_no_prologue;

   if (TREE_CODE (func) != FUNCTION_DECL)
     abort ();

-  t = lookup_attribute ("naked", DECL_ATTRIBUTES (func));
+  /* We have to use lookup_attribute() to check attributes.
+     Because attr_naked_p and attr_no_prologue_p are set in
+     nds32_compute_stack_frame() and the function has not been
+     invoked yet.  */
+  t_naked       = lookup_attribute ("naked", DECL_ATTRIBUTES (func));
+  t_no_prologue = lookup_attribute ("no_prologue", DECL_ATTRIBUTES (func));

-  return (t != NULL_TREE);
+  return ((t_naked != NULL_TREE) || (t_no_prologue != NULL_TREE));
 }

 /* Function that check if 'X' is a valid address register.
@@ -1035,7 +1474,7 @@ nds32_naked_function_p (tree func)

    STRICT : true
      => We are in reload pass or after reload pass.
-        The register number should be strictly limited in general registers.
+	The register number should be strictly limited in general registers.

    STRICT : false
      => Before reload pass, we are free to use any register number.  */
@@ -1058,10 +1497,10 @@ nds32_address_register_rtx_p (rtx x, bool strict)
 /* Function that check if 'INDEX' is valid to be a index rtx for address.

    OUTER_MODE : Machine mode of outer address rtx.
-        INDEX : Check if this rtx is valid to be a index for address.
+	INDEX : Check if this rtx is valid to be a index for address.
        STRICT : If it is true, we are in reload pass or after reload pass.  */
 static bool
-nds32_legitimate_index_p (machine_mode outer_mode,
+nds32_legitimate_index_p (enum machine_mode outer_mode,
 			  rtx index,
 			  bool strict)
 {
@@ -1074,7 +1513,7 @@ nds32_legitimate_index_p (machine_mode outer_mode,
     case REG:
       regno = REGNO (index);
       /* If we are in reload pass or after reload pass,
-         we need to limit it to general register.  */
+	 we need to limit it to general register.  */
       if (strict)
 	return REGNO_OK_FOR_INDEX_P (regno);
       else
@@ -1082,45 +1521,73 @@ nds32_legitimate_index_p (machine_mode outer_mode,

     case CONST_INT:
       /* The alignment of the integer value is determined by 'outer_mode'.  */
-      if (GET_MODE_SIZE (outer_mode) == 1)
+      switch (GET_MODE_SIZE (outer_mode))
 	{
+	case 1:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is15 (index))
-	    return false;
+	  if (satisfies_constraint_Is15 (index))
+	    return true;
+	  break;

-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 2
-	  && NDS32_HALF_WORD_ALIGN_P (INTVAL (index)))
-	{
+	case 2:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is16 (index))
-	    return false;
+	  if (satisfies_constraint_Is16 (index))
+	    {
+	      /* If it is not under strictly aligned situation,
+		 we can return true without checking alignment.  */
+	      if (!cfun->machine->strict_aligned_p)
+		return true;
+	     /* Make sure address is half word alignment.  */
+	      else if (NDS32_HALF_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;

-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 4
-	  && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
-	{
+	case 4:
 	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is17 (index))
-	    return false;
+	  if (satisfies_constraint_Is17 (index))
+	    {
+	      if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+		{
+		  if (!satisfies_constraint_Is14 (index))
+		    return false;
+		}
+
+	      /* If it is not under strictly aligned situation,
+		 we can return true without checking alignment.  */
+	      if (!cfun->machine->strict_aligned_p)
+		return true;
+	     /* Make sure address is word alignment.  */
+	      else if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;

-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
-	}
-      if (GET_MODE_SIZE (outer_mode) == 8
-	  && NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
-	{
-	  /* Further check if the value is legal for the 'outer_mode'.  */
-	  if (!satisfies_constraint_Is17 (gen_int_mode (INTVAL (index) + 4,
-							SImode)))
-	    return false;
+	case 8:
+	  if (satisfies_constraint_Is17 (gen_int_mode (INTVAL (index) + 4,
+						       SImode)))
+	    {
+	      if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+		{
+		  if (!satisfies_constraint_Is14 (index))
+		    return false;
+		}
+
+	      /* If it is not under strictly aligned situation,
+		 we can return true without checking alignment.  */
+	      if (!cfun->machine->strict_aligned_p)
+		return true;
+	     /* Make sure address is word alignment.
+		Currently we do not have 64-bit load/store yet,
+		so we will use two 32-bit load/store instructions to do
+		memory access and they are single word alignment.  */
+	      else if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (index)))
+		return true;
+	    }
+	  break;

-	  /* Pass all test, the value is valid, return true.  */
-	  return true;
+	default:
+	  return false;
 	}

       return false;
@@ -1134,9 +1601,10 @@ nds32_legitimate_index_p (machine_mode outer_mode,
 	  int multiplier;
 	  multiplier = INTVAL (op1);

-	  /* We only allow (mult reg const_int_1)
-	     or (mult reg const_int_2) or (mult reg const_int_4).  */
-	  if (multiplier != 1 && multiplier != 2 && multiplier != 4)
+	  /* We only allow (mult reg const_int_1), (mult reg const_int_2),
+	     (mult reg const_int_4) or (mult reg const_int_8).  */
+	  if (multiplier != 1 && multiplier != 2
+	      && multiplier != 4 && multiplier != 8)
 	    return false;

 	  regno = REGNO (op0);
@@ -1161,8 +1629,9 @@ nds32_legitimate_index_p (machine_mode outer_mode,
 	  sv = INTVAL (op1);

 	  /* We only allow (ashift reg const_int_0)
-	     or (ashift reg const_int_1) or (ashift reg const_int_2).  */
-	  if (sv != 0 && sv != 1 && sv !=2)
+	     or (ashift reg const_int_1) or (ashift reg const_int_2) or
+	     (ashift reg const_int_3).  */
+	  if (sv != 0 && sv != 1 && sv !=2 && sv != 3)
 	    return false;

 	  regno = REGNO (op0);
@@ -1181,18 +1650,302 @@ nds32_legitimate_index_p (machine_mode outer_mode,
     }
 }

+static void
+nds32_insert_innermost_loop (void)
+{
+  struct loop *loop;
+  basic_block *bbs, bb;
+
+  compute_bb_for_insn ();
+  /* Initialize loop structure.  */
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+  /* Scan all inner most loops.  */
+  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+    {
+      bbs = get_loop_body (loop);
+      bb = *bbs;
+      free (bbs);
+
+      emit_insn_before (gen_innermost_loop_begin (),
+			BB_HEAD (bb));
+
+      /* Find the final basic block in the loop.  */
+      while (bb)
+	{
+	  if (bb->next_bb == NULL)
+	    break;
+
+	  if (bb->next_bb->loop_father != loop)
+	    break;
+
+	  bb = bb->next_bb;
+	}
+
+      emit_insn_before (gen_innermost_loop_end (),
+			BB_END (bb));
+    }
+
+  /* Release loop structure.  */
+  loop_optimizer_finalize ();
+}
+
+/* Insert isps for function with signature attribute.  */
+static void
+nds32_insert_isps (void)
+{
+  rtx_insn *insn;
+  unsigned first = 0;
+
+  if (!lookup_attribute ("signature", DECL_ATTRIBUTES (current_function_decl)))
+    return;
+
+  insn = get_insns ();
+  while (insn)
+    {
+      /* In order to protect the whole function, emit the first
+	 isps here rather than in the prologue.  */
+      if (!first && INSN_P (insn))
+	{
+	  emit_insn_before (gen_unspec_signature_begin (), insn);
+	  first = 1;
+	}
+
+      if (LABEL_P (insn) || CALL_P (insn) || any_condjump_p (insn)
+	  || (INSN_P (insn) && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+	      && (XINT (PATTERN (insn), 1) == UNSPEC_VOLATILE_SYSCALL
+		  || XINT (PATTERN (insn), 1) == UNSPEC_VOLATILE_TRAP
+		  || XINT (PATTERN (insn), 1) == UNSPEC_VOLATILE_TEQZ
+		  || XINT (PATTERN (insn), 1) == UNSPEC_VOLATILE_TNEZ)))
+	{
+	  emit_insn_after (gen_unspec_signature_begin (), insn);
+	}
+      insn = NEXT_INSN (insn);
+    }
+}
+
+static void
+nds32_register_pass (
+  rtl_opt_pass *(*make_pass_func) (gcc::context *),
+  enum pass_positioning_ops pass_pos,
+  const char *ref_pass_name)
+{
+  opt_pass *new_opt_pass = make_pass_func (g);
+
+  struct register_pass_info insert_pass =
+    {
+      new_opt_pass,	/* pass */
+      ref_pass_name,	/* reference_pass_name */
+      1,		/* ref_pass_instance_number */
+      pass_pos		/* po_op */
+    };
+
+  register_pass (&insert_pass);
+}
+
+static void
+nds32_register_pass (
+  gimple_opt_pass *(*make_pass_func) (gcc::context *),
+  enum pass_positioning_ops pass_pos,
+  const char *ref_pass_name)
+{
+  opt_pass *new_opt_pass = make_pass_func (g);
+
+  struct register_pass_info insert_pass =
+    {
+      new_opt_pass,	/* pass */
+      ref_pass_name,	/* reference_pass_name */
+      1,		/* ref_pass_instance_number */
+      pass_pos		/* po_op */
+    };
+
+  register_pass (&insert_pass);
+}
+
+/* This function is called from nds32_option_override ().
+   All new passes should be registered here.  */
+static void
+nds32_register_passes (void)
+{
+  nds32_register_pass (
+    make_pass_nds32_fp_as_gp,
+    PASS_POS_INSERT_BEFORE,
+    "ira");
+
+  nds32_register_pass (
+    make_pass_nds32_relax_opt,
+    PASS_POS_INSERT_AFTER,
+    "mach");
+
+  nds32_register_pass (
+    make_pass_nds32_load_store_opt,
+    PASS_POS_INSERT_AFTER,
+    "mach");
+
+  nds32_register_pass (
+    make_pass_nds32_soft_fp_arith_comm_opt,
+    PASS_POS_INSERT_BEFORE,
+    "mach");
+
+  nds32_register_pass (
+    make_pass_nds32_regrename_opt,
+    PASS_POS_INSERT_AFTER,
+    "mach");
+
+  nds32_register_pass (
+    make_pass_nds32_gcse_opt,
+    PASS_POS_INSERT_BEFORE,
+    "cprop_hardreg");
+
+  nds32_register_pass (
+    make_pass_nds32_cprop_acc_opt,
+    PASS_POS_INSERT_AFTER,
+    "cprop_hardreg");
+
+  nds32_register_pass (
+    make_pass_cprop_hardreg,
+    PASS_POS_INSERT_AFTER,
+    "mach");
+
+  nds32_register_pass (
+    make_pass_nds32_rename_lmwsmw_opt,
+    PASS_POS_INSERT_AFTER,
+    "jump2");
+
+  nds32_register_pass (
+    make_pass_nds32_gen_lmwsmw_opt,
+    PASS_POS_INSERT_BEFORE,
+    "peephole2");
+
+  nds32_register_pass (
+    make_pass_nds32_const_remater_opt,
+    PASS_POS_INSERT_BEFORE,
+    "ira");
+
+  nds32_register_pass (
+    make_pass_nds32_scalbn_transform_opt,
+    PASS_POS_INSERT_AFTER,
+    "optimized");
+
+  nds32_register_pass (
+    make_pass_nds32_sign_conversion_opt,
+    PASS_POS_INSERT_BEFORE,
+    "optimized");
+
+  nds32_register_pass (
+    make_pass_nds32_abi_compatible,
+    PASS_POS_INSERT_BEFORE,
+    "optimized");
+
+  nds32_register_pass (
+    nds32::scheduling::make_pass_nds32_print_stalls,
+    PASS_POS_INSERT_BEFORE,
+    "final");
+}
+
 /* ------------------------------------------------------------------------ */

-/* PART 3: Implement target hook stuff definitions.  */
+/* PART 4: Implement target hook stuff definitions.  */
+
+
+/* Computing the Length of an Insn.
+   Modifies the length assigned to instruction INSN.
+   LENGTH is the initially computed length of the insn.  */
+int
+nds32_adjust_insn_length (rtx_insn *insn, int length)
+{
+  int adjust_value = 0;
+  switch (recog_memoized (insn))
+    {
+    case CODE_FOR_call_internal:
+    case CODE_FOR_call_value_internal:
+      {
+	if (NDS32_ALIGN_P ())
+	  {
+	    rtx_insn *next_insn = next_active_insn (insn);
+	    if (next_insn && get_attr_length (next_insn) != 2)
+	      adjust_value += 2;
+	  }
+	/* We need to insert a nop after a noreturn function call so that
+	   a software breakpoint does not corrupt the next function.  */
+	if (find_reg_note (insn, REG_NORETURN, NULL_RTX))
+	  {
+	    if (TARGET_16_BIT)
+	      adjust_value += 2;
+	    else
+	      adjust_value += 4;
+	  }
+      }
+      return length + adjust_value;
+
+    default:
+      return length;
+    }
+}
+
+/* Storage Layout.  */
+
+/* This function will be called just before expansion into rtl.  */
+static void
+nds32_expand_to_rtl_hook (void)
+{
+  /* We need to set strictly aligned situation.
+     After that, the memory address checking in nds32_legitimate_address_p()
+     will take alignment offset into consideration so that it will not create
+     unaligned [base + offset] access during the rtl optimization.  */
+  cfun->machine->strict_aligned_p = 1;
+}
+
+
+/* Register Usage.  */
+
+static void
+nds32_conditional_register_usage (void)
+{
+  int regno;
+
+  if (TARGET_LINUX_ABI)
+    fixed_regs[TP_REGNUM] = 1;
+
+  if (TARGET_HARD_FLOAT)
+    {
+      for (regno = NDS32_FIRST_FPR_REGNUM;
+	   regno <= NDS32_LAST_FPR_REGNUM; regno++)
+	{
+	  fixed_regs[regno] = 0;
+	  if (regno < NDS32_FIRST_FPR_REGNUM + NDS32_MAX_FPR_REGS_FOR_ARGS)
+	    call_used_regs[regno] = 1;
+	  else if (regno >= NDS32_FIRST_FPR_REGNUM + 22
+		   && regno < NDS32_FIRST_FPR_REGNUM + 48)
+	    call_used_regs[regno] = 1;
+	  else
+	    call_used_regs[regno] = 0;
+	}
+    }
+  else if (TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+    {
+      for (regno = NDS32_FIRST_FPR_REGNUM;
+	   regno <= NDS32_LAST_FPR_REGNUM;
+	   regno++)
+	fixed_regs[regno] = 0;
+    }
+}
+

 /* Register Classes.  */

+static reg_class_t
+nds32_preferred_rename_class (reg_class_t rclass)
+{
+  return nds32_preferred_rename_class_impl (rclass);
+}
+
 static unsigned char
 nds32_class_max_nregs (reg_class_t rclass ATTRIBUTE_UNUSED,
-		       machine_mode mode)
+		       enum machine_mode mode)
 {
   /* Return the maximum number of consecutive registers
-     needed to represent "mode" in a register of "rclass".  */
+     needed to represent MODE in a register of RCLASS.  */
   return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
 }

@@ -1200,9 +1953,24 @@ static int
 nds32_register_priority (int hard_regno)
 {
   /* Encourage to use r0-r7 for LRA when optimize for size.  */
-  if (optimize_size && hard_regno < 8)
-    return 4;
-  return 3;
+  if (optimize_size)
+    {
+      if (hard_regno < 8)
+	return 4;
+      else if (hard_regno < 16)
+	return 3;
+      else if (hard_regno < 28)
+	return 2;
+      else
+	return 1;
+    }
+  else
+    {
+      if (hard_regno > 27)
+	return 1;
+      else
+	return 4;
+    }
 }


@@ -1222,8 +1990,8 @@ nds32_register_priority (int hard_regno)
        2. return address
        3. callee-saved registers
        4. <padding bytes> (we will calculte in nds32_compute_stack_frame()
-                           and save it at
-                           cfun->machine->callee_saved_area_padding_bytes)
+			   and save it at
+			   cfun->machine->callee_saved_area_padding_bytes)

      [Block B]
        1. local variables
@@ -1241,29 +2009,29 @@ nds32_register_priority (int hard_regno)
    By applying the basic frame/stack/argument pointers concept,
    the layout of a stack frame shoule be like this:

-                            |    |
+			    |    |
        old stack pointer ->  ----
-                            |    | \
-                            |    |   saved arguments for
-                            |    |   vararg functions
-                            |    | /
+			    |    | \
+			    |    |   saved arguments for
+			    |    |   vararg functions
+			    |    | /
       hard frame pointer ->   --
       & argument pointer    |    | \
-                            |    |   previous hardware frame pointer
-                            |    |   return address
-                            |    |   callee-saved registers
-                            |    | /
-           frame pointer ->   --
-                            |    | \
-                            |    |   local variables
-                            |    |   and incoming arguments
-                            |    | /
-                              --
-                            |    | \
-                            |    |   outgoing
-                            |    |   arguments
-                            |    | /
-           stack pointer ->  ----
+			    |    |   previous hardware frame pointer
+			    |    |   return address
+			    |    |   callee-saved registers
+			    |    | /
+	   frame pointer ->   --
+			    |    | \
+			    |    |   local variables
+			    |    |   and incoming arguments
+			    |    | /
+			      --
+			    |    | \
+			    |    |   outgoing
+			    |    |   arguments
+			    |    | /
+	   stack pointer ->  ----

   $SFP and $AP are used to represent frame pointer and arguments pointer,
   which will be both eliminated as hard frame pointer.  */
@@ -1291,7 +2059,7 @@ nds32_can_eliminate (const int from_reg, const int to_reg)
 /* -- Passing Arguments in Registers.  */

 static rtx
-nds32_function_arg (cumulative_args_t ca, machine_mode mode,
+nds32_function_arg (cumulative_args_t ca, enum machine_mode mode,
 		    const_tree type, bool named)
 {
   unsigned int regno;
@@ -1306,7 +2074,7 @@ nds32_function_arg (cumulative_args_t ca, machine_mode mode,
   if (!named)
     {
       /* If we are under hard float abi, we have arguments passed on the
-         stack and all situation can be handled by GCC itself.  */
+	 stack and all situation can be handled by GCC itself.  */
       if (TARGET_HARD_FLOAT)
 	return NULL_RTX;

@@ -1320,7 +2088,7 @@ nds32_function_arg (cumulative_args_t ca, machine_mode mode,
 	}

       /* No register available, return NULL_RTX.
-         The compiler will use stack to pass argument instead.  */
+	 The compiler will use stack to pass argument instead.  */
       return NULL_RTX;
     }

@@ -1329,14 +2097,34 @@ nds32_function_arg (cumulative_args_t ca, machine_mode mode,
      are different.  */
   if (TARGET_HARD_FLOAT)
     {
-      /* Currently we have not implemented hard float yet.  */
-      gcc_unreachable ();
+      /* For TARGET_HARD_FLOAT calling convention, we use GPR and FPR
+	 to pass argument.  We have to further check TYPE and MODE so
+	 that we can determine which kind of register we shall use.  */
+
+      /* Note that we need to pass argument entirely in registers under
+	 hard float abi.  */
+      if (GET_MODE_CLASS (mode) == MODE_FLOAT
+	  && NDS32_ARG_ENTIRE_IN_FPR_REG_P (cum->fpr_offset, mode, type))
+	{
+	  /* Pick up the next available FPR register number.  */
+	  regno
+	    = NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (cum->fpr_offset, mode, type);
+	  return gen_rtx_REG (mode, regno);
+	}
+      else if (GET_MODE_CLASS (mode) != MODE_FLOAT
+	       && NDS32_ARG_ENTIRE_IN_GPR_REG_P (cum->gpr_offset, mode, type))
+	{
+	  /* Pick up the next available GPR register number.  */
+	  regno
+	    = NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (cum->gpr_offset, mode, type);
+	  return gen_rtx_REG (mode, regno);
+	}
     }
   else
     {
       /* For !TARGET_HARD_FLOAT calling convention, we always use GPR to pass
-         argument.  Since we allow to pass argument partially in registers,
-         we can just return it if there are still registers available.  */
+	 argument.  Since we allow to pass argument partially in registers,
+	 we can just return it if there are still registers available.  */
       if (NDS32_ARG_PARTIAL_IN_GPR_REG_P (cum->gpr_offset, mode, type))
 	{
 	  /* Pick up the next available register number.  */
@@ -1353,7 +2141,7 @@ nds32_function_arg (cumulative_args_t ca, machine_mode mode,
 }

 static bool
-nds32_must_pass_in_stack (machine_mode mode, const_tree type)
+nds32_must_pass_in_stack (enum machine_mode mode, const_tree type)
 {
   /* Return true if a type must be passed in memory.
      If it is NOT using hard float abi, small aggregates can be
@@ -1366,7 +2154,7 @@ nds32_must_pass_in_stack (machine_mode mode, const_tree type)
 }

 static int
-nds32_arg_partial_bytes (cumulative_args_t ca, machine_mode mode,
+nds32_arg_partial_bytes (cumulative_args_t ca, enum machine_mode mode,
 			 tree type, bool named ATTRIBUTE_UNUSED)
 {
   /* Returns the number of bytes at the beginning of an argument that
@@ -1400,7 +2188,7 @@ nds32_arg_partial_bytes (cumulative_args_t ca, machine_mode mode,
   remaining_reg_count
     = NDS32_MAX_GPR_REGS_FOR_ARGS
       - (NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (cum->gpr_offset, mode, type)
-         - NDS32_GPR_ARG_FIRST_REGNUM);
+	 - NDS32_GPR_ARG_FIRST_REGNUM);

   /* Note that we have to return the nubmer of bytes, not registers count.  */
   if (needed_reg_count > remaining_reg_count)
@@ -1410,26 +2198,23 @@ nds32_arg_partial_bytes (cumulative_args_t ca, machine_mode mode,
 }

 static void
-nds32_function_arg_advance (cumulative_args_t ca, machine_mode mode,
+nds32_function_arg_advance (cumulative_args_t ca, enum machine_mode mode,
 			    const_tree type, bool named)
 {
-  machine_mode sub_mode;
   CUMULATIVE_ARGS *cum = get_cumulative_args (ca);

   if (named)
     {
       /* We need to further check TYPE and MODE so that we can determine
-         which kind of register we shall advance.  */
-      if (type && TREE_CODE (type) == COMPLEX_TYPE)
-	sub_mode = TYPE_MODE (TREE_TYPE (type));
-      else
-	sub_mode = mode;
+	 which kind of register we shall advance.  */

       /* Under hard float abi, we may advance FPR registers.  */
-      if (TARGET_HARD_FLOAT && GET_MODE_CLASS (sub_mode) == MODE_FLOAT)
+      if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
 	{
-	  /* Currently we have not implemented hard float yet.  */
-	  gcc_unreachable ();
+	  cum->fpr_offset
+	    = NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (cum->fpr_offset, mode, type)
+	      - NDS32_FPR_ARG_FIRST_REGNUM
+	      + NDS32_NEED_N_REGS_FOR_ARG (mode, type);
 	}
       else
 	{
@@ -1442,9 +2227,9 @@ nds32_function_arg_advance (cumulative_args_t ca, machine_mode mode,
   else
     {
       /* If this nameless argument is NOT under TARGET_HARD_FLOAT,
-         we can advance next register as well so that caller is
-         able to pass arguments in registers and callee must be
-         in charge of pushing all of them into stack.  */
+	 we can advance next register as well so that caller is
+	 able to pass arguments in registers and callee must be
+	 in charge of pushing all of them into stack.  */
       if (!TARGET_HARD_FLOAT)
 	{
 	  cum->gpr_offset
@@ -1456,13 +2241,23 @@ nds32_function_arg_advance (cumulative_args_t ca, machine_mode mode,
 }

 static unsigned int
-nds32_function_arg_boundary (machine_mode mode, const_tree type)
+nds32_function_arg_boundary (enum machine_mode mode, const_tree type)
 {
   return (nds32_needs_double_word_align (mode, type)
 	  ? NDS32_DOUBLE_WORD_ALIGNMENT
 	  : PARM_BOUNDARY);
 }

+bool
+nds32_vector_mode_supported_p (enum machine_mode mode)
+{
+  if (mode == V4QImode
+      || mode == V2HImode)
+    return NDS32_EXT_DSP_P ();
+
+  return false;
+}
+
 /* -- How Scalar Function Values Are Returned.  */

 static rtx
@@ -1470,28 +2265,68 @@ nds32_function_value (const_tree ret_type,
 		      const_tree fn_decl_or_type ATTRIBUTE_UNUSED,
 		      bool outgoing ATTRIBUTE_UNUSED)
 {
-  machine_mode mode;
+  enum machine_mode mode;
   int unsignedp;

   mode = TYPE_MODE (ret_type);
   unsignedp = TYPE_UNSIGNED (ret_type);

-  mode = promote_mode (ret_type, mode, &unsignedp);
+  if (INTEGRAL_TYPE_P (ret_type))
+    mode = promote_mode (ret_type, mode, &unsignedp);

-  return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
+  if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+    return gen_rtx_REG (mode, NDS32_FPR_RET_FIRST_REGNUM);
+  else
+    return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
 }

 static rtx
-nds32_libcall_value (machine_mode mode,
+nds32_libcall_value (enum machine_mode mode,
 		     const_rtx fun ATTRIBUTE_UNUSED)
 {
+  if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+    return gen_rtx_REG (mode, NDS32_FPR_RET_FIRST_REGNUM);
+
   return gen_rtx_REG (mode, NDS32_GPR_RET_FIRST_REGNUM);
 }

 static bool
 nds32_function_value_regno_p (const unsigned int regno)
 {
-  return (regno == NDS32_GPR_RET_FIRST_REGNUM);
+  if (regno == NDS32_GPR_RET_FIRST_REGNUM
+      || (TARGET_HARD_FLOAT
+	  && regno == NDS32_FPR_RET_FIRST_REGNUM))
+    return true;
+
+  return false;
+}
+
+/* -- How Large Values Are Returned.  */
+
+static bool
+nds32_return_in_memory (const_tree type,
+			const_tree fntype ATTRIBUTE_UNUSED)
+{
+  /* Note that int_size_in_bytes can return -1 if the size can vary
+     or is larger than an integer.  */
+  HOST_WIDE_INT size = int_size_in_bytes (type);
+
+  /* For COMPLEX_TYPE, if the total size cannot be held within two registers,
+     the return value is supposed to be in memory.  We need to be aware of
+     that the size may be -1.  */
+  if (TREE_CODE (type) == COMPLEX_TYPE)
+    if (size < 0 || size > 2 * UNITS_PER_WORD)
+      return true;
+
+  /* If it is BLKmode and the total size cannot be held within two registers,
+     the return value is supposed to be in memory.  We need to be aware of
+     that the size may be -1.  */
+  if (TYPE_MODE (type) == BLKmode)
+    if (size < 0 || size > 2 * UNITS_PER_WORD)
+      return true;
+
+  /* For other cases, having result in memory is unnecessary.  */
+  return false;
 }

 /* -- Function Entry and Exit.  */
@@ -1522,7 +2357,7 @@ nds32_asm_function_prologue (FILE *file,
   /* Use df_regs_ever_live_p() to detect if the register
      is ever used in the current function.  */
   fprintf (file, "\t! registers ever_live: ");
-  for (r = 0; r < 32; r++)
+  for (r = 0; r < 65; r++)
     {
       if (df_regs_ever_live_p (r))
 	fprintf (file, "%s, ", reg_names[r]);
@@ -1554,6 +2389,10 @@ nds32_asm_function_prologue (FILE *file,
       attrs = TREE_CHAIN (attrs);
     }
   fputc ('\n', file);
+
+  /* If there is any critical isr in this file, disable linker ifc.  */
+  if (nds32_isr_function_critical_p (current_function_decl))
+    fprintf (file, "\t.no_relax ifc\n");
 }

 /* After rtl prologue has been expanded, this function is used.  */
@@ -1561,56 +2400,12 @@ static void
 nds32_asm_function_end_prologue (FILE *file)
 {
   fprintf (file, "\t! END PROLOGUE\n");
-
-  /* If frame pointer is NOT needed and -mfp-as-gp is issued,
-     we can generate special directive: ".omit_fp_begin"
-     to guide linker doing fp-as-gp optimization.
-     However, for a naked function, which means
-     it should not have prologue/epilogue,
-     using fp-as-gp still requires saving $fp by push/pop behavior and
-     there is no benefit to use fp-as-gp on such small function.
-     So we need to make sure this function is NOT naked as well.  */
-  if (!frame_pointer_needed
-      && !cfun->machine->naked_p
-      && cfun->machine->fp_as_gp_p)
-    {
-      fprintf (file, "\t! ----------------------------------------\n");
-      fprintf (file, "\t! Guide linker to do "
-		     "link time optimization: fp-as-gp\n");
-      fprintf (file, "\t! We add one more instruction to "
-		     "initialize $fp near to $gp location.\n");
-      fprintf (file, "\t! If linker fails to use fp-as-gp transformation,\n");
-      fprintf (file, "\t! this extra instruction should be "
-		     "eliminated at link stage.\n");
-      fprintf (file, "\t.omit_fp_begin\n");
-      fprintf (file, "\tla\t$fp,_FP_BASE_\n");
-      fprintf (file, "\t! ----------------------------------------\n");
-    }
 }

 /* Before rtl epilogue has been expanded, this function is used.  */
 static void
 nds32_asm_function_begin_epilogue (FILE *file)
 {
-  /* If frame pointer is NOT needed and -mfp-as-gp is issued,
-     we can generate special directive: ".omit_fp_end"
-     to claim fp-as-gp optimization range.
-     However, for a naked function,
-     which means it should not have prologue/epilogue,
-     using fp-as-gp still requires saving $fp by push/pop behavior and
-     there is no benefit to use fp-as-gp on such small function.
-     So we need to make sure this function is NOT naked as well.  */
-  if (!frame_pointer_needed
-      && !cfun->machine->naked_p
-      && cfun->machine->fp_as_gp_p)
-    {
-      fprintf (file, "\t! ----------------------------------------\n");
-      fprintf (file, "\t! Claim the range of fp-as-gp "
-		     "link time optimization\n");
-      fprintf (file, "\t.omit_fp_end\n");
-      fprintf (file, "\t! ----------------------------------------\n");
-    }
-
   fprintf (file, "\t! BEGIN EPILOGUE\n");
 }

@@ -1638,41 +2433,104 @@ nds32_asm_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
 		? 1
 		: 0);

+  if (flag_pic)
+    {
+      fprintf (file, "\tsmw.adm\t$r31, [$r31], $r31, 4\n");
+      fprintf (file, "\tsethi\t%s, hi20(_GLOBAL_OFFSET_TABLE_-8)\n",
+		      reg_names [PIC_OFFSET_TABLE_REGNUM]);
+      fprintf (file, "\tori\t%s, %s, lo12(_GLOBAL_OFFSET_TABLE_-4)\n",
+		      reg_names [PIC_OFFSET_TABLE_REGNUM],
+		      reg_names [PIC_OFFSET_TABLE_REGNUM]);
+
+      if (TARGET_ISA_V3)
+	fprintf (file, "\tadd5.pc\t$gp\n");
+      else
+	{
+	  fprintf (file, "\tmfusr\t$ta, $pc\n");
+	  fprintf (file, "\tadd\t%s, $ta, %s\n",
+			  reg_names [PIC_OFFSET_TABLE_REGNUM],
+			  reg_names [PIC_OFFSET_TABLE_REGNUM]);
+	}
+    }
+
   if (delta != 0)
     {
       if (satisfies_constraint_Is15 (GEN_INT (delta)))
 	{
-	  fprintf (file, "\taddi\t$r%d, $r%d, %ld\n",
+	  fprintf (file, "\taddi\t$r%d, $r%d, " HOST_WIDE_INT_PRINT_DEC "\n",
 		   this_regno, this_regno, delta);
 	}
       else if (satisfies_constraint_Is20 (GEN_INT (delta)))
 	{
-	  fprintf (file, "\tmovi\t$ta, %ld\n", delta);
+	  fprintf (file, "\tmovi\t$ta, " HOST_WIDE_INT_PRINT_DEC "\n", delta);
 	  fprintf (file, "\tadd\t$r%d, $r%d, $ta\n", this_regno, this_regno);
 	}
       else
 	{
-	  fprintf (file, "\tsethi\t$ta, hi20(%ld)\n", delta);
-	  fprintf (file, "\tori\t$ta, $ta, lo12(%ld)\n", delta);
+	  fprintf (file,
+		   "\tsethi\t$ta, hi20(" HOST_WIDE_INT_PRINT_DEC ")\n",
+		   delta);
+	  fprintf (file,
+		   "\tori\t$ta, $ta, lo12(" HOST_WIDE_INT_PRINT_DEC ")\n",
+		   delta);
 	  fprintf (file, "\tadd\t$r%d, $r%d, $ta\n", this_regno, this_regno);
 	}
     }

-  fprintf (file, "\tb\t");
-  assemble_name (file, XSTR (XEXP (DECL_RTL (function), 0), 0));
-  fprintf (file, "\n");
+  if (flag_pic)
+    {
+      fprintf (file, "\tla\t$ta, ");
+      assemble_name (file, XSTR (XEXP (DECL_RTL (function), 0), 0));
+      fprintf (file, "@PLT\n");
+      fprintf (file, "\t! epilogue\n");
+      fprintf (file, "\tlwi.bi\t%s, [%s], 4\n",
+	       reg_names[PIC_OFFSET_TABLE_REGNUM],
+	       reg_names[STACK_POINTER_REGNUM]);
+      fprintf (file, "\tbr\t$ta\n");
+    }
+  else
+    {
+      fprintf (file, "\tb\t");
+      assemble_name (file, XSTR (XEXP (DECL_RTL (function), 0), 0));
+      fprintf (file, "\n");
+    }

   final_end_function ();
 }

 /* -- Permitting tail calls.  */

+/* Return true if it is ok to do sibling call optimization.  */
+static bool
+nds32_function_ok_for_sibcall (tree decl,
+			       tree exp ATTRIBUTE_UNUSED)
+{
+  /* The DECL is NULL if it is an indirect call.  */
+
+  /* 1. Do not apply sibling call if -mv3push is enabled,
+	because pop25 instruction also represents return behavior.
+     2. If this function is an isr function, do not apply sibling call
+	because it may perform the behavior that user does not expect.
+     3. If this function is a variadic function, do not apply sibling call
+	because the stack layout may be a mess.
+     4. We don't want to apply sibling call optimization for indirect
+	sibcall because the pop behavior in epilogue may pollute the
+	content of caller-saved register when the register is used for
+	indirect sibcall.
+     5. In pic mode, it may use some registers for PLT call.  */
+  return (!TARGET_V3PUSH
+	  && !nds32_isr_function_p (current_function_decl)
+	  && (cfun->machine->va_args_size == 0)
+	  && decl
+	  && !flag_pic);
+}
+
 /* Determine whether we need to enable warning for function return check.  */
 static bool
 nds32_warn_func_return (tree decl)
 {
-/* Naked functions are implemented entirely in assembly, including the
-   return sequence, so suppress warnings about this.  */
+  /* Naked functions are implemented entirely in assembly, including the
+     return sequence, so suppress warnings about this.  */
   return !nds32_naked_function_p (decl);
 }

@@ -1681,7 +2539,7 @@ nds32_warn_func_return (tree decl)

 static void
 nds32_setup_incoming_varargs (cumulative_args_t ca,
-			      machine_mode mode,
+			      enum machine_mode mode,
 			      tree type,
 			      int *pretend_args_size,
 			      int second_time ATTRIBUTE_UNUSED)
@@ -1795,7 +2653,7 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
     sorry ("a nested function is not supported for reduced registers");

   /* STEP 1: Copy trampoline code template into stack,
-             fill up essential data into stack.  */
+	     fill up essential data into stack.  */

   /* Extract nested function address rtx.  */
   fnaddr = XEXP (DECL_RTL (fndecl), 0);
@@ -1831,8 +2689,8 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
       && (tramp_align_in_bytes % nds32_cache_block_size) == 0)
     {
       /* Under this condition, the starting address of trampoline
-         must be aligned to the starting address of each cache block
-         and we do not have to worry about cross-boundary issue.  */
+	 must be aligned to the starting address of each cache block
+	 and we do not have to worry about cross-boundary issue.  */
       for (i = 0;
 	   i < (TRAMPOLINE_SIZE + nds32_cache_block_size - 1)
 	       / nds32_cache_block_size;
@@ -1847,10 +2705,10 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
   else if (TRAMPOLINE_SIZE > nds32_cache_block_size)
     {
       /* The starting address of trampoline code
-         may not be aligned to the cache block,
-         so the trampoline code may be across two cache block.
-         We need to sync the last element, which is 4-byte size,
-         of trampoline template.  */
+	 may not be aligned to the cache block,
+	 so the trampoline code may be across two cache block.
+	 We need to sync the last element, which is 4-byte size,
+	 of trampoline template.  */
       for (i = 0;
 	   i < (TRAMPOLINE_SIZE + nds32_cache_block_size - 1)
 	       / nds32_cache_block_size;
@@ -1871,16 +2729,16 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
   else
     {
       /* This is the simplest case.
-         Because TRAMPOLINE_SIZE is less than or
-         equal to nds32_cache_block_size,
-         we can just sync start address and
-         the last element of trampoline code.  */
+	 Because TRAMPOLINE_SIZE is less than or
+	 equal to nds32_cache_block_size,
+	 we can just sync start address and
+	 the last element of trampoline code.  */

       /* Sync starting address of tampoline code.  */
       emit_move_insn (tmp_reg, sync_cache_addr);
       emit_insn (isync_insn);
       /* Sync the last element, which is 4-byte size,
-         of trampoline template.  */
+	 of trampoline template.  */
       emit_move_insn (tmp_reg,
 		      plus_constant (Pmode, sync_cache_addr,
 				     TRAMPOLINE_SIZE - 4));
@@ -1896,11 +2754,52 @@ nds32_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
 /* Addressing Modes.  */

 static bool
-nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
+nds32_legitimate_address_p (enum machine_mode mode, rtx x, bool strict)
 {
+  if (TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+    {
+     /* When using floating-point instructions,
+	we don't allow 'addr' to be [symbol_ref], [CONST] pattern.  */
+      if ((mode == DFmode || mode == SFmode)
+	  && (GET_CODE (x) == SYMBOL_REF
+	  || GET_CODE(x) == CONST))
+	return false;
+
+      /* Allow [post_modify] addressing mode, when using FPU instructions.  */
+      if (GET_CODE (x) == POST_MODIFY
+	  && mode == DFmode)
+	{
+	  if (GET_CODE (XEXP (x, 0)) == REG
+	      && GET_CODE (XEXP (x, 1)) == PLUS)
+	    {
+	      rtx plus_op = XEXP (x, 1);
+	      rtx op0 = XEXP (plus_op, 0);
+	      rtx op1 = XEXP (plus_op, 1);
+
+	      if (nds32_address_register_rtx_p (op0, strict)
+		  && CONST_INT_P (op1))
+		{
+		  if (satisfies_constraint_Is14 (op1))
+		    {
+		      /* If it is not under strictly aligned situation,
+			 we can return true without checking alignment.  */
+		      if (!cfun->machine->strict_aligned_p)
+			return true;
+		     /* Make sure address is word alignment.
+			Currently we do not have 64-bit load/store yet,
+			so we will use two 32-bit load/store instructions to do
+			memory access and they are single word alignment.  */
+		      else if (NDS32_SINGLE_WORD_ALIGN_P (INTVAL (op1)))
+			return true;
+		    }
+		}
+	    }
+	}
+    }
+
   /* For (mem:DI addr) or (mem:DF addr) case,
      we only allow 'addr' to be [reg], [symbol_ref],
-                                [const], or [reg + const_int] pattern.  */
+				[const], or [reg + const_int] pattern.  */
   if (mode == DImode || mode == DFmode)
     {
       /* Allow [Reg + const_int] addressing mode.  */
@@ -1910,13 +2809,19 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
 	      && nds32_legitimate_index_p (mode, XEXP (x, 1), strict)
 	      && CONST_INT_P (XEXP (x, 1)))
 	    return true;
-
 	  else if (nds32_address_register_rtx_p (XEXP (x, 1), strict)
 		   && nds32_legitimate_index_p (mode, XEXP (x, 0), strict)
 		   && CONST_INT_P (XEXP (x, 0)))
 	    return true;
 	}

+      /* Allow [post_inc] and [post_dec] addressing mode.  */
+      if (GET_CODE (x) == POST_INC || GET_CODE (x) == POST_DEC)
+	{
+	  if (nds32_address_register_rtx_p (XEXP (x, 0), strict))
+	    return true;
+	}
+
       /* Now check [reg], [symbol_ref], and [const].  */
       if (GET_CODE (x) != REG
 	  && GET_CODE (x) != SYMBOL_REF
@@ -1933,18 +2838,26 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)

     case SYMBOL_REF:
       /* (mem (symbol_ref A)) => [symbol_ref] */
+
+      if (flag_pic || SYMBOL_REF_TLS_MODEL (x))
+	return false;
+
+      if (TARGET_ICT_MODEL_LARGE && nds32_indirect_call_referenced_p (x))
+	return false;
+
       /* If -mcmodel=large, the 'symbol_ref' is not a valid address
-         during or after LRA/reload phase.  */
+	 during or after LRA/reload phase.  */
       if (TARGET_CMODEL_LARGE
 	  && (reload_completed
 	      || reload_in_progress
 	      || lra_in_progress))
 	return false;
       /* If -mcmodel=medium and the symbol references to rodata section,
-         the 'symbol_ref' is not a valid address during or after
-         LRA/reload phase.  */
+	 the 'symbol_ref' is not a valid address during or after
+	 LRA/reload phase.  */
       if (TARGET_CMODEL_MEDIUM
-	  && NDS32_SYMBOL_REF_RODATA_P (x)
+	  && (NDS32_SYMBOL_REF_RODATA_P (x)
+	      || CONSTANT_POOL_ADDRESS_P (x))
 	  && (reload_completed
 	      || reload_in_progress
 	      || lra_in_progress))
@@ -1954,7 +2867,7 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)

     case CONST:
       /* (mem (const (...)))
-         => [ + const_addr ], where const_addr = symbol_ref + const_int */
+	 => [ + const_addr ], where const_addr = symbol_ref + const_int */
       if (GET_CODE (XEXP (x, 0)) == PLUS)
 	{
 	  rtx plus_op = XEXP (x, 0);
@@ -1965,17 +2878,21 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
 	  if (GET_CODE (op0) == SYMBOL_REF && CONST_INT_P (op1))
 	    {
 	      /* Now we see the [ + const_addr ] pattern, but we need
-	         some further checking.  */
+		 some further checking.  */
+
+	      if (flag_pic)
+		return false;
+
 	      /* If -mcmodel=large, the 'const_addr' is not a valid address
-	         during or after LRA/reload phase.  */
+		 during or after LRA/reload phase.  */
 	      if (TARGET_CMODEL_LARGE
 		  && (reload_completed
 		      || reload_in_progress
 		      || lra_in_progress))
 		return false;
 	      /* If -mcmodel=medium and the symbol references to rodata section,
-	         the 'const_addr' is not a valid address during or after
-	         LRA/reload phase.  */
+		 the 'const_addr' is not a valid address during or after
+		 LRA/reload phase.  */
 	      if (TARGET_CMODEL_MEDIUM
 		  && NDS32_SYMBOL_REF_RODATA_P (op0)
 		  && (reload_completed
@@ -1993,9 +2910,9 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)

     case POST_MODIFY:
       /* (mem (post_modify (reg) (plus (reg) (reg))))
-         => [Ra], Rb */
+	 => [Ra], Rb */
       /* (mem (post_modify (reg) (plus (reg) (const_int))))
-         => [Ra], const_int */
+	 => [Ra], const_int */
       if (GET_CODE (XEXP (x, 0)) == REG
 	  && GET_CODE (XEXP (x, 1)) == PLUS)
 	{
@@ -2018,7 +2935,7 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)
       /* (mem (post_inc reg)) => [Ra], 1/2/4 */
       /* (mem (post_dec reg)) => [Ra], -1/-2/-4 */
       /* The 1/2/4 or -1/-2/-4 have been displayed in nds32.md.
-         We only need to deal with register Ra.  */
+	 We only need to deal with register Ra.  */
       if (nds32_address_register_rtx_p (XEXP (x, 0), strict))
 	return true;
       else
@@ -2026,11 +2943,11 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)

     case PLUS:
       /* (mem (plus reg const_int))
-         => [Ra + imm] */
+	 => [Ra + imm] */
       /* (mem (plus reg reg))
-         => [Ra + Rb] */
+	 => [Ra + Rb] */
       /* (mem (plus (mult reg const_int) reg))
-         => [Ra + Rb << sv] */
+	 => [Ra + Rb << sv] */
       if (nds32_address_register_rtx_p (XEXP (x, 0), strict)
 	  && nds32_legitimate_index_p (mode, XEXP (x, 1), strict))
 	return true;
@@ -2042,39 +2959,292 @@ nds32_legitimate_address_p (machine_mode mode, rtx x, bool strict)

     case LO_SUM:
       /* (mem (lo_sum (reg) (symbol_ref))) */
-      /* (mem (lo_sum (reg) (const))) */
-      gcc_assert (REG_P (XEXP (x, 0)));
-      if (GET_CODE (XEXP (x, 1)) == SYMBOL_REF
-	  || GET_CODE (XEXP (x, 1)) == CONST)
-	return nds32_legitimate_address_p (mode, XEXP (x, 1), strict);
-      else
+      /* (mem (lo_sum (reg) (const (plus (symbol_ref) (reg))))) */
+      /* TLS case: (mem (lo_sum (reg) (const (unspec symbol_ref X)))) */
+      /* The LO_SUM is a valid address if and only if we would like to
+	 generate 32-bit full address memory access under any of the following
+	 circumstances:
+	   1. -mcmodel=large.
+	   2. -mcmodel=medium and the symbol_ref references to rodata.  */
+      {
+	rtx sym = NULL_RTX;
+
+	if (flag_pic)
+	  return false;
+
+	if (!REG_P (XEXP (x, 0)))
+	  return false;
+
+	if (GET_CODE (XEXP (x, 1)) == SYMBOL_REF)
+	  sym = XEXP (x, 1);
+	else if (GET_CODE (XEXP (x, 1)) == CONST)
+	  {
+	    rtx plus = XEXP(XEXP (x, 1), 0);
+	    if (GET_CODE (plus) == PLUS)
+	      sym = XEXP (plus, 0);
+	    else if (GET_CODE (plus) == UNSPEC)
+	      sym = XVECEXP (plus, 0, 0);
+	  }
+	else
+	  return false;
+
+	gcc_assert (GET_CODE (sym) == SYMBOL_REF);
+
+	if (TARGET_ICT_MODEL_LARGE
+	    && nds32_indirect_call_referenced_p (sym))
+	  return true;
+
+	if (TARGET_CMODEL_LARGE)
+	  return true;
+	else if (TARGET_CMODEL_MEDIUM
+		 && NDS32_SYMBOL_REF_RODATA_P (sym))
+	  return true;
+	else
+	  return false;
+      }
+
+    default:
+      return false;
+    }
+}
+
+static rtx
+nds32_legitimize_address (rtx x,
+			  rtx oldx ATTRIBUTE_UNUSED,
+			  enum machine_mode mode ATTRIBUTE_UNUSED)
+{
+  if (nds32_tls_referenced_p (x))
+    x = nds32_legitimize_tls_address (x);
+  else if (flag_pic && SYMBOLIC_CONST_P (x))
+    x = nds32_legitimize_pic_address (x);
+  else if (TARGET_ICT_MODEL_LARGE && nds32_indirect_call_referenced_p (x))
+    x = nds32_legitimize_ict_address (x);
+
+  return x;
+}
+
+static bool
+nds32_legitimate_constant_p (enum machine_mode mode, rtx x)
+{
+  switch (GET_CODE (x))
+    {
+    case CONST_DOUBLE:
+      if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+	  && (mode == DFmode || mode == SFmode))
+	return false;
+      break;
+    case CONST:
+      x = XEXP (x, 0);
+
+      if (GET_CODE (x) == PLUS)
+	{
+	  if (!CONST_INT_P (XEXP (x, 1)))
+	    return false;
+	  x = XEXP (x, 0);
+	}
+
+      if (GET_CODE (x) == UNSPEC)
+	{
+	  switch (XINT (x, 1))
+	    {
+	    case UNSPEC_GOT:
+	    case UNSPEC_GOTOFF:
+	    case UNSPEC_PLT:
+	    case UNSPEC_TLSGD:
+	    case UNSPEC_TLSLD:
+	    case UNSPEC_TLSIE:
+	    case UNSPEC_TLSLE:
+	    case UNSPEC_ICT:
+	      return false;
+	    default:
+	      return true;
+	    }
+	}
+      break;
+    case SYMBOL_REF:
+      /* TLS symbols need a call to resolve in
+	 precompute_register_parameters.  */
+      if (SYMBOL_REF_TLS_MODEL (x))
 	return false;
+      break;
+    default:
+      return true;
+    }
+
+  return true;
+}
+
+/* Reorganize the UNSPEC CONST and return its direct symbol.  */
+static rtx
+nds32_delegitimize_address (rtx x)
+{
+  x = delegitimize_mem_from_attrs (x);
+
+  if (GET_CODE(x) == CONST)
+    {
+      rtx inner = XEXP (x, 0);
+
+      /* Handle for GOTOFF.  */
+      if (GET_CODE (inner) == PLUS)
+	inner = XEXP (inner, 0);
+
+      if (GET_CODE (inner) == UNSPEC)
+	{
+	  switch (XINT (inner, 1))
+	    {
+	    case UNSPEC_GOTINIT:
+	    case UNSPEC_GOT:
+	    case UNSPEC_GOTOFF:
+	    case UNSPEC_PLT:
+	    case UNSPEC_TLSGD:
+	    case UNSPEC_TLSLD:
+	    case UNSPEC_TLSIE:
+	    case UNSPEC_TLSLE:
+	    case UNSPEC_ICT:
+	      x = XVECEXP (inner, 0, 0);
+	      break;
+	    default:
+	      break;
+	    }
+	}
+    }
+  return x;
+}
+
+static enum machine_mode
+nds32_vectorize_preferred_simd_mode (enum machine_mode mode)
+{
+  if (!NDS32_EXT_DSP_P ())
+    return word_mode;
+
+  switch (mode)
+    {
+    case QImode:
+      return V4QImode;
+    case HImode:
+      return V2HImode;
+    default:
+      return word_mode;
+    }
+}

+static bool
+nds32_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+{
+  switch (GET_CODE (x))
+    {
+    case CONST:
+      return !nds32_legitimate_constant_p (mode, x);
+    case SYMBOL_REF:
+      /* All symbols have to be accessed through gp-relative in PIC mode.  */
+      /* We don't want to force a symbol into the constant pool in the .text
+	 section, because we use gp-relative instructions to load it in the
+	 small or medium model.  */
+      if (flag_pic
+	  || SYMBOL_REF_TLS_MODEL (x)
+	  || TARGET_CMODEL_SMALL
+	  || TARGET_CMODEL_MEDIUM)
+	return true;
+      break;
+    case CONST_INT:
+    case CONST_DOUBLE:
+      if (flag_pic && (lra_in_progress || reload_completed))
+	return true;
+      break;
     default:
       return false;
     }
+  return false;
+}
+
+
+/* Condition Code Status.  */
+
+/* -- Representation of condition codes using registers.  */
+
+static void
+nds32_canonicalize_comparison (int *code,
+			       rtx *op0 ATTRIBUTE_UNUSED,
+			       rtx *op1,
+			       bool op0_preserve_value ATTRIBUTE_UNUSED)
+{
+  /* When the instruction combination pass tries to combine a comparison insn
+     with its previous insns, it also transforms the operator in order to
+     minimize its constant field.  For example, it tries to transform a
+     comparison insn from
+       (set (reg:SI 54)
+	   (ltu:SI (reg:SI 52)
+	       (const_int 10 [0xa])))
+     to
+       (set (reg:SI 54)
+	   (leu:SI (reg:SI 52)
+	       (const_int 9 [0x9])))
+
+     However, the nds32 target only provides instructions supporting the LTU
+     operation directly, and the implementation of the pattern "cbranchsi4"
+     only expands the LTU form.  In order to handle the non-LTU operations
+     generated from passes other than the RTL expansion pass, we have to
+     implement this hook to revert those changes.  Since we only expand the LTU
+     operator in the RTL expansion pass, we might only need to handle the LEU
+     case, unless we find other optimization passes perform more aggressive
+     transformations.  */
+
+  if (*code == LEU && CONST_INT_P (*op1))
+    {
+      *op1 = gen_int_mode (INTVAL (*op1) + 1, SImode);
+      *code = LTU;
+    }
 }


 /* Describing Relative Costs of Operations.  */

 static int
-nds32_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
+nds32_register_move_cost (enum machine_mode mode,
 			  reg_class_t from,
 			  reg_class_t to)
 {
-  if (from == HIGH_REGS || to == HIGH_REGS)
-    return 6;
+  /* In graywolf cpu, FPR to GPR is cheaper than other cpu.  */
+  if (TARGET_PIPELINE_GRAYWOLF)
+    {
+      if (GET_MODE_SIZE (mode) == 8)
+	{
+	  /* DPR to GPR.  */
+	  if (from == FP_REGS && to != FP_REGS)
+	    return 3;
+	  /* GPR to DPR.  */
+	  if (from != FP_REGS && to == FP_REGS)
+	    return 2;
+	}
+      else
+	{
+	  if ((from == FP_REGS && to != FP_REGS)
+	      || (from != FP_REGS && to == FP_REGS))
+	    return 2;
+	}
+    }

-  return 2;
+  if ((from == FP_REGS && to != FP_REGS)
+      || (from != FP_REGS && to == FP_REGS))
+    return 3;
+  else if (from == HIGH_REGS || to == HIGH_REGS)
+    return optimize_size ? 6 : 2;
+  else
+    return 2;
 }

 static int
-nds32_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
+nds32_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
 			reg_class_t rclass ATTRIBUTE_UNUSED,
 			bool in ATTRIBUTE_UNUSED)
 {
-  return 8;
+  /* Memory access only needs 1 cycle in our low-end processor,
+     however most memory access instructions are 4 bytes long,
+     so let it be 8 for optimize_size, otherwise 4.   */
+  if (nds32_memory_model_option == MEMORY_MODEL_FAST)
+    return optimize_size ? 8 : 4;
+  else
+    return 8;
 }

 /* This target hook describes the relative costs of RTL expressions.
@@ -2094,7 +3264,7 @@ nds32_rtx_costs (rtx x,

 static int
 nds32_address_cost (rtx address,
-		    machine_mode mode,
+		    enum machine_mode mode,
 		    addr_space_t as,
 		    bool speed)
 {
@@ -2102,6 +3272,55 @@ nds32_address_cost (rtx address,
 }


+/* Adjusting the Instruction Scheduler.  */
+
+static int
+nds32_sched_issue_rate (void)
+{
+  switch (nds32_cpu_option)
+  {
+  case CPU_GRAYWOLF:
+  case CPU_PANTHER:
+    return 2;
+
+  default:
+    return 1;
+  }
+}
+
+static int
+nds32_sched_adjust_cost (rtx_insn *insn ATTRIBUTE_UNUSED, rtx link, rtx_insn *dep ATTRIBUTE_UNUSED, int cost)
+{
+  if (REG_NOTE_KIND (link) == REG_DEP_ANTI
+      || REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+    {
+      if (nds32_sched_issue_rate () > 1)
+        return 1;
+
+      return 0;
+    }
+
+  return cost;
+}
+
+static void
+nds32_set_sched_flags (spec_info_t spec_info ATTRIBUTE_UNUSED)
+{
+  if (!flag_reorg_out_of_order
+      || nds32_sched_issue_rate () < 2)
+    return;
+
+  unsigned int *flags = &(current_sched_info->flags);
+
+  // Disallow the scheduler to find inc/mem pairs and break dependencies by
+  // duplicating address computations. Otherwise, after doing so, the
+  // scheduler will treat that the two insns can be issued at the same cycle
+  // so that the later insn isn't marked as TImode. It will result in a wrong
+  // behavior for out-of-order reorganization.
+  *flags |= DONT_BREAK_DEPENDENCIES;
+}
+
+
 /* Dividing the Output into Sections (Texts, Data, . . . ).  */

 /* If references to a symbol or a constant must be treated differently
@@ -2150,17 +3369,56 @@ nds32_asm_file_start (void)
 {
   default_file_start ();

+  if (flag_pic)
+    fprintf (asm_out_file, "\t.pic\n");
+
   /* Tell assembler which ABI we are using.  */
   fprintf (asm_out_file, "\t! ABI version\n");
-  fprintf (asm_out_file, "\t.abi_2\n");
+  if (TARGET_HARD_FLOAT)
+    fprintf (asm_out_file, "\t.abi_2fp_plus\n");
+  else
+    fprintf (asm_out_file, "\t.abi_2\n");

   /* Tell assembler that this asm code is generated by compiler.  */
   fprintf (asm_out_file, "\t! This asm file is generated by compiler\n");
   fprintf (asm_out_file, "\t.flag\tverbatim\n");
-  /* Give assembler the size of each vector for interrupt handler.  */
-  fprintf (asm_out_file, "\t! This vector size directive is required "
-			 "for checking inconsistency on interrupt handler\n");
-  fprintf (asm_out_file, "\t.vec_size\t%d\n", nds32_isr_vector_size);
+
+  /* We need to provide the size of each vector for interrupt handler
+     under elf toolchain.  */
+  if (!TARGET_LINUX_ABI)
+    {
+      fprintf (asm_out_file, "\t! This vector size directive is required "
+			     "for checking inconsistency on interrupt handler\n");
+      fprintf (asm_out_file, "\t.vec_size\t%d\n", nds32_isr_vector_size);
+    }
+
+  /* If user enables '-mforce-fp-as-gp' or compiles programs with -Os,
+     the compiler may produce 'la $fp,_FP_BASE_' instruction
+     at prologue for fp-as-gp optimization.
+     We should emit weak reference of _FP_BASE_ to avoid undefined reference
+     in case user does not pass '--relax' option to linker.  */
+  if (!TARGET_LINUX_ABI && (TARGET_FORCE_FP_AS_GP || optimize_size))
+    {
+      fprintf (asm_out_file, "\t! This weak reference is required to do "
+			     "fp-as-gp link time optimization\n");
+      fprintf (asm_out_file, "\t.weak\t_FP_BASE_\n");
+    }
+  /* If user enables '-mifc', we should emit relaxation directive
+     to tell linker that this file is allowed to do ifc optimization.  */
+  if (TARGET_IFC)
+    {
+      fprintf (asm_out_file, "\t! This relaxation directive is required "
+			     "to do ifc link time optimization\n");
+      fprintf (asm_out_file, "\t.relax\tifc\n");
+    }
+  /* If user enables '-mex9', we should emit relaxation directive
+     to tell linker that this file is allowed to do ex9 optimization.  */
+  if (TARGET_EX9)
+    {
+      fprintf (asm_out_file, "\t! This relaxation directive is required "
+			     "to do ex9 link time optimization\n");
+      fprintf (asm_out_file, "\t.relax\tex9\n");
+    }

   fprintf (asm_out_file, "\t! ------------------------------------\n");

@@ -2171,6 +3429,53 @@ nds32_asm_file_start (void)
   if (TARGET_ISA_V3M)
     fprintf (asm_out_file, "\t! ISA family\t\t: %s\n", "V3M");

+  switch (nds32_cpu_option)
+    {
+    case CPU_N6:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N6");
+      break;
+
+    case CPU_N7:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N7");
+      break;
+
+    case CPU_N8:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N8");
+      break;
+
+    case CPU_E8:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "E8");
+      break;
+
+    case CPU_N9:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N9");
+      break;
+
+    case CPU_N10:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N10");
+      break;
+
+    case CPU_GRAYWOLF:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "Graywolf");
+      break;
+
+    case CPU_N12:
+    case CPU_N13:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "N13");
+      break;
+
+    case CPU_PANTHER:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "Panther");
+      break;
+
+    case CPU_SIMPLE:
+      fprintf (asm_out_file, "\t! Pipeline model\t: %s\n", "SIMPLE");
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
   if (TARGET_CMODEL_SMALL)
     fprintf (asm_out_file, "\t! Code model\t\t: %s\n", "SMALL");
   if (TARGET_CMODEL_MEDIUM)
@@ -2181,6 +3486,15 @@ nds32_asm_file_start (void)
   fprintf (asm_out_file, "\t! Endian setting\t: %s\n",
 			 ((TARGET_BIG_ENDIAN) ? "big-endian"
 					      : "little-endian"));
+  fprintf (asm_out_file, "\t! Use SP floating-point instruction\t: %s\n",
+			 ((TARGET_FPU_SINGLE) ? "Yes"
+					      : "No"));
+  fprintf (asm_out_file, "\t! Use DP floating-point instruction\t: %s\n",
+			 ((TARGET_FPU_DOUBLE) ? "Yes"
+					      : "No"));
+  fprintf (asm_out_file, "\t! ABI version\t\t: %s\n",
+			 ((TARGET_HARD_FLOAT) ? "ABI2FP+"
+					      : "ABI2"));

   fprintf (asm_out_file, "\t! ------------------------------------\n");

@@ -2188,8 +3502,14 @@ nds32_asm_file_start (void)
 			 ((TARGET_CMOV) ? "Yes"
 					: "No"));
   fprintf (asm_out_file, "\t! Use performance extension\t: %s\n",
-			 ((TARGET_PERF_EXT) ? "Yes"
+			 ((TARGET_EXT_PERF) ? "Yes"
 					    : "No"));
+  fprintf (asm_out_file, "\t! Use performance extension 2\t: %s\n",
+			 ((TARGET_EXT_PERF2) ? "Yes"
+					     : "No"));
+  fprintf (asm_out_file, "\t! Use string extension\t\t: %s\n",
+			 ((TARGET_EXT_STRING) ? "Yes"
+					      : "No"));

   fprintf (asm_out_file, "\t! ------------------------------------\n");

@@ -2203,10 +3523,18 @@ nds32_asm_file_start (void)
 			 ((TARGET_REDUCED_REGS) ? "Yes"
 						: "No"));

+  fprintf (asm_out_file, "\t! Support unaligned access\t\t: %s\n",
+			 (flag_unaligned_access ? "Yes"
+						: "No"));
+
   fprintf (asm_out_file, "\t! ------------------------------------\n");

   if (optimize_size)
     fprintf (asm_out_file, "\t! Optimization level\t: -Os\n");
+  else if (optimize_fast)
+    fprintf (asm_out_file, "\t! Optimization level\t: -Ofast\n");
+  else if (optimize_debug)
+    fprintf (asm_out_file, "\t! Optimization level\t: -Og\n");
   else
     fprintf (asm_out_file, "\t! Optimization level\t: -O%d\n", optimize);

@@ -2225,9 +3553,65 @@ nds32_asm_file_end (void)
 {
   nds32_asm_file_end_for_isr ();

+  /* The NDS32 Linux stack is mapped non-executable by default, so add a
+     .note.GNU-stack section.  */
+  if (TARGET_LINUX_ABI)
+    file_end_indicate_exec_stack ();
+
   fprintf (asm_out_file, "\t! ------------------------------------\n");
 }

+static bool
+nds32_asm_output_addr_const_extra (FILE *file, rtx x)
+{
+  if (GET_CODE (x) == UNSPEC)
+    {
+      switch (XINT (x, 1))
+	{
+	case UNSPEC_GOTINIT:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  break;
+	case UNSPEC_GOTOFF:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@GOTOFF", file);
+	  break;
+	case UNSPEC_GOT:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@GOT", file);
+	  break;
+	case UNSPEC_PLT:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@PLT", file);
+	  break;
+	case UNSPEC_TLSGD:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@TLSDESC", file);
+	  break;
+	case UNSPEC_TLSLD:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@TLSDESC", file);
+	  break;
+	case UNSPEC_TLSIE:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@GOTTPOFF", file);
+	  break;
+	case UNSPEC_TLSLE:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@TPOFF", file);
+	  break;
+	case UNSPEC_ICT:
+	  output_addr_const (file, XVECEXP (x, 0, 0));
+	  fputs ("@ICT", file);
+	  break;
+	default:
+	  return false;
+	}
+      return true;
+    }
+  else
+    return false;
+}
+
 /* -- Output and Generation of Labels.  */

 static void
@@ -2243,7 +3627,15 @@ nds32_asm_globalize_label (FILE *stream, const char *name)
 static void
 nds32_print_operand (FILE *stream, rtx x, int code)
 {
-  int op_value;
+  HOST_WIDE_INT op_value = 0;
+  HOST_WIDE_INT one_position;
+  HOST_WIDE_INT zero_position;
+  bool pick_lsb_p = false;
+  bool pick_msb_p = false;
+  int regno;
+
+  if (CONST_INT_P (x))
+    op_value = INTVAL (x);

   switch (code)
     {
@@ -2251,29 +3643,82 @@ nds32_print_operand (FILE *stream, rtx x, int code)
       /* Do nothing special.  */
       break;

-    case 'V':
-      /* 'x' is supposed to be CONST_INT, get the value.  */
+    case 'b':
+      /* Use exact_log2() to search the 0-bit position.  */
       gcc_assert (CONST_INT_P (x));
-      op_value = INTVAL (x);
+      zero_position = exact_log2 (~UINTVAL (x) & GET_MODE_MASK (SImode));
+      gcc_assert (zero_position != -1);
+      fprintf (stream, HOST_WIDE_INT_PRINT_DEC, zero_position);

-      /* According to the Andes architecture,
-         the system/user register index range is 0 ~ 1023.
-         In order to avoid conflict between user-specified-integer value
-         and enum-specified-register value,
-         the 'enum nds32_intrinsic_registers' value
-         in nds32_intrinsic.h starts from 1024.  */
-      if (op_value < 1024 && op_value >= 0)
-	{
-	  /* If user gives integer value directly (0~1023),
-	     we just print out the value.  */
-	  fprintf (stream, "%d", op_value);
-	}
-      else if (op_value < 0
-	       || op_value >= ((int) ARRAY_SIZE (nds32_intrinsic_register_names)
-			       + 1024))
-	{
-	  /* The enum index value for array size is out of range.  */
-	  error ("intrinsic register index is out of range");
+      /* No need to handle following process, so return immediately.  */
+      return;
+
+    case 'e':
+      gcc_assert (MEM_P (x)
+		  && GET_CODE (XEXP (x, 0)) == PLUS
+		  && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT);
+      fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (XEXP (XEXP (x, 0), 1)));
+
+      /* No need to handle following process, so return immediately.  */
+      return;
+
+    case 'v':
+      gcc_assert (CONST_INT_P (x)
+		  && (INTVAL (x) == 0
+		      || INTVAL (x) == 8
+		      || INTVAL (x) == 16
+		      || INTVAL (x) == 24));
+      fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) / 8);
+
+      /* No need to handle following process, so return immediately.  */
+      return;
+
+    case 'B':
+      /* Use exact_log2() to search the 1-bit position.  */
+      gcc_assert (CONST_INT_P (x));
+      one_position = exact_log2 (UINTVAL (x) & GET_MODE_MASK (SImode));
+      gcc_assert (one_position != -1);
+      fprintf (stream, HOST_WIDE_INT_PRINT_DEC, one_position);
+
+      /* No need to handle following process, so return immediately.  */
+      return;
+
+    case 'L':
+      /* X is supposed to be REG rtx.  */
+      gcc_assert (REG_P (x));
+      /* Claim that we are going to pick LSB part of X.  */
+      pick_lsb_p = true;
+      break;
+
+    case 'H':
+      /* X is supposed to be REG rtx.  */
+      gcc_assert (REG_P (x));
+      /* Claim that we are going to pick MSB part of X.  */
+      pick_msb_p = true;
+      break;
+
+    case 'V':
+      /* X is supposed to be CONST_INT, get the value.  */
+      gcc_assert (CONST_INT_P (x));
+
+      /* According to the Andes architecture,
+	 the system/user register index range is 0 ~ 1023.
+	 In order to avoid conflict between user-specified-integer value
+	 and enum-specified-register value,
+	 the 'enum nds32_intrinsic_registers' value
+	 in nds32_intrinsic.h starts from 1024.  */
+      if (op_value < 1024 && op_value >= 0)
+	{
+	  /* If user gives integer value directly (0~1023),
+	     we just print out the value.  */
+	  fprintf (stream, HOST_WIDE_INT_PRINT_DEC, op_value);
+	}
+      else if (op_value < 0
+	       || op_value >= ((int) ARRAY_SIZE (nds32_intrinsic_register_names)
+			       + 1024))
+	{
+	  /* The enum index value for array size is out of range.  */
+	  error ("intrinsic register index is out of range");
 	}
       else
 	{
@@ -2286,6 +3731,45 @@ nds32_print_operand (FILE *stream, rtx x, int code)
       /* No need to handle following process, so return immediately.  */
       return;

+    case 'R': /* cctl valck  */
+      /* Note the cctl divide to 5 group and share the same name table.  */
+      if (op_value < 0 || op_value > 4)
+	error ("CCTL intrinsic function subtype out of range!");
+      fprintf (stream, "%s", nds32_cctl_names[op_value]);
+      return;
+
+    case 'T': /* cctl idxwbinv  */
+      /* Note the cctl divide to 5 group and share the same name table.  */
+      if (op_value < 0 || op_value > 4)
+	error ("CCTL intrinsic function subtype out of range!");
+      fprintf (stream, "%s", nds32_cctl_names[op_value + 4]);
+      return;
+
+    case 'U': /* cctl vawbinv  */
+      /* Note the cctl divide to 5 group and share the same name table.  */
+      if (op_value < 0 || op_value > 4)
+	error ("CCTL intrinsic function subtype out of range!");
+      fprintf (stream, "%s", nds32_cctl_names[op_value + 8]);
+      return;
+
+    case 'X': /* cctl idxread  */
+      /* Note the cctl divide to 5 group and share the same name table.  */
+      if (op_value < 0 || op_value > 4)
+	error ("CCTL intrinsic function subtype out of range!");
+      fprintf (stream, "%s", nds32_cctl_names[op_value + 12]);
+      return;
+
+    case 'W': /* cctl idxwrite  */
+      /* Note the cctl divide to 5 group and share the same name table.  */
+      if (op_value < 0 || op_value > 4)
+	error ("CCTL intrinsic function subtype out of range!");
+      fprintf (stream, "%s", nds32_cctl_names[op_value + 16]);
+      return;
+
+    case 'Z': /* dpref  */
+      fprintf (stream, "%s", nds32_dpref_names[op_value]);
+      return;
+
     default :
       /* Unknown flag.  */
       output_operand_lossage ("invalid operand output code");
@@ -2295,35 +3779,113 @@ nds32_print_operand (FILE *stream, rtx x, int code)
   switch (GET_CODE (x))
     {
     case LABEL_REF:
+      output_addr_const (stream, x);
+      break;
+
     case SYMBOL_REF:
       output_addr_const (stream, x);
+
+      if (!TARGET_LINUX_ABI && nds32_indirect_call_referenced_p (x))
+	fprintf (stream, "@ICT");
+
       break;

     case REG:
+      /* Print a Double-precision register name.  */
+      if ((GET_MODE (x) == DImode || GET_MODE (x) == DFmode)
+	  && NDS32_IS_FPR_REGNUM (REGNO (x)))
+	{
+	  regno = REGNO (x);
+	  if (!NDS32_FPR_REGNO_OK_FOR_DOUBLE (regno))
+	    {
+	      output_operand_lossage ("invalid operand for code '%c'", code);
+	      break;
+	    }
+	  fprintf (stream, "$fd%d", (regno - NDS32_FIRST_FPR_REGNUM) >> 1);
+	  break;
+	}
+
+      /* Print LSB or MSB part of register pair if the
+	 constraint modifier 'L' or 'H' is specified.  */
+      if ((GET_MODE (x) == DImode || GET_MODE (x) == DFmode)
+	  && NDS32_IS_GPR_REGNUM (REGNO (x)))
+	{
+	  if ((pick_lsb_p && WORDS_BIG_ENDIAN)
+	      || (pick_msb_p && !WORDS_BIG_ENDIAN))
+	    {
+	      /* If we would like to print out LSB register under big-endian,
+		 or print out MSB register under little-endian, we need to
+		 increase register number.  */
+	      regno = REGNO (x);
+	      regno++;
+	      fputs (reg_names[regno], stream);
+	      break;
+	    }
+	}
+
       /* Forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REGNO (x) == STATIC_CHAIN_REGNUM)
 	sorry ("a nested function is not supported for reduced registers");

       /* Normal cases, print out register name.  */
-      fputs (reg_names[REGNO (x)], stream);
+      regno = REGNO (x);
+      fputs (reg_names[regno], stream);
       break;

     case MEM:
       output_address (GET_MODE (x), XEXP (x, 0));
       break;

+    case HIGH:
+      if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE)
+	{
+	  const REAL_VALUE_TYPE *rv;
+	  long val;
+	  gcc_assert (GET_MODE (x) == SFmode);
+
+	  rv = CONST_DOUBLE_REAL_VALUE (XEXP (x, 0));
+	  REAL_VALUE_TO_TARGET_SINGLE (*rv, val);
+
+	  fprintf (stream, "hi20(0x%lx)", val);
+	}
+      else
+	gcc_unreachable ();
+      break;
+
+    case CONST_DOUBLE:
+      const REAL_VALUE_TYPE *rv;
+      long val;
+      gcc_assert (GET_MODE (x) == SFmode);
+
+      rv = CONST_DOUBLE_REAL_VALUE (x);
+      REAL_VALUE_TO_TARGET_SINGLE (*rv, val);
+
+      fprintf (stream, "0x%lx", val);
+      break;
+
     case CODE_LABEL:
     case CONST_INT:
     case CONST:
       output_addr_const (stream, x);
       break;

+    case CONST_VECTOR:
+      fprintf (stream, HOST_WIDE_INT_PRINT_HEX, const_vector_to_hwint (x));
+      break;
+
+    case LO_SUM:
+      /* This is a special case for inline assembly using memory address 'p'.
+	 The inline assembly code is expected to use pseudo instruction
+	 for the operand.  EX: la  */
+      output_addr_const (stream, XEXP(x, 1));
+      break;
+
     default:
       /* Generally, output_addr_const () is able to handle most cases.
-         We want to see what CODE could appear,
-         so we use gcc_unreachable() to stop it.  */
+	 We want to see what CODE could appear,
+	 so we use gcc_unreachable() to stop it.  */
       debug_rtx (x);
       gcc_unreachable ();
       break;
@@ -2331,7 +3893,9 @@ nds32_print_operand (FILE *stream, rtx x, int code)
 }

 static void
-nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
+nds32_print_operand_address (FILE *stream,
+			     machine_mode mode ATTRIBUTE_UNUSED,
+			     rtx x)
 {
   rtx op0, op1;

@@ -2346,15 +3910,25 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
       fputs ("]", stream);
       break;

+    case LO_SUM:
+      /* This is a special case for inline assembly using memory operand 'm'.
+	 The inline assembly code is expected to use a pseudo instruction
+	 for the operand.  EX: [ls].[bhw]  */
+      fputs ("[ + ", stream);
+      op1 = XEXP (x, 1);
+      output_addr_const (stream, op1);
+      fputs ("]", stream);
+      break;
+
     case REG:
       /* Forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REGNO (x) == STATIC_CHAIN_REGNUM)
 	sorry ("a nested function is not supported for reduced registers");

       /* [Ra] */
-      fprintf (stream, "[%s]", reg_names[REGNO (x)]);
+      fprintf (stream, "[%s + 0]", reg_names[REGNO (x)]);
       break;

     case PLUS:
@@ -2362,13 +3936,13 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
       op1 = XEXP (x, 1);

       /* Checking op0, forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REG_P (op0)
 	  && REGNO (op0) == STATIC_CHAIN_REGNUM)
 	sorry ("a nested function is not supported for reduced registers");
       /* Checking op1, forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REG_P (op1)
 	  && REGNO (op1) == STATIC_CHAIN_REGNUM)
@@ -2377,8 +3951,8 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
       if (REG_P (op0) && CONST_INT_P (op1))
 	{
 	  /* [Ra + imm] */
-	  fprintf (stream, "[%s + (%d)]",
-			   reg_names[REGNO (op0)], (int)INTVAL (op1));
+	  fprintf (stream, "[%s + (" HOST_WIDE_INT_PRINT_DEC ")]",
+			   reg_names[REGNO (op0)], INTVAL (op1));
 	}
       else if (REG_P (op0) && REG_P (op1))
 	{
@@ -2391,8 +3965,8 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
 	  /* [Ra + Rb << sv]
 	     From observation, the pattern looks like:
 	     (plus:SI (mult:SI (reg:SI 58)
-	                       (const_int 4 [0x4]))
-	              (reg/f:SI 57)) */
+			       (const_int 4 [0x4]))
+		      (reg/f:SI 57)) */
 	  int sv;

 	  /* We need to set sv to output shift value.  */
@@ -2402,6 +3976,8 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
 	    sv = 1;
 	  else if (INTVAL (XEXP (op0, 1)) == 4)
 	    sv = 2;
+	  else if (INTVAL (XEXP (op0, 1)) == 8)
+	    sv = 3;
 	  else
 	    gcc_unreachable ();

@@ -2410,6 +3986,20 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
 			   reg_names[REGNO (XEXP (op0, 0))],
 			   sv);
 	}
+      else if (GET_CODE (op0) == ASHIFT && REG_P (op1))
+	{
+	  /* [Ra + Rb << sv]
+	     In normal, ASHIFT can be converted to MULT like above case.
+	     But when the address rtx does not go through canonicalize_address
+	     defined in fwprop, we'll need this case.  */
+	  int sv = INTVAL (XEXP (op0, 1));
+	  gcc_assert (sv <= 3 && sv >=0);
+
+	  fprintf (stream, "[%s + %s << %d]",
+		   reg_names[REGNO (op1)],
+		   reg_names[REGNO (XEXP (op0, 0))],
+		   sv);
+	}
       else
 	{
 	  /* The control flow is not supposed to be here.  */
@@ -2421,20 +4011,20 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)

     case POST_MODIFY:
       /* (post_modify (regA) (plus (regA) (regB)))
-         (post_modify (regA) (plus (regA) (const_int)))
-         We would like to extract
-         regA and regB (or const_int) from plus rtx.  */
+	 (post_modify (regA) (plus (regA) (const_int)))
+	 We would like to extract
+	 regA and regB (or const_int) from plus rtx.  */
       op0 = XEXP (XEXP (x, 1), 0);
       op1 = XEXP (XEXP (x, 1), 1);

       /* Checking op0, forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REG_P (op0)
 	  && REGNO (op0) == STATIC_CHAIN_REGNUM)
 	sorry ("a nested function is not supported for reduced registers");
       /* Checking op1, forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REG_P (op1)
 	  && REGNO (op1) == STATIC_CHAIN_REGNUM)
@@ -2449,8 +4039,8 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
       else if (REG_P (op0) && CONST_INT_P (op1))
 	{
 	  /* [Ra], imm */
-	  fprintf (stream, "[%s], %d",
-			   reg_names[REGNO (op0)], (int)INTVAL (op1));
+	  fprintf (stream, "[%s], " HOST_WIDE_INT_PRINT_DEC,
+			   reg_names[REGNO (op0)], INTVAL (op1));
 	}
       else
 	{
@@ -2466,7 +4056,7 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)
       op0 = XEXP (x, 0);

       /* Checking op0, forbid using static chain register ($r16)
-         on reduced-set registers configuration.  */
+	 on reduced-set registers configuration.  */
       if (TARGET_REDUCED_REGS
 	  && REG_P (op0)
 	  && REGNO (op0) == STATIC_CHAIN_REGNUM)
@@ -2490,14 +4080,92 @@ nds32_print_operand_address (FILE *stream, machine_mode /*mode*/, rtx x)

     default :
       /* Generally, output_addr_const () is able to handle most cases.
-         We want to see what CODE could appear,
-         so we use gcc_unreachable() to stop it.  */
+	 We want to see what CODE could appear,
+	 so we use gcc_unreachable() to stop it.  */
       debug_rtx (x);
       gcc_unreachable ();
       break;
     }
 }

+/* -- Assembler Commands for Exception Regions.  */
+
+static rtx
+nds32_dwarf_register_span (rtx reg)
+{
+  rtx dwarf_high, dwarf_low;
+  rtx dwarf_single;
+  enum machine_mode mode;
+  int regno;
+
+  mode = GET_MODE (reg);
+  regno = REGNO (reg);
+
+  /* We need to adjust dwarf register information for floating-point registers
+     rather than using default register number mapping.  */
+  if (regno >= NDS32_FIRST_FPR_REGNUM
+      && regno <= NDS32_LAST_FPR_REGNUM)
+    {
+      if (mode == DFmode || mode == SCmode)
+	{
+	  /* By default, GCC maps increasing register numbers to increasing
+	     memory locations, but paired FPRs in NDS32 target are always
+	     big-endian, i.e.:
+
+	       fd0 :  fs0   fs1
+		     (MSB) (LSB)
+
+	     We must return parallel rtx to represent such layout.  */
+	  dwarf_high = gen_rtx_REG (word_mode, regno);
+	  dwarf_low = gen_rtx_REG (word_mode, regno + 1);
+	  return gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (2, dwarf_low, dwarf_high));
+	}
+      else if (mode == DCmode)
+	{
+	  rtx dwarf_high_re = gen_rtx_REG (word_mode, regno);
+	  rtx dwarf_low_re = gen_rtx_REG (word_mode, regno + 1);
+	  rtx dwarf_high_im = gen_rtx_REG (word_mode, regno);
+	  rtx dwarf_low_im = gen_rtx_REG (word_mode, regno + 1);
+	  return gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, dwarf_low_re, dwarf_high_re,
+						 dwarf_high_im, dwarf_low_im));
+	}
+      else if (mode == SFmode || mode == SImode)
+	{
+	  /* Create new dwarf information with adjusted register number.  */
+	  dwarf_single = gen_rtx_REG (word_mode, regno);
+	  return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, dwarf_single));
+	}
+      else
+	{
+	  /* We should not be here.  */
+	  gcc_unreachable ();
+	}
+    }
+
+  return NULL_RTX;
+}
+
+/* Map internal GCC register number REGNO to its DWARF2 register number.  */
+
+unsigned int
+nds32_dbx_register_number (unsigned int regno)
+{
+  /* The nds32 port in GDB maintains a mapping between DWARF register
+     numbers and displayed register names.  For backward compatibility
+     with previous toolchains, GDB still keeps four registers
+     (d0.l, d0.h, d1.l, and d1.h) between the GPRs and the FPRs, while the
+     compiler does not count those four in its register number table.
+     So FPR numbers are shifted up by 4 when creating DWARF information;
+     every other register number is returned unchanged.  Hopefully this
+     workaround can be discarded in the future.  */
+  if (NDS32_IS_FPR_REGNUM (regno))
+    return regno + 4;
+
+  return regno;
+}
+

 /* Defining target-specific uses of __attribute__.  */

@@ -2526,6 +4194,27 @@ nds32_merge_decl_attributes (tree olddecl, tree newdecl)
 static void
 nds32_insert_attributes (tree decl, tree *attributes)
 {
+  /* An "indirect_call" function attribute implies "noinline" and "noclone"
+     for the ELF toolchain, to support the ROM patch mechanism.  */
+  if (TREE_CODE (decl) == FUNCTION_DECL
+      && lookup_attribute ("indirect_call", *attributes) != NULL)
+    {
+      tree new_attrs = *attributes;
+
+      if (TARGET_LINUX_ABI)
+	error("cannot use indirect_call attribute under linux toolchain");
+
+      if (lookup_attribute ("noinline", new_attrs) == NULL)
+	new_attrs = tree_cons (get_identifier ("noinline"), NULL, new_attrs);
+      if (lookup_attribute ("noclone", new_attrs) == NULL)
+	new_attrs = tree_cons (get_identifier ("noclone"), NULL, new_attrs);
+
+      if (!TREE_PUBLIC (decl))
+	error("indirect_call attribute can't apply for static function");
+
+      *attributes = new_attrs;
+    }
+
   /* For function declaration, we need to check isr-specific attributes:
        1. Call nds32_check_isr_attrs_conflict() to check any conflict.
        2. Check valid integer value for interrupt/exception.
@@ -2543,14 +4232,46 @@ nds32_insert_attributes (tree decl, tree *attributes)
       nds32_check_isr_attrs_conflict (decl, func_attrs);

       /* Now we are starting to check valid id value
-         for interrupt/exception/reset.
-         Note that we ONLY check its validity here.
-         To construct isr vector information, it is still performed
-         by nds32_construct_isr_vectors_information().  */
+	 for interrupt/exception/reset.
+	 Note that we ONLY check its validity here.
+	 To construct isr vector information, it is still performed
+	 by nds32_construct_isr_vectors_information().  */
       intr  = lookup_attribute ("interrupt", func_attrs);
       excp  = lookup_attribute ("exception", func_attrs);
       reset = lookup_attribute ("reset", func_attrs);

+      /* The following code may use attribute arguments.  If there is no
+	 argument in the source code, it would cause a segmentation fault.
+	 Therefore, return directly and report the error message later.  */
+      if ((intr && TREE_VALUE (intr) == NULL)
+	  || (excp && TREE_VALUE (excp) == NULL)
+	  || (reset && TREE_VALUE (reset) == NULL))
+	return;
+
+      /* ------------------------------------------------------------- */
+      /* FIXME:
+	 FOR BACKWARD COMPATIBILITY, we need to support following patterns:
+
+	     __attribute__((interrupt("XXX;YYY;id=ZZZ")))
+	     __attribute__((exception("XXX;YYY;id=ZZZ")))
+	     __attribute__((reset("vectors=XXX;nmi_func=YYY;warm_func=ZZZ")))
+
+	 If interrupt/exception/reset appears and its argument is a
+	 STRING_CST, we will use other functions to parse string in the
+	 nds32_construct_isr_vectors_information() and then set necessary
+	 isr information in the nds32_isr_vectors[] array.  Here we can
+	 just return immediately to avoid new-syntax checking.  */
+      if (intr != NULL_TREE
+	  && TREE_CODE (TREE_VALUE (TREE_VALUE (intr))) == STRING_CST)
+	return;
+      if (excp != NULL_TREE
+	  && TREE_CODE (TREE_VALUE (TREE_VALUE (excp))) == STRING_CST)
+	return;
+      if (reset != NULL_TREE
+	  && TREE_CODE (TREE_VALUE (TREE_VALUE (reset))) == STRING_CST)
+	return;
+      /* ------------------------------------------------------------- */
+
       if (intr || excp)
 	{
 	  /* Deal with interrupt/exception.  */
@@ -2576,8 +4297,8 @@ nds32_insert_attributes (tree decl, tree *attributes)
 	      id = TREE_VALUE (id_list);
 	      /* Issue error if it is not a valid integer value.  */
 	      if (TREE_CODE (id) != INTEGER_CST
-		  || wi::ltu_p (id, lower_bound)
-		  || wi::gtu_p (id, upper_bound))
+		  || TREE_INT_CST_LOW (id) < lower_bound
+		  || TREE_INT_CST_LOW (id) > upper_bound)
 		error ("invalid id value for interrupt/exception attribute");

 	      /* Advance to next id.  */
@@ -2604,8 +4325,8 @@ nds32_insert_attributes (tree decl, tree *attributes)

 	  /* 3. Check valid integer value for reset.  */
 	  if (TREE_CODE (id) != INTEGER_CST
-	      || wi::ltu_p (id, lower_bound)
-	      || wi::gtu_p (id, upper_bound))
+	      || TREE_INT_CST_LOW (id) < lower_bound
+	      || TREE_INT_CST_LOW (id) > upper_bound)
 	    error ("invalid id value for reset attribute");

 	  /* 4. Check valid function for nmi/warm.  */
@@ -2667,17 +4388,40 @@ nds32_option_override (void)
     {
       /* Under V2 ISA, we need to strictly disable TARGET_V3PUSH.  */
       target_flags &= ~MASK_V3PUSH;
+      /* Under V2 ISA, we need to strictly disable TARGET_IFC.  */
+      target_flags &= ~MASK_IFC;
+      /* Under V2 ISA, we need to strictly disable TARGET_EX9.  */
+      target_flags &= ~MASK_EX9;
+      /* If this is ARCH_V2J, we need to enable TARGET_REDUCED_REGS.  */
+      if (nds32_arch_option == ARCH_V2J)
+	target_flags |= MASK_REDUCED_REGS;
     }
   if (TARGET_ISA_V3)
     {
-      /* Under V3 ISA, currently nothing should be strictly set.  */
+      /* If this is ARCH_V3J, we need to enable TARGET_REDUCED_REGS.  */
+      if (nds32_arch_option == ARCH_V3J)
+	target_flags |= MASK_REDUCED_REGS;
     }
   if (TARGET_ISA_V3M)
     {
       /* Under V3M ISA, we need to strictly enable TARGET_REDUCED_REGS.  */
       target_flags |= MASK_REDUCED_REGS;
-      /* Under V3M ISA, we need to strictly disable TARGET_PERF_EXT.  */
-      target_flags &= ~MASK_PERF_EXT;
+      if (nds32_arch_option != ARCH_V3M_PLUS)
+	{
+	  /* Under V3M ISA, we need to strictly disable TARGET_IFC.  */
+	  target_flags &= ~MASK_IFC;
+	  /* Under V3M ISA, we need to strictly disable TARGET_EX9.  */
+	  target_flags &= ~MASK_EX9;
+	}
+      /* Under V3M ISA, we need to strictly disable TARGET_EXT_PERF.  */
+      target_flags &= ~MASK_EXT_PERF;
+      /* Under V3M ISA, we need to strictly disable TARGET_EXT_PERF2.  */
+      target_flags &= ~MASK_EXT_PERF2;
+      /* Under V3M ISA, we need to strictly disable TARGET_EXT_STRING.  */
+      target_flags &= ~MASK_EXT_STRING;
+
+      if (flag_pic)
+	error ("not support -fpic option for v3m toolchain");
     }

   /* See if we are using reduced-set registers:
@@ -2688,48 +4432,568 @@ nds32_option_override (void)
       int r;

       /* Prevent register allocator from
-         choosing it as doing register allocation.  */
+	 choosing it as doing register allocation.  */
       for (r = 11; r <= 14; r++)
 	fixed_regs[r] = call_used_regs[r] = 1;
       for (r = 16; r <= 27; r++)
 	fixed_regs[r] = call_used_regs[r] = 1;
     }

+  /* See if user explicitly would like to use fp-as-gp optimization.
+     If so, we must prevent $fp from being allocated
+     during register allocation.  */
+  if (TARGET_FORCE_FP_AS_GP)
+    fixed_regs[FP_REGNUM] = call_used_regs[FP_REGNUM] = 1;
+
   if (!TARGET_16_BIT)
     {
       /* Under no 16 bit ISA, we need to strictly disable TARGET_V3PUSH.  */
       target_flags &= ~MASK_V3PUSH;
     }

-  /* Currently, we don't support PIC code generation yet.  */
-  if (flag_pic)
-    sorry ("not support -fpic");
+  if (TARGET_HARD_FLOAT && !(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+    {
+      if (nds32_arch_option == ARCH_V3S || nds32_arch_option == ARCH_V3F)
+	error ("Disable FPU ISA, "
+	       "the ABI option must be enable '-mfloat-abi=soft'");
+      else
+	error ("'-mabi=2fp+' option only support when FPU available, "
+	       "must be enable '-mext-fpu-sp' or '-mext-fpu-dp'");
+    }
+
+  nds32_register_passes ();
+
+  nds32_init_rtx_costs ();
 }


 /* Miscellaneous Parameters.  */

+static rtx_insn *
+nds32_md_asm_adjust (vec<rtx> &outputs ATTRIBUTE_UNUSED,
+		     vec<rtx> &inputs ATTRIBUTE_UNUSED,
+		     vec<const char *> &constraints ATTRIBUTE_UNUSED,
+		     vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
+{
+  clobbers.safe_push (gen_rtx_REG (SImode, TA_REGNUM));
+  SET_HARD_REG_BIT (clobbered_regs, TA_REGNUM);
+  return NULL;
+}
+/* Insert END_LABEL for the hwloop LOOP_ID; return true if it was placed.  */
+static bool
+nds32_hwloop_insert_end_label (rtx loop_id, rtx end_label)
+{
+  rtx_insn *insn = NULL;
+  basic_block bb;
+  rtx cfg_id;
+  rtx_insn *last_insn;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (NOTE_P (insn))
+	    continue;
+
+	  if (recog_memoized (insn) == CODE_FOR_hwloop_cfg
+	      && INSN_P (insn))
+	    {
+	      cfg_id = XVECEXP (XVECEXP (PATTERN (insn), 0, 5), 0, 0);
+	      if (cfg_id == loop_id)
+		{
+		  for (last_insn = PREV_INSN (insn); last_insn != BB_HEAD (bb);
+		       last_insn = PREV_INSN (last_insn))
+		    {
+		      if (NONDEBUG_INSN_P (last_insn))
+			{
+			  emit_label_before (end_label, last_insn);
+			  if (TARGET_IFC)
+			    {
+			      /* Do not apply ifcall to last_insn.  */
+			      emit_insn_before (gen_no_ifc_begin (), last_insn);
+			      emit_insn_after (gen_no_ifc_end (), last_insn);
+			    }
+			  if (TARGET_EX9)
+			    {
+			      /* Do not apply ex9 to last_insn.  */
+			      emit_insn_before (gen_no_ex9_begin (), last_insn);
+			      emit_insn_after (gen_no_ex9_end (), last_insn);
+			    }
+			  /* Mark the last insn so the relax pass can identify it.  */
+			  emit_insn_after (gen_hwloop_last_insn (), last_insn);
+			  return true;
+			}
+		    }
+
+		  if (NOTE_INSN_BASIC_BLOCK_P (last_insn))
+		    {
+		      rtx_insn *nop = emit_insn_before (gen_unspec_nop (),
+							last_insn);
+		      emit_label_before (end_label, nop);
+		      if (TARGET_IFC)
+			{
+			  /* NOTE(review): brackets the bb note, not NOP - confirm.  */
+			  emit_insn_before (gen_no_ifc_begin (), last_insn);
+			  emit_insn_after (gen_no_ifc_end (), last_insn);
+			}
+		      if (TARGET_EX9)
+			{
+			  /* NOTE(review): brackets the bb note, not NOP - confirm.  */
+			  emit_insn_before (gen_no_ex9_begin (), last_insn);
+			  emit_insn_after (gen_no_ex9_end (), last_insn);
+			}
+		      return true;
+		    }
+		}
+	    }
+	}
+    }
+
+  if (insn != NULL)
+    delete_insn (insn);  /* NOTE(review): INSN is the scan cursor - confirm.  */
+  return false;
+}
+
+static void
+nds32_hwloop_remove (rtx loop_id)
+{
+  rtx_insn *insn;
+  rtx le_id;
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (NOTE_P (insn))
+	    continue;
+
+	  if (recog_memoized (insn) == CODE_FOR_init_lc
+	      && INSN_P (insn))
+	    {
+	      le_id = XVECEXP (XVECEXP (PATTERN (insn), 0, 1), 0, 0);
+	      if (loop_id == le_id)
+		{
+		  delete_insn (insn);
+		  return;
+		}
+	    }
+	}
+    }
+}
+
+/* Emit an isb right after the init_lc insn whose loop id is LOOP_ID.  */
+static void
+nds32_hwloop_insert_isb (rtx loop_id)
+{
+  rtx_insn *insn;
+  rtx le_id;
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (NOTE_P (insn))
+	    continue;
+
+	  if (recog_memoized (insn) == CODE_FOR_init_lc
+	      && INSN_P (insn))
+	    {
+	      le_id = XVECEXP (XVECEXP (PATTERN (insn), 0, 1), 0, 0);
+	      if (loop_id == le_id)
+		{
+		  emit_insn_after (gen_unspec_volatile_isb (), insn);
+		  return;
+		}
+	    }
+	}
+    }
+}
+/* For each mtlbi_hint insn, add the end label and mtlei, or drop the hwloop.  */
+static void
+nds32_hwloop_insert_init_end ()
+{
+  rtx_insn *insn;
+  basic_block bb;
+  rtx loop_id, end_label;
+  bool hwloop_p;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (NOTE_P (insn))
+	    continue;
+
+	  if (recog_memoized (insn) == CODE_FOR_mtlbi_hint
+	      && INSN_P (insn))
+	    {
+	      end_label = gen_label_rtx ();
+	      loop_id = XVECEXP (XVECEXP (PATTERN (insn), 0, 1), 0, 0);
+	      hwloop_p = nds32_hwloop_insert_end_label (loop_id, end_label);
+
+	      if (!hwloop_p)
+		{
+		  delete_insn (insn);  /* NOTE(review): deleted while iterating - confirm.  */
+		  nds32_hwloop_remove (loop_id);
+		}
+	      else
+		{
+		  emit_insn_after (gen_mtlei (gen_rtx_LABEL_REF (Pmode, end_label)), insn);
+		  nds32_hwloop_insert_isb (loop_id);
+		}
+	    }
+	}
+    }
+}
+
+/* Reorganize insns issued in the same cycle (out-of-order reordering).  */
+static void
+nds32_reorg_out_of_order ()
+{
+  using namespace nds32;
+
+  // Controlled by -mreorg-out-of-order; skipped when the issue rate is below 2.
+  if (!flag_reorg_out_of_order
+      || nds32_sched_issue_rate () < 2)
+    return;
+
+  // At the moment only load insns are moved up.
+  rtx_insn *insn;
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (!insn_executable_p (insn)
+          || GET_MODE (insn) != TImode
+          || get_attr_type (insn) == TYPE_STORE_MULTIPLE
+          || get_attr_type (insn) == TYPE_LOAD_MULTIPLE
+          || get_attr_type (insn) == TYPE_LOAD
+          || get_attr_type (insn) == TYPE_FLOAD
+          || get_attr_type (insn) == TYPE_STORE
+          || get_attr_type (insn) == TYPE_FSTORE)
+	continue;
+
+      rtx_insn *load_insn = insn;
+
+      while ((load_insn = next_executable_insn_local (load_insn)))
+	{
+	  if (GET_MODE (load_insn) == TImode)
+	    {
+	      load_insn = NULL;
+	      break;
+	    }
+
+	  if ((get_attr_type (load_insn) == TYPE_LOAD
+	       || get_attr_type (load_insn) == TYPE_FLOAD)
+	      && get_attr_length (load_insn) < 4)
+	    break;
+	}
+
+      if (load_insn == NULL_RTX)
+	continue;
+
+      exchange_insns (insn, load_insn);
+    }
+}
+
+/* Perform machine-dependent processing.  */
+static void
+nds32_machine_dependent_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it
+     now.  */
+  compute_bb_for_insn ();
+
+  nds32_reorg_out_of_order ();
+
+  if (TARGET_HWLOOP)
+    nds32_hwloop_insert_init_end ();
+
+  if (flag_var_tracking)
+    {
+      df_analyze ();
+      timevar_push (TV_VAR_TRACKING);
+      variable_tracking_main ();
+      timevar_pop (TV_VAR_TRACKING);
+      df_finish_pass (false);
+    }
+
+  /* Use -minnermost-loop to enable,
+     need more testing to verify result.  */
+  if (TARGET_INNERMOST_LOOP)
+    nds32_insert_innermost_loop ();
+
+  nds32_insert_isps ();
+}
+
 static void
 nds32_init_builtins (void)
 {
   nds32_init_builtins_impl ();
 }

+static tree
+nds32_builtin_decl (unsigned code, bool initialize_p)
+{
+  /* Implement in nds32-intrinsic.c.  */
+  return nds32_builtin_decl_impl (code, initialize_p);
+}
+
 static rtx
 nds32_expand_builtin (tree exp,
 		      rtx target,
 		      rtx subtarget,
-		      machine_mode mode,
+		      enum machine_mode mode,
 		      int ignore)
 {
+  /* Implement in nds32-intrinsic.c.  */
   return nds32_expand_builtin_impl (exp, target, subtarget, mode, ignore);
 }

+static bool
+nds32_have_conditional_execution (void)
+{
+  /* Lie to GCC that we have conditional execution when optimizing for size;
+     this changes the optimization flow in if-conversion, LRA and scheduling.
+     Our experiments show that this can reduce code size by about 2% with
+     only a very minor performance degradation on average.  */
+  return optimize_size;
+}
+
+/* Implement TARGET_INIT_LIBFUNCS.  */
+static void
+nds32_init_libfuncs (void)
+{
+  if (TARGET_LINUX_ABI)
+    init_sync_libfuncs (UNITS_PER_WORD);
+}
+
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+static bool
+nds32_can_use_doloop_p (const widest_int &, const widest_int &iterations_max,
+			unsigned int, bool entered_at_top)
+{
+  /* A hardware loop must be entered from the top.  */
+  if (!entered_at_top)
+    return false;
+
+  if (lookup_attribute ("no_ext_zol", DECL_ATTRIBUTES (current_function_decl)))
+    return false;
+
+  /* Setting up a hardware loop is costly, so avoid generating one when
+     the maximum iteration count is less than 8.  */
+  if (!NDS32_HW_LOOP_P ()
+      || iterations_max.ulow() < 8)
+    return false;
+  return true;
+}
+
+/* Return NULL if INSN is valid within a low-overhead loop.
+   Otherwise return a string saying why doloop cannot be applied.  */
+static const char *
+nds32_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+  else if (INSN_CODE (insn) == CODE_FOR_pop25return
+	   || INSN_CODE (insn) == CODE_FOR_return_internal)
+    return "Simple return in the loop.";
+  else if (INSN_CODE (insn) == CODE_FOR_unspec_no_hwloop)
+    return "no_hwloop hint in the loop";
+
+  return NULL;
+}

 /* ------------------------------------------------------------------------ */

-/* PART 4: Implemet extern function definitions,
-           the prototype is in nds32-protos.h.  */
+/* PART 5: Implement extern function definitions,
+	   the prototype is in nds32-protos.h.  */
+
+/* Run-time Target Specification.  */
+
+void
+nds32_cpu_cpp_builtins(struct cpp_reader *pfile)
+{
+#define builtin_define(TXT) cpp_define (pfile, TXT)
+#define builtin_assert(TXT) cpp_assert (pfile, TXT)
+  builtin_define ("__nds32__");
+  builtin_define ("__NDS32__");
+
+  /* We need to provide builtin macro to describe the size of
+     each vector for interrupt handler under elf toolchain.  */
+  if (!TARGET_LINUX_ABI)
+    {
+      if (TARGET_ISR_VECTOR_SIZE_4_BYTE)
+	builtin_define ("__NDS32_ISR_VECTOR_SIZE_4__");
+      else
+	builtin_define ("__NDS32_ISR_VECTOR_SIZE_16__");
+    }
+
+  if (TARGET_HARD_FLOAT)
+    builtin_define ("__NDS32_ABI_2FP_PLUS__");
+  else
+    builtin_define ("__NDS32_ABI_2__");
+
+  if (TARGET_ISA_V2)
+    builtin_define ("__NDS32_ISA_V2__");
+  if (TARGET_ISA_V3)
+    builtin_define ("__NDS32_ISA_V3__");
+  if (TARGET_ISA_V3M)
+    builtin_define ("__NDS32_ISA_V3M__");
+
+  if (TARGET_FPU_SINGLE)
+    builtin_define ("__NDS32_EXT_FPU_SP__");
+  if (TARGET_FPU_DOUBLE)
+    builtin_define ("__NDS32_EXT_FPU_DP__");
+
+  if (TARGET_EXT_FPU_FMA)
+    builtin_define ("__NDS32_EXT_FPU_FMA__");
+  if (NDS32_EXT_FPU_DOT_E)
+    builtin_define ("__NDS32_EXT_FPU_DOT_E__");
+  if (TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+    {
+      switch (nds32_fp_regnum)
+	{
+	case 0:
+	case 4:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_0__");
+	  break;
+	case 1:
+	case 5:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_1__");
+	  break;
+	case 2:
+	case 6:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_2__");
+	  break;
+	case 3:
+	case 7:
+	  builtin_define ("__NDS32_EXT_FPU_CONFIG_3__");
+	  break;
+	default:
+	  abort ();
+	}
+    }
+
+  if (TARGET_BIG_ENDIAN)
+    builtin_define ("__NDS32_EB__");
+  else
+    builtin_define ("__NDS32_EL__");
+
+  if (TARGET_REDUCED_REGS)
+    builtin_define ("__NDS32_REDUCED_REGS__");
+  if (TARGET_CMOV)
+    builtin_define ("__NDS32_CMOV__");
+  if (TARGET_EXT_PERF)
+    builtin_define ("__NDS32_EXT_PERF__");
+  if (TARGET_EXT_PERF2)
+    builtin_define ("__NDS32_EXT_PERF2__");
+  if (TARGET_EXT_STRING)
+    builtin_define ("__NDS32_EXT_STRING__");
+  if (TARGET_16_BIT)
+    builtin_define ("__NDS32_16_BIT__");
+  if (TARGET_GP_DIRECT)
+    builtin_define ("__NDS32_GP_DIRECT__");
+  if (TARGET_VH)
+    builtin_define ("__NDS32_VH__");
+  if (NDS32_EXT_DSP_P ())
+    builtin_define ("__NDS32_EXT_DSP__");
+  if (NDS32_HW_LOOP_P ())
+    builtin_define ("__NDS32_EXT_ZOL__");
+
+  /* Extra builtin macros.  */
+  if (TARGET_ISA_V3 || TARGET_ISA_V3M_PLUS)
+    builtin_define ("__NDS32_EXT_IFC__");
+  if (TARGET_ISA_V3 || TARGET_ISA_V3M_PLUS)
+    builtin_define ("__NDS32_EXT_EX9__");
+  if (TARGET_BIG_ENDIAN)
+    builtin_define ("__big_endian__");
+
+  builtin_assert ("cpu=nds32");
+  builtin_assert ("machine=nds32");
+
+  /* FOR BACKWARD COMPATIBILITY.  */
+  if (TARGET_ISA_V2)
+    builtin_define ("__NDS32_BASELINE_V2__");
+  if (TARGET_ISA_V3)
+    builtin_define ("__NDS32_BASELINE_V3__");
+  if (TARGET_ISA_V3M)
+    builtin_define ("__NDS32_BASELINE_V3M__");
+  if (TARGET_REDUCED_REGS)
+    builtin_define ("__NDS32_REDUCE_REGS__");
+
+  if (TARGET_ISA_V2)
+    builtin_define ("NDS32_BASELINE_V2");
+  if (TARGET_ISA_V3)
+    builtin_define ("NDS32_BASELINE_V3");
+  if (TARGET_ISA_V3M)
+    builtin_define ("NDS32_BASELINE_V3M");
+  if (TARGET_REDUCED_REGS)
+    builtin_define ("NDS32_REDUCE_REGS");
+  if (TARGET_FPU_SINGLE)
+    builtin_define ("NDS32_EXT_FPU_SP");
+  if (TARGET_FPU_DOUBLE)
+    builtin_define ("NDS32_EXT_FPU_DP");
+  if (TARGET_EXT_PERF)
+    builtin_define ("NDS32_EXT_PERF");
+  if (TARGET_EXT_PERF2)
+    builtin_define ("NDS32_EXT_PERF2");
+  if (TARGET_EXT_STRING)
+    builtin_define ("NDS32_EXT_STRING");
+  if (TARGET_ISA_V3)
+    builtin_define ("NDS32_EXT_IFC");
+  if (TARGET_ISA_V3)
+    builtin_define ("NDS32_EXT_EX9");
+
+  if (TARGET_HARD_FLOAT)
+    builtin_define ("NDS32_ABI_2FP_PLUS");
+  else
+    builtin_define ("NDS32_ABI_2");
+
+  if (TARGET_BIG_ENDIAN)
+    builtin_define ("NDS32_EB");
+  else
+    builtin_define ("NDS32_EL");
+
+  if (TARGET_ISA_V2)
+    builtin_define ("__NDS32_BASELINE_V2");
+  if (TARGET_ISA_V3)
+    builtin_define ("__NDS32_BASELINE_V3");
+  if (TARGET_ISA_V3M)
+    builtin_define ("__NDS32_BASELINE_V3M");
+  if (TARGET_REDUCED_REGS)
+    builtin_define ("__NDS32_REDUCE_REGS");
+  if (TARGET_FPU_SINGLE)
+    builtin_define ("__NDS32_EXT_FPU_SP");
+  if (TARGET_FPU_DOUBLE)
+    builtin_define ("__NDS32_EXT_FPU_DP");
+  if (TARGET_EXT_PERF)
+    builtin_define ("__NDS32_EXT_PERF");
+  if (TARGET_EXT_PERF2)
+    builtin_define ("__NDS32_EXT_PERF2");
+  if (TARGET_EXT_STRING)
+    builtin_define ("__NDS32_EXT_STRING");
+  if (TARGET_ISA_V3)
+    builtin_define ("__NDS32_EXT_IFC");
+
+  if (TARGET_ISA_V3)
+    builtin_define ("__NDS32_EXT_EX9");
+
+  if (TARGET_HARD_FLOAT)
+    builtin_define ("__NDS32_ABI_2FP_PLUS");
+  else
+    builtin_define ("__NDS32_ABI_2");
+
+  if (TARGET_BIG_ENDIAN)
+    builtin_define ("__NDS32_EB");
+  else
+    builtin_define ("__NDS32_EL");
+#undef builtin_define
+#undef builtin_assert
+}
+

 /* Defining Data Structures for Per-function Information.  */

@@ -2743,26 +5007,80 @@ nds32_init_expanders (void)

 /* Register Usage.  */

+/* -- Order of Allocation of Registers.  */
+
+void
+nds32_adjust_reg_alloc_order (void)
+{
+  const int nds32_reg_alloc_order[] = REG_ALLOC_ORDER;
+
+  /* Copy the default register allocation order, which is designed
+     to optimize for code size.  */
+  memcpy(reg_alloc_order, nds32_reg_alloc_order, sizeof (reg_alloc_order));
+
+  /* Adjust few register allocation order when optimizing for speed.  */
+  if (!optimize_size)
+    {
+      memcpy (reg_alloc_order, nds32_reg_alloc_order_for_speed,
+	      sizeof (nds32_reg_alloc_order_for_speed));
+    }
+}
+
 /* -- How Values Fit in Registers.  */

 int
 nds32_hard_regno_nregs (int regno ATTRIBUTE_UNUSED,
-			machine_mode mode)
+			enum machine_mode mode)
 {
   return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
 }

 int
-nds32_hard_regno_mode_ok (int regno, machine_mode mode)
+nds32_hard_regno_mode_ok (int regno, enum machine_mode mode)
 {
+  if (regno >= FIRST_PSEUDO_REGISTER)  /* Pseudo register: not restricted.  */
+    return true;
+
+  if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE) && NDS32_IS_FPR_REGNUM (regno))
+    {
+      if (NDS32_IS_EXT_FPR_REGNUM(regno))  /* Extended FPRs hold DFmode only.  */
+	return (NDS32_FPR_REGNO_OK_FOR_DOUBLE(regno) && (mode == DFmode));
+      else if (mode == SFmode || mode == SImode)
+	return NDS32_FPR_REGNO_OK_FOR_SINGLE (regno);
+      else if (mode == DFmode)
+	return NDS32_FPR_REGNO_OK_FOR_DOUBLE (regno);
+
+      return false;  /* No other mode is allowed in an FPR.  */
+    }
+
   /* Restrict double-word quantities to even register pairs.  */
-  if (HARD_REGNO_NREGS (regno, mode) == 1
-      || !((regno) & 1))
-    return 1;
+  if (regno <= NDS32_LAST_GPR_REGNUM)
+    return (HARD_REGNO_NREGS (regno, mode) == 1
+	    || !((regno) & 1));
 
-  return 0;
+  return false;
 }

+int
+nds32_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
+{
+  if ((GET_MODE_CLASS (mode1) == MODE_INT
+       && GET_MODE_CLASS (mode2) == MODE_INT)
+      && GET_MODE_SIZE (mode1) <= UNITS_PER_WORD
+      && GET_MODE_SIZE (mode2) <= UNITS_PER_WORD)
+    return true;
+
+  if (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2))
+    {
+      if ((TARGET_FPU_SINGLE && !TARGET_FPU_DOUBLE)
+	  && (mode1 == DFmode || mode2 == DFmode))
+	return false;
+      else
+	return true;
+    }
+
+  return false;
+}

 /* Register Classes.  */

@@ -2784,7 +5102,16 @@ nds32_regno_reg_class (int regno)
   else if (regno >= 20 && regno <= 31)
     return HIGH_REGS;
   else if (regno == 32 || regno == 33)
-    return FRAME_REGS;
+    {
+      /* $SFP and $AP are really FRAME_REGS.  However, IRA does not know
+	 how to allocate registers for $SFP and $AP, so report them as
+	 GENERAL_REGS here; the ARM port uses the same hack.  */
+      return GENERAL_REGS;
+    }
+  else if (regno >= 34 && regno <= 97)
+    return FP_REGS;
+  else if (regno >= 98 && regno <= 100)
+    return LOOP_REGS;
   else
     return NO_REGS;
 }
@@ -2795,14 +5122,39 @@ nds32_regno_reg_class (int regno)
 /* -- Basic Stack Layout.  */

 rtx
+nds32_dynamic_chain_address (rtx frameaddr)
+{
+  if (TARGET_V3PUSH)
+    {
+      /* With -mv3push, $fp, $gp, and $lp are pushed onto the stack, so
+         the dynamic chain address is found on the stack at [$fp - 12].  */
+      return plus_constant (Pmode, frameaddr, -12);
+    }
+  else
+    {
+      /* In the general case, $fp and $lp are pushed onto the stack in the
+         prologue, so the dynamic chain address is found at [$fp - 8].  */
+      return plus_constant (Pmode, frameaddr, -8);
+    }
+}
+
+rtx
 nds32_return_addr_rtx (int count,
-		       rtx frameaddr ATTRIBUTE_UNUSED)
+		       rtx frameaddr)
 {
-  /* There is no way to determine the return address
-     if frameaddr is the frame that has 'count' steps
-     up from current frame.  */
+  int offset;
+  rtx addr;
+
   if (count != 0)
-    return NULL_RTX;
+    {
+      /* By the nds32 ABI design, we can expect that $lp is always
+         available on the stack at location [$fp - 4].  */
+      offset = -4;
+      addr = plus_constant (Pmode, frameaddr, offset);
+      addr = memory_address (Pmode, addr);
+
+      return gen_rtx_MEM (Pmode, addr);
+    }

   /* If count == 0, it means we are at current frame,
      the return address is $r30 ($lp).  */
@@ -2821,15 +5173,18 @@ nds32_initial_elimination_offset (unsigned int from_reg, unsigned int to_reg)
   nds32_compute_stack_frame ();

   /* Remember to consider
-     cfun->machine->callee_saved_area_padding_bytes
+     cfun->machine->callee_saved_area_gpr_padding_bytes,
+     cfun->machine->callee_saved_fpr_regs_size, and eh_return_data_regs_size
      when calculating offset.  */
   if (from_reg == ARG_POINTER_REGNUM && to_reg == STACK_POINTER_REGNUM)
     {
       offset = (cfun->machine->fp_size
-	        + cfun->machine->gp_size
+		+ cfun->machine->gp_size
 		+ cfun->machine->lp_size
 		+ cfun->machine->callee_saved_gpr_regs_size
 		+ cfun->machine->callee_saved_area_gpr_padding_bytes
+		+ cfun->machine->callee_saved_fpr_regs_size
+		+ cfun->machine->eh_return_data_regs_size
 		+ cfun->machine->local_size
 		+ cfun->machine->out_args_size);
     }
@@ -2850,7 +5205,9 @@ nds32_initial_elimination_offset (unsigned int from_reg, unsigned int to_reg)
 		       + cfun->machine->gp_size
 		       + cfun->machine->lp_size
 		       + cfun->machine->callee_saved_gpr_regs_size
-		       + cfun->machine->callee_saved_area_gpr_padding_bytes);
+		       + cfun->machine->callee_saved_area_gpr_padding_bytes
+		       + cfun->machine->callee_saved_fpr_regs_size
+		       + cfun->machine->eh_return_data_regs_size);
     }
   else
     {
@@ -2869,10 +5226,11 @@ nds32_init_cumulative_args (CUMULATIVE_ARGS *cum,
 			    tree fndecl ATTRIBUTE_UNUSED,
 			    int n_named_args ATTRIBUTE_UNUSED)
 {
-  /* Initial available registers
-     (in offset, corresponding to NDS32_GPR_ARG_FIRST_REGNUM)
+  /* Initialize the available registers.  The values are offsets from
+     NDS32_GPR_ARG_FIRST_REGNUM and NDS32_FPR_ARG_FIRST_REGNUM
      for passing arguments.  */
   cum->gpr_offset = 0;
+  cum->fpr_offset = 0;
 }

 /* -- Function Entry and Exit.  */
@@ -2883,125 +5241,178 @@ nds32_expand_prologue (void)
 {
   int fp_adjust;
   int sp_adjust;
-  int en4_const;
-
-  rtx Rb, Re;
-  rtx fp_adjust_insn, sp_adjust_insn;
+  unsigned Rb, Re;

   /* Compute and setup stack frame size.
      The result will be in cfun->machine.  */
   nds32_compute_stack_frame ();

+  /* Check frame_pointer_needed again in case $fp is needed after reload.  */
+  if (frame_pointer_needed)
+    cfun->machine->fp_as_gp_p = false;
+
   /* If this is a variadic function, first we need to push argument
      registers that hold the unnamed argument value.  */
   if (cfun->machine->va_args_size != 0)
     {
-      Rb = gen_rtx_REG (SImode, cfun->machine->va_args_first_regno);
-      Re = gen_rtx_REG (SImode, cfun->machine->va_args_last_regno);
-      /* No need to push $fp, $gp, or $lp, so use GEN_INT(0).  */
-      nds32_emit_stack_push_multiple (Rb, Re, GEN_INT (0), true);
+      Rb = cfun->machine->va_args_first_regno;
+      Re = cfun->machine->va_args_last_regno;
+      /* No need to push $fp, $gp, or $lp.  */
+      nds32_emit_stack_push_multiple (Rb, Re, false, false, false, true);

       /* We may also need to adjust stack pointer for padding bytes
-         because varargs may cause $sp not 8-byte aligned.  */
+	 because varargs may cause $sp not 8-byte aligned.  */
       if (cfun->machine->va_args_area_padding_bytes)
 	{
 	  /* Generate sp adjustment instruction.  */
 	  sp_adjust = cfun->machine->va_args_area_padding_bytes;
-	  sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (-1 * sp_adjust));

-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  sp_adjust_insn = emit_insn (sp_adjust_insn);
-
-	  /* The insn rtx 'sp_adjust_insn' will change frame layout.
-	     We need to use RTX_FRAME_RELATED_P so that GCC is able to
-	     generate CFI (Call Frame Information) stuff.  */
-	  RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+	  			   stack_pointer_rtx,
+				   -1 * sp_adjust);
 	}
     }

   /* If the function is 'naked',
      we do not have to generate prologue code fragment.  */
-  if (cfun->machine->naked_p)
+  if (cfun->machine->naked_p && !flag_pic)
     return;

   /* Get callee_first_regno and callee_last_regno.  */
-  Rb = gen_rtx_REG (SImode, cfun->machine->callee_saved_first_gpr_regno);
-  Re = gen_rtx_REG (SImode, cfun->machine->callee_saved_last_gpr_regno);
-
-  /* nds32_emit_stack_push_multiple(first_regno, last_regno),
-     the pattern 'stack_push_multiple' is implemented in nds32.md.
-     For En4 field, we have to calculate its constant value.
-     Refer to Andes ISA for more information.  */
-  en4_const = 0;
-  if (cfun->machine->fp_size)
-    en4_const += 8;
-  if (cfun->machine->gp_size)
-    en4_const += 4;
-  if (cfun->machine->lp_size)
-    en4_const += 2;
+  Rb = cfun->machine->callee_saved_first_gpr_regno;
+  Re = cfun->machine->callee_saved_last_gpr_regno;

   /* If $fp, $gp, $lp, and all callee-save registers are NOT required
      to be saved, we don't have to create multiple push instruction.
      Otherwise, a multiple push instruction is needed.  */
-  if (!(REGNO (Rb) == SP_REGNUM && REGNO (Re) == SP_REGNUM && en4_const == 0))
+  if (!(Rb == SP_REGNUM && Re == SP_REGNUM
+	&& cfun->machine->fp_size == 0
+	&& cfun->machine->gp_size == 0
+	&& cfun->machine->lp_size == 0))
     {
       /* Create multiple push instruction rtx.  */
-      nds32_emit_stack_push_multiple (Rb, Re, GEN_INT (en4_const), false);
+      nds32_emit_stack_push_multiple (
+	Rb, Re,
+	cfun->machine->fp_size, cfun->machine->gp_size, cfun->machine->lp_size,
+	false);
+    }
+
+  /* Save eh data registers.  */
+  if (cfun->machine->use_eh_return_p)
+    {
+      Rb = cfun->machine->eh_return_data_first_regno;
+      Re = cfun->machine->eh_return_data_last_regno;
+
+      /* No need to push $fp, $gp, or $lp.
+	 Also, this is not variadic arguments push.  */
+      nds32_emit_stack_push_multiple (Rb, Re, false, false, false, false);
     }

-  /* Check frame_pointer_needed to see
-     if we shall emit fp adjustment instruction.  */
-  if (frame_pointer_needed)
-    {
-      /* adjust $fp = $sp + ($fp size) + ($gp size) + ($lp size)
-                          + (4 * callee-saved-registers)
-         Note: No need to adjust
-               cfun->machine->callee_saved_area_padding_bytes,
-               because, at this point, stack pointer is just
-               at the position after push instruction.  */
-      fp_adjust = cfun->machine->fp_size
-		  + cfun->machine->gp_size
-		  + cfun->machine->lp_size
-		  + cfun->machine->callee_saved_gpr_regs_size;
-      fp_adjust_insn = gen_addsi3 (hard_frame_pointer_rtx,
+  /* Check frame_pointer_needed to see
+     if we shall emit fp adjustment instruction.  */
+  if (frame_pointer_needed)
+    {
+      /* adjust $fp = $sp + ($fp size) + ($gp size) + ($lp size)
+			  + (4 * callee-saved-registers)
+			  + (4 * exception-handling-data-registers)
+	 Note: No need to adjust
+	       cfun->machine->callee_saved_area_gpr_padding_bytes,
+	       because, at this point, stack pointer is just
+	       at the position after push instruction.  */
+      fp_adjust = cfun->machine->fp_size
+		  + cfun->machine->gp_size
+		  + cfun->machine->lp_size
+		  + cfun->machine->callee_saved_gpr_regs_size
+		  + cfun->machine->eh_return_data_regs_size;
+
+      nds32_emit_adjust_frame (hard_frame_pointer_rtx,
+			       stack_pointer_rtx,
+			       fp_adjust);
+    }
+
+  /* Save fpu registers.  */
+  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+    {
+      /* When $sp is moved to the bottom of the stack, we need to check
+	 whether the offset is in range for the FPU instruction.  */
+      int fpr_offset = cfun->machine->local_size
+		       + cfun->machine->out_args_size
+		       + cfun->machine->callee_saved_fpr_regs_size;
+
+      /* Check FPU instruction offset imm14s.  */
+      if (!satisfies_constraint_Is14 (GEN_INT (fpr_offset)))
+	{
+	  int fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* To save fpu registers, first allocate stack space for
+	     the callee-saved fpu registers.  After this, $sp points
+	     at the callee-saved fpr area.  */
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   -1 * fpr_space);
+
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  nds32_emit_push_fpr_callee_saved (0);
+
+          /* Adjust $sp = $sp - local_size - out_args_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size;
+
+	  /* Allocate stack space for local size and out args size.  */
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   -1 * sp_adjust);
+	}
+      else
+	{
+	  /* The offset fits in Is14, so move $sp to the bottom of stack.  */
+
+          /* Adjust $sp = $sp - local_size - out_args_size
+			      - callee_saved_area_gpr_padding_bytes
+			      - callee_saved_fpr_regs_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
 				   stack_pointer_rtx,
-				   GEN_INT (fp_adjust));
-      /* Emit rtx into instructions list and receive INSN rtx form.  */
-      fp_adjust_insn = emit_insn (fp_adjust_insn);
+				   -1 * sp_adjust);

-      /* The insn rtx 'fp_adjust_insn' will change frame layout.  */
-      RTX_FRAME_RELATED_P (fp_adjust_insn) = 1;
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  int fpr_position = cfun->machine->out_args_size
+			     + cfun->machine->local_size;
+	  nds32_emit_push_fpr_callee_saved (fpr_position);
+	}
     }
-
-  /* Adjust $sp = $sp - local_size - out_args_size
-                      - callee_saved_area_padding_bytes.  */
-  sp_adjust = cfun->machine->local_size
-	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
-  /* sp_adjust value may be out of range of the addi instruction,
-     create alternative add behavior with TA_REGNUM if necessary,
-     using NEGATIVE value to tell that we are decreasing address.  */
-  sp_adjust = nds32_force_addi_stack_int ( (-1) * sp_adjust);
-  if (sp_adjust)
+  else
     {
-      /* Generate sp adjustment instruction if and only if sp_adjust != 0.  */
-      sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				   stack_pointer_rtx,
-				   GEN_INT (-1 * sp_adjust));
-      /* Emit rtx into instructions list and receive INSN rtx form.  */
-      sp_adjust_insn = emit_insn (sp_adjust_insn);
+      /* Adjust $sp = $sp - local_size - out_args_size
+			  - callee_saved_area_gpr_padding_bytes.  */
+      sp_adjust = cfun->machine->local_size
+		  + cfun->machine->out_args_size
+		  + cfun->machine->callee_saved_area_gpr_padding_bytes;

-      /* The insn rtx 'sp_adjust_insn' will change frame layout.
-         We need to use RTX_FRAME_RELATED_P so that GCC is able to
-         generate CFI (Call Frame Information) stuff.  */
-      RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+      /* sp_adjust value may be out of range of the addi instruction,
+	 create alternative add behavior with TA_REGNUM if necessary,
+	 using NEGATIVE value to tell that we are decreasing address.  */
+      nds32_emit_adjust_frame (stack_pointer_rtx,
+			       stack_pointer_rtx,
+			       -1 * sp_adjust);
     }

-  /* Prevent the instruction scheduler from
-     moving instructions across the boundary.  */
-  emit_insn (gen_blockage ());
+  /* Emit gp setup instructions for -fpic.  */
+  if (flag_pic && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM))
+    nds32_emit_load_gp ();
+
+  /* If user applies -mno-sched-prolog-epilog option,
+     we need to prevent instructions of function body from being
+     scheduled with stack adjustment in prologue.  */
+  if (!flag_sched_prolog_epilog)
+    emit_insn (gen_blockage ());
 }

 /* Function for normal multiple pop epilogue.  */
@@ -3009,18 +5420,17 @@ void
 nds32_expand_epilogue (bool sibcall_p)
 {
   int sp_adjust;
-  int en4_const;
-
-  rtx Rb, Re;
-  rtx sp_adjust_insn;
+  unsigned Rb, Re;

   /* Compute and setup stack frame size.
      The result will be in cfun->machine.  */
   nds32_compute_stack_frame ();

-  /* Prevent the instruction scheduler from
-     moving instructions across the boundary.  */
-  emit_insn (gen_blockage ());
+  /* If the user applies the -mno-sched-prolog-epilog option,
+     we need to prevent instructions of the function body from being
+     scheduled with the stack adjustment in the epilogue.  */
+  if (!flag_sched_prolog_epilog)
+    emit_insn (gen_blockage ());

   /* If the function is 'naked', we do not have to generate
      epilogue code fragment BUT 'ret' instruction.
@@ -3029,110 +5439,156 @@ nds32_expand_epilogue (bool sibcall_p)
   if (cfun->machine->naked_p)
     {
       /* If this is a variadic function, we do not have to restore argument
-         registers but need to adjust stack pointer back to previous stack
-         frame location before return.  */
+	 registers but need to adjust stack pointer back to previous stack
+	 frame location before return.  */
       if (cfun->machine->va_args_size != 0)
 	{
 	  /* Generate sp adjustment instruction.
 	     We  need to consider padding bytes here.  */
 	  sp_adjust = cfun->machine->va_args_size
 		      + cfun->machine->va_args_area_padding_bytes;
-	  sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (sp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  sp_adjust_insn = emit_insn (sp_adjust_insn);

-	  /* The insn rtx 'sp_adjust_insn' will change frame layout.
-	     We need to use RTX_FRAME_RELATED_P so that GCC is able to
-	     generate CFI (Call Frame Information) stuff.  */
-	  RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+  	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   sp_adjust);
 	}

       /* Generate return instruction by using 'return_internal' pattern.
-         Make sure this instruction is after gen_blockage().  */
+	 Make sure this instruction is after gen_blockage().
+	 First we need to check this is a function without sibling call.  */
       if (!sibcall_p)
-	emit_jump_insn (gen_return_internal ());
+	{
+	  /* We need to further check attributes to determine whether
+	     there should be return instruction at epilogue.
+	     If the attribute naked exists but -mno-ret-in-naked-func
+	     is issued, there is NO need to generate return instruction.  */
+	  if (cfun->machine->attr_naked_p && !flag_ret_in_naked_func)
+	    return;
+
+	  emit_jump_insn (gen_return_internal ());
+	}
       return;
     }

   if (frame_pointer_needed)
     {
-      /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
-                          - (4 * callee-saved-registers)
-         Note: No need to adjust
-               cfun->machine->callee_saved_area_padding_bytes,
-               because we want to adjust stack pointer
-               to the position for pop instruction.  */
-      sp_adjust = cfun->machine->fp_size
-		  + cfun->machine->gp_size
-		  + cfun->machine->lp_size
-		  + cfun->machine->callee_saved_gpr_regs_size;
-      sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int gpr_padding = cfun->machine->callee_saved_area_gpr_padding_bytes;
+
+	  /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
+			      - (4 * callee-saved-registers)
+			      - (4 * exception-handling-data-registers)
+			      - (callee-saved gpr area padding bytes)
+			      - (4 * callee-saved-fpr-registers)
+	     Note:  We want to adjust the stack pointer
+		    to the position of the callee-saved fpr registers,
+		    and then restore fpu registers with .bi instructions,
+		    which move $sp from there to the pop position.  */
+	  sp_adjust = cfun->machine->fp_size
+		      + cfun->machine->gp_size
+		      + cfun->machine->lp_size
+		      + cfun->machine->callee_saved_gpr_regs_size
+		      + cfun->machine->eh_return_data_regs_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
 				   hard_frame_pointer_rtx,
-				   GEN_INT (-1 * sp_adjust));
-      /* Emit rtx into instructions list and receive INSN rtx form.  */
-      sp_adjust_insn = emit_insn (sp_adjust_insn);
+				   -1 * sp_adjust);
+
+	  /* Emit fpu load instructions, using .bi instructions
+	     to load fpu registers.  */
+	  nds32_emit_pop_fpr_callee_saved (gpr_padding);
+	}
+      else
+	{
+	  /* adjust $sp = $fp - ($fp size) - ($gp size) - ($lp size)
+			      - (4 * callee-saved-registers)
+			      - (4 * exception-handling-data-registers)
+	     Note: No need to adjust
+		   cfun->machine->callee_saved_area_gpr_padding_bytes,
+		   because we want to adjust stack pointer
+		   to the position for pop instruction.  */
+	  sp_adjust = cfun->machine->fp_size
+		      + cfun->machine->gp_size
+		      + cfun->machine->lp_size
+		      + cfun->machine->callee_saved_gpr_regs_size
+		      + cfun->machine->eh_return_data_regs_size;

-      /* The insn rtx 'sp_adjust_insn' will change frame layout.  */
-      RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   hard_frame_pointer_rtx,
+				   -1 * sp_adjust);
+	}
     }
   else
     {
-      /* If frame pointer is NOT needed,
-         we cannot calculate the sp adjustment from frame pointer.
-         Instead, we calculate the adjustment by local_size,
-         out_args_size, and callee_saved_area_padding_bytes.
-         Notice that such sp adjustment value may be out of range,
-         so we have to deal with it as well.  */
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int gpr_padding = cfun->machine->callee_saved_area_gpr_padding_bytes;

-      /* Adjust $sp = $sp + local_size + out_args_size
-                          + callee_saved_area_padding_bytes.  */
-      sp_adjust = cfun->machine->local_size
-		  + cfun->machine->out_args_size
-		  + cfun->machine->callee_saved_area_gpr_padding_bytes;
-      /* sp_adjust value may be out of range of the addi instruction,
-         create alternative add behavior with TA_REGNUM if necessary,
-         using POSITIVE value to tell that we are increasing address.  */
-      sp_adjust = nds32_force_addi_stack_int (sp_adjust);
-      if (sp_adjust)
-	{
-	  /* Generate sp adjustment instruction
-	     if and only if sp_adjust != 0.  */
-	  sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (sp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  sp_adjust_insn = emit_insn (sp_adjust_insn);
+	  /* Adjust $sp = $sp + local_size + out_args_size.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size;

-	  /* The insn rtx 'sp_adjust_insn' will change frame layout.  */
-	  RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   sp_adjust);
+
+	  /* Emit fpu load instruction, using .bi instruction
+	     load fpu registers, and adjust $sp from callee-saved fpr register
+	     to callee-saved gpr register.  */
+	  nds32_emit_pop_fpr_callee_saved (gpr_padding);
+	}
+      else
+	{
+	  /* If frame pointer is NOT needed,
+	     we cannot calculate the sp adjustment from frame pointer.
+	     Instead, we calculate the adjustment by local_size,
+	     out_args_size, and callee_saved_area_gpr_padding_bytes.
+	     Notice that such sp adjustment value may be out of range,
+	     so we have to deal with it as well.  */
+
+	  /* Adjust $sp = $sp + local_size + out_args_size
+			      + callee_saved_area_gpr_padding_bytes.  */
+	  sp_adjust = cfun->machine->local_size
+		      + cfun->machine->out_args_size
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+
+	  nds32_emit_adjust_frame (stack_pointer_rtx,
+				   stack_pointer_rtx,
+				   sp_adjust);
 	}
     }

+  /* Restore eh data registers.  */
+  if (cfun->machine->use_eh_return_p)
+    {
+      Rb = cfun->machine->eh_return_data_first_regno;
+      Re = cfun->machine->eh_return_data_last_regno;
+
+      /* No need to pop $fp, $gp, or $lp.  */
+      nds32_emit_stack_pop_multiple (Rb, Re, false, false, false);
+    }
+
   /* Get callee_first_regno and callee_last_regno.  */
-  Rb = gen_rtx_REG (SImode, cfun->machine->callee_saved_first_gpr_regno);
-  Re = gen_rtx_REG (SImode, cfun->machine->callee_saved_last_gpr_regno);
-
-  /* nds32_emit_stack_pop_multiple(first_regno, last_regno),
-     the pattern 'stack_pop_multiple' is implementad in nds32.md.
-     For En4 field, we have to calculate its constant value.
-     Refer to Andes ISA for more information.  */
-  en4_const = 0;
-  if (cfun->machine->fp_size)
-    en4_const += 8;
-  if (cfun->machine->gp_size)
-    en4_const += 4;
-  if (cfun->machine->lp_size)
-    en4_const += 2;
+  Rb = cfun->machine->callee_saved_first_gpr_regno;
+  Re = cfun->machine->callee_saved_last_gpr_regno;

   /* If $fp, $gp, $lp, and all callee-save registers are NOT required
      to be saved, we don't have to create multiple pop instruction.
      Otherwise, a multiple pop instruction is needed.  */
-  if (!(REGNO (Rb) == SP_REGNUM && REGNO (Re) == SP_REGNUM && en4_const == 0))
+  if (!(Rb == SP_REGNUM && Re == SP_REGNUM
+	&& cfun->machine->fp_size == 0
+	&& cfun->machine->gp_size == 0
+	&& cfun->machine->lp_size == 0))
     {
       /* Create multiple pop instruction rtx.  */
-      nds32_emit_stack_pop_multiple (Rb, Re, GEN_INT (en4_const));
+      nds32_emit_stack_pop_multiple (
+	Rb, Re,
+	cfun->machine->fp_size, cfun->machine->gp_size, cfun->machine->lp_size);
     }

   /* If this is a variadic function, we do not have to restore argument
@@ -3141,19 +5597,49 @@ nds32_expand_epilogue (bool sibcall_p)
   if (cfun->machine->va_args_size != 0)
     {
       /* Generate sp adjustment instruction.
-         We  need to consider padding bytes here.  */
+	 We need to consider padding bytes here.  */
       sp_adjust = cfun->machine->va_args_size
 		  + cfun->machine->va_args_area_padding_bytes;
-      sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				   stack_pointer_rtx,
-				   GEN_INT (sp_adjust));
-      /* Emit rtx into instructions list and receive INSN rtx form.  */
-      sp_adjust_insn = emit_insn (sp_adjust_insn);

-      /* The insn rtx 'sp_adjust_insn' will change frame layout.
-         We need to use RTX_FRAME_RELATED_P so that GCC is able to
-         generate CFI (Call Frame Information) stuff.  */
-      RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+      nds32_emit_adjust_frame (stack_pointer_rtx,
+			       stack_pointer_rtx,
+			       sp_adjust);
+    }
+
+  /* If this function uses __builtin_eh_return, make stack adjustment
+     for exception handler.  */
+  if (cfun->machine->use_eh_return_p)
+    {
+      /* We need to unwind the stack by the offset computed by
+	 EH_RETURN_STACKADJ_RTX.  However, at this point the CFA is
+	 based on SP.  Ideally we would update the SP and define the
+	 CFA along the lines of:
+
+	 SP = SP + EH_RETURN_STACKADJ_RTX
+	 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
+
+	 However the dwarf emitter only understands a constant
+	 register offset.
+
+	 The solution chosen here is to use $ta ($r15) as a
+	 temporary register to hold the current SP value.  The
+	 CFA is described using $ta, and then SP is modified.  */
+
+      rtx ta_reg;
+      rtx insn;
+
+      ta_reg = gen_rtx_REG (SImode, TA_REGNUM);
+
+      insn = emit_move_insn (ta_reg, stack_pointer_rtx);
+      add_reg_note (insn, REG_CFA_DEF_CFA, ta_reg);
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      emit_insn (gen_addsi3 (stack_pointer_rtx,
+			     stack_pointer_rtx,
+			     EH_RETURN_STACKADJ_RTX));
+
+      /* Ensure the assignment to $ta does not get optimized away.  */
+      emit_use (ta_reg);
     }

   /* Generate return instruction.  */
@@ -3167,28 +5653,35 @@ nds32_expand_prologue_v3push (void)
 {
   int fp_adjust;
   int sp_adjust;
-
-  rtx Rb, Re;
-  rtx fp_adjust_insn, sp_adjust_insn;
+  int fpr_space = 0;
+  unsigned Rb, Re;

   /* Compute and setup stack frame size.
      The result will be in cfun->machine.  */
   nds32_compute_stack_frame ();

+  if (cfun->machine->callee_saved_gpr_regs_size > 0)
+    df_set_regs_ever_live (FP_REGNUM, 1);
+
+  /* Check frame_pointer_needed again in case $fp is needed after reload.  */
+  if (frame_pointer_needed)
+    cfun->machine->fp_as_gp_p = false;
+
   /* If the function is 'naked',
      we do not have to generate prologue code fragment.  */
-  if (cfun->machine->naked_p)
+  if (cfun->machine->naked_p && !flag_pic)
     return;

   /* Get callee_first_regno and callee_last_regno.  */
-  Rb = gen_rtx_REG (SImode, cfun->machine->callee_saved_first_gpr_regno);
-  Re = gen_rtx_REG (SImode, cfun->machine->callee_saved_last_gpr_regno);
+  Rb = cfun->machine->callee_saved_first_gpr_regno;
+  Re = cfun->machine->callee_saved_last_gpr_regno;

   /* Calculate sp_adjust first to test if 'push25 Re,imm8u' is available,
      where imm8u has to be 8-byte alignment.  */
   sp_adjust = cfun->machine->local_size
 	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;

   if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
       && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust))
@@ -3196,94 +5689,118 @@ nds32_expand_prologue_v3push (void)
       /* We can use 'push25 Re,imm8u'.  */

       /* nds32_emit_stack_v3push(last_regno, sp_adjust),
-         the pattern 'stack_v3push' is implemented in nds32.md.
-         The (const_int 14) means v3push always push { $fp $gp $lp }.  */
-      nds32_emit_stack_v3push (Rb, Re,
-			       GEN_INT (14), GEN_INT (sp_adjust));
+	 the pattern 'stack_v3push' is implemented in nds32.md.  */
+      nds32_emit_stack_v3push (Rb, Re, sp_adjust);
+
+      /* Save fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* Calculate fpr position.  */
+	  int fpr_position = cfun->machine->local_size
+			     + cfun->machine->out_args_size;
+	  /* Emit fpu store instruction, using [$sp + offset] store
+	     fpu registers.  */
+	  nds32_emit_push_fpr_callee_saved (fpr_position);
+	}

       /* Check frame_pointer_needed to see
-         if we shall emit fp adjustment instruction.  */
+	 if we shall emit fp adjustment instruction.  */
       if (frame_pointer_needed)
 	{
 	  /* adjust $fp = $sp   + 4         ($fp size)
-	                        + 4         ($gp size)
-	                        + 4         ($lp size)
-	                        + (4 * n)   (callee-saved registers)
-	                        + sp_adjust ('push25 Re,imm8u')
+				+ 4         ($gp size)
+				+ 4         ($lp size)
+				+ (4 * n)   (callee-saved registers)
+				+ sp_adjust ('push25 Re,imm8u')
 	     Note: Since we use 'push25 Re,imm8u',
-	           the position of stack pointer is further
-	           changed after push instruction.
-	           Hence, we need to take sp_adjust value
-	           into consideration.  */
+		the position of stack pointer is further
+		changed after push instruction.
+		Hence, we need to take sp_adjust value
+		into consideration.  */
 	  fp_adjust = cfun->machine->fp_size
 		      + cfun->machine->gp_size
 		      + cfun->machine->lp_size
 		      + cfun->machine->callee_saved_gpr_regs_size
 		      + sp_adjust;
-	  fp_adjust_insn = gen_addsi3 (hard_frame_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (fp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  fp_adjust_insn = emit_insn (fp_adjust_insn);
+
+	  nds32_emit_adjust_frame (hard_frame_pointer_rtx,
+				   stack_pointer_rtx,
+				   fp_adjust);
 	}
     }
   else
     {
-      /* We have to use 'push25 Re,0' and
-         expand one more instruction to adjust $sp later.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* Calculate fpr space.  */
+	  fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* We have to use 'push25 Re, fpr_space' to pre-allocate
+	     space for the callee-saved fpr registers.  */
+	  nds32_emit_stack_v3push (Rb, Re, fpr_space);
+	  nds32_emit_push_fpr_callee_saved (0);
+	}
+      else
+	{
+	  /* We have to use 'push25 Re,0' and
+	     expand one more instruction to adjust $sp later.  */

-      /* nds32_emit_stack_v3push(last_regno, sp_adjust),
-         the pattern 'stack_v3push' is implemented in nds32.md.
-         The (const_int 14) means v3push always push { $fp $gp $lp }.  */
-      nds32_emit_stack_v3push (Rb, Re,
-			       GEN_INT (14), GEN_INT (0));
+	  /* nds32_emit_stack_v3push(last_regno, sp_adjust),
+	     the pattern 'stack_v3push' is implemented in nds32.md.  */
+	  nds32_emit_stack_v3push (Rb, Re, 0);
+	}

       /* Check frame_pointer_needed to see
-         if we shall emit fp adjustment instruction.  */
+	 if we shall emit fp adjustment instruction.  */
       if (frame_pointer_needed)
 	{
 	  /* adjust $fp = $sp + 4        ($fp size)
-	                      + 4        ($gp size)
-	                      + 4        ($lp size)
-	                      + (4 * n)  (callee-saved registers)
+			      + 4        ($gp size)
+			      + 4        ($lp size)
+			      + (4 * n)  (callee-saved registers)
 	     Note: Since we use 'push25 Re,0',
-	           the stack pointer is just at the position
-	           after push instruction.
-	           No need to take sp_adjust into consideration.  */
+		   the stack pointer is just at the position
+		   after push instruction.
+		   No need to take sp_adjust into consideration.  */
 	  fp_adjust = cfun->machine->fp_size
 		      + cfun->machine->gp_size
 		      + cfun->machine->lp_size
 		      + cfun->machine->callee_saved_gpr_regs_size;
-	  fp_adjust_insn = gen_addsi3 (hard_frame_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (fp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  fp_adjust_insn = emit_insn (fp_adjust_insn);
-	}

-      /* Because we use 'push25 Re,0',
-         we need to expand one more instruction to adjust $sp.
-         However, sp_adjust value may be out of range of the addi instruction,
-         create alternative add behavior with TA_REGNUM if necessary,
-         using NEGATIVE value to tell that we are decreasing address.  */
-      sp_adjust = nds32_force_addi_stack_int ( (-1) * sp_adjust);
-      if (sp_adjust)
-	{
-	  /* Generate sp adjustment instruction
-	     if and only if sp_adjust != 0.  */
-	  sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (-1 * sp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  sp_adjust_insn = emit_insn (sp_adjust_insn);
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* We use 'push25 Re, fpr_space', the $sp is
+		 on callee saved fpr position, so need to consider
+		 fpr space.  */
+	      fp_adjust = fp_adjust + fpr_space;
+	    }
+
+	  nds32_emit_adjust_frame (hard_frame_pointer_rtx,
+				   stack_pointer_rtx,
+				   fp_adjust);
+	}

-	  /* The insn rtx 'sp_adjust_insn' will change frame layout.
-	     We need to use RTX_FRAME_RELATED_P so that GCC is able to
-	     generate CFI (Call Frame Information) stuff.  */
-	  RTX_FRAME_RELATED_P (sp_adjust_insn) = 1;
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* We use 'push25 Re, fpr_space',
+	     the $sp is on callee saved fpr position,
+	     no need to consider fpr space.  */
+	  sp_adjust = sp_adjust - fpr_space;
 	}
+
+      /* Because we use 'push25 Re,0',
+	 we need to expand one more instruction to adjust $sp,
+	 using a NEGATIVE value to tell that we are decreasing address.  */
+      nds32_emit_adjust_frame (stack_pointer_rtx,
+			       stack_pointer_rtx,
+			       -1 * sp_adjust);
     }

+  /* Emit gp setup instructions for -fpic.  */
+  if (flag_pic && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM))
+    nds32_emit_load_gp ();
+
   /* Prevent the instruction scheduler from
      moving instructions across the boundary.  */
   emit_insn (gen_blockage ());
@@ -3294,9 +5811,7 @@ void
 nds32_expand_epilogue_v3pop (bool sibcall_p)
 {
   int sp_adjust;
-
-  rtx Rb, Re;
-  rtx sp_adjust_insn;
+  unsigned Rb, Re;

   /* Compute and setup stack frame size.
      The result will be in cfun->machine.  */
@@ -3311,21 +5826,32 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
   if (cfun->machine->naked_p)
     {
       /* Generate return instruction by using 'return_internal' pattern.
-         Make sure this instruction is after gen_blockage().  */
+	 Make sure this instruction is after gen_blockage().
+	 First we need to check this is a function without sibling call.  */
       if (!sibcall_p)
-	emit_jump_insn (gen_return_internal ());
+	{
+	  /* We need to further check attributes to determine whether
+	     there should be return instruction at epilogue.
+	     If the attribute naked exists but -mno-ret-in-naked-func
+	     is issued, there is NO need to generate return instruction.  */
+	  if (cfun->machine->attr_naked_p && !flag_ret_in_naked_func)
+	    return;
+
+	  emit_jump_insn (gen_return_internal ());
+	}
       return;
     }

   /* Get callee_first_regno and callee_last_regno.  */
-  Rb = gen_rtx_REG (SImode, cfun->machine->callee_saved_first_gpr_regno);
-  Re = gen_rtx_REG (SImode, cfun->machine->callee_saved_last_gpr_regno);
+  Rb = cfun->machine->callee_saved_first_gpr_regno;
+  Re = cfun->machine->callee_saved_last_gpr_regno;

   /* Calculate sp_adjust first to test if 'pop25 Re,imm8u' is available,
      where imm8u has to be 8-byte alignment.  */
   sp_adjust = cfun->machine->local_size
 	      + cfun->machine->out_args_size
-	      + cfun->machine->callee_saved_area_gpr_padding_bytes;
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;

   /* We have to consider alloca issue as well.
      If the function does call alloca(), the stack pointer is not fixed.
@@ -3338,38 +5864,65 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
       && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
       && !cfun->calls_alloca)
     {
+      /* Restore fpu registers.  */
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  int fpr_position = cfun->machine->local_size
+			     + cfun->machine->out_args_size;
+	  /* Emit fpu load instruction, using [$sp + offset] restore
+	     fpu registers.  */
+	  nds32_emit_v3pop_fpr_callee_saved (fpr_position);
+	}
+
       /* We can use 'pop25 Re,imm8u'.  */

       /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
-         the pattern 'stack_v3pop' is implementad in nds32.md.
-         The (const_int 14) means v3pop always pop { $fp $gp $lp }.  */
-      nds32_emit_stack_v3pop (Rb, Re,
-			      GEN_INT (14), GEN_INT (sp_adjust));
+	 the pattern 'stack_v3pop' is implemented in nds32.md.  */
+      nds32_emit_stack_v3pop (Rb, Re, sp_adjust);
     }
   else
     {
       /* We have to use 'pop25 Re,0', and prior to it,
-         we must expand one more instruction to adjust $sp.  */
+	 we must expand one more instruction to adjust $sp.  */

       if (frame_pointer_needed)
 	{
 	  /* adjust $sp = $fp - 4        ($fp size)
-	                      - 4        ($gp size)
-	                      - 4        ($lp size)
-	                      - (4 * n)  (callee-saved registers)
+			      - 4        ($gp size)
+			      - 4        ($lp size)
+			      - (4 * n)  (callee-saved registers)
 	     Note: No need to adjust
-	           cfun->machine->callee_saved_area_padding_bytes,
-	           because we want to adjust stack pointer
-	           to the position for pop instruction.  */
+		   cfun->machine->callee_saved_area_gpr_padding_bytes,
+		   because we want to adjust stack pointer
+		   to the position for pop instruction.  */
 	  sp_adjust = cfun->machine->fp_size
 		      + cfun->machine->gp_size
 		      + cfun->machine->lp_size
 		      + cfun->machine->callee_saved_gpr_regs_size;
-	  sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
+
+	  /* Restore fpu registers.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* Set $sp to callee saved fpr position, we need to restore
+		 fpr registers.  */
+	      sp_adjust = sp_adjust
+			  + cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       hard_frame_pointer_rtx,
+				       -1 * sp_adjust);
+
+	      /* Emit fpu load instruction, using [$sp + offset] restore
+		 fpu registers.  */
+	      nds32_emit_v3pop_fpr_callee_saved (0);
+	    }
+	  else
+	    {
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
 				       hard_frame_pointer_rtx,
-				       GEN_INT (-1 * sp_adjust));
-	  /* Emit rtx into instructions list and receive INSN rtx form.  */
-	  sp_adjust_insn = emit_insn (sp_adjust_insn);
+				       -1 * sp_adjust);
+	    }
 	}
       else
 	{
@@ -3381,33 +5934,57 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
 	     so we have to deal with it as well.  */

 	  /* Adjust $sp = $sp + local_size + out_args_size
-			      + callee_saved_area_padding_bytes.  */
+			      + callee_saved_area_gpr_padding_bytes
+			      + callee_saved_fpr_regs_size.  */
 	  sp_adjust = cfun->machine->local_size
 		      + cfun->machine->out_args_size
-		      + cfun->machine->callee_saved_area_gpr_padding_bytes;
-	  /* sp_adjust value may be out of range of the addi instruction,
-	     create alternative add behavior with TA_REGNUM if necessary,
-	     using POSITIVE value to tell that we are increasing address.  */
-	  sp_adjust = nds32_force_addi_stack_int (sp_adjust);
-	  if (sp_adjust)
+		      + cfun->machine->callee_saved_area_gpr_padding_bytes
+		      + cfun->machine->callee_saved_fpr_regs_size;
+
+	  /* Restore fpu registers.  */
+	  if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	    {
+	      /* Set $sp to callee saved fpr position, we need to restore
+		 fpr registers.  */
+	      sp_adjust = sp_adjust
+			  - cfun->machine->callee_saved_area_gpr_padding_bytes
+			  - cfun->machine->callee_saved_fpr_regs_size;
+
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       stack_pointer_rtx,
+				       sp_adjust);
+
+	      /* Emit fpu load instruction, using [$sp + offset] restore
+		 fpu registers.  */
+	      nds32_emit_v3pop_fpr_callee_saved (0);
+	    }
+	  else
 	    {
-	      /* Generate sp adjustment instruction
-	         if and only if sp_adjust != 0.  */
-	      sp_adjust_insn = gen_addsi3 (stack_pointer_rtx,
-					   stack_pointer_rtx,
-					   GEN_INT (sp_adjust));
-	      /* Emit rtx into instructions list and receive INSN rtx form.  */
-	      sp_adjust_insn = emit_insn (sp_adjust_insn);
+	       /* sp_adjust value may be out of range of the addi instruction,
+		  create alternative add behavior with TA_REGNUM if necessary,
+		  using POSITIVE value to tell that we are increasing
+		  address.  */
+	      nds32_emit_adjust_frame (stack_pointer_rtx,
+				       stack_pointer_rtx,
+				       sp_adjust);
 	    }
 	}

-      /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
-         the pattern 'stack_v3pop' is implementad in nds32.md.  */
-      /* The (const_int 14) means v3pop always pop { $fp $gp $lp }.  */
-      nds32_emit_stack_v3pop (Rb, Re,
-			      GEN_INT (14), GEN_INT (0));
+      if (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)
+	{
+	  /* We have FPRs to restore, so $sp is set to the callee-saved fpr
+	     position, and we use 'pop25 Re, fpr_space' to adjust $sp.  */
+	  int fpr_space = cfun->machine->callee_saved_area_gpr_padding_bytes
+			  + cfun->machine->callee_saved_fpr_regs_size;
+	  nds32_emit_stack_v3pop (Rb, Re, fpr_space);
+	}
+      else
+	{
+	  /* nds32_emit_stack_v3pop(last_regno, sp_adjust),
+	     the pattern 'stack_v3pop' is implemented in nds32.md.  */
+	  nds32_emit_stack_v3pop (Rb, Re, 0);
+	}
     }
-
   /* Generate return instruction.  */
   emit_jump_insn (gen_pop25return ());
 }
@@ -3418,97 +5995,179 @@ nds32_expand_epilogue_v3pop (bool sibcall_p)
 int
 nds32_can_use_return_insn (void)
 {
+  int sp_adjust;
+
   /* Prior to reloading, we can't tell how many registers must be saved.
      Thus we can not determine whether this function has null epilogue.  */
   if (!reload_completed)
     return 0;

+  /* If attribute 'naked' appears but -mno-ret-in-naked-func is used,
+     we cannot use return instruction.  */
+  if (cfun->machine->attr_naked_p && !flag_ret_in_naked_func)
+    return 0;
+
+  sp_adjust = cfun->machine->local_size
+	      + cfun->machine->out_args_size
+	      + cfun->machine->callee_saved_area_gpr_padding_bytes
+	      + cfun->machine->callee_saved_fpr_regs_size;
+  if (!cfun->machine->fp_as_gp_p
+      && satisfies_constraint_Iu08 (GEN_INT (sp_adjust))
+      && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)
+      && !cfun->calls_alloca
+      && NDS32_V3PUSH_AVAILABLE_P
+      && !(TARGET_HARD_FLOAT
+	   && (cfun->machine->callee_saved_first_fpr_regno != SP_REGNUM)))
+    return 1;
+
   /* If no stack was created, two conditions must be satisfied:
      1. This is a naked function.
-        So there is no callee-saved, local size, or outgoing size.
+	So there is no callee-saved, local size, or outgoing size.
      2. This is NOT a variadic function.
-        So there is no pushing arguement registers into the stack.  */
-  return (cfun->machine->naked_p && (cfun->machine->va_args_size == 0));
+	So there is no pushing argument registers into the stack.  */
+  return ((cfun->machine->naked_p && (cfun->machine->va_args_size == 0)));
 }

-/* ------------------------------------------------------------------------ */
-
-/* Function to test 333-form for load/store instructions.
-   This is auxiliary extern function for auxiliary macro in nds32.h.
-   Because it is a little complicated, we use function instead of macro.  */
-bool
-nds32_ls_333_p (rtx rt, rtx ra, rtx imm, machine_mode mode)
+enum machine_mode
+nds32_case_vector_shorten_mode (int min_offset, int max_offset,
+				rtx body ATTRIBUTE_UNUSED)
 {
-  if (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS
-      && REGNO_REG_CLASS (REGNO (ra)) == LOW_REGS)
+  if (min_offset < 0 || max_offset >= 0x2000)
+    return SImode;
+  else
     {
-      if (GET_MODE_SIZE (mode) == 4)
-	return satisfies_constraint_Iu05 (imm);
-
-      if (GET_MODE_SIZE (mode) == 2)
-	return satisfies_constraint_Iu04 (imm);
-
-      if (GET_MODE_SIZE (mode) == 1)
-	return satisfies_constraint_Iu03 (imm);
+      /* The jump table may need 2-byte alignment,
+	 so reserve 1 byte when checking max_offset.  */
+      if (max_offset >= 0xff)
+	return HImode;
+      else
+	return QImode;
     }
+}
+
+static bool
+nds32_cannot_copy_insn_p (rtx_insn *insn)
+{
+  /* The hwloop_cfg insn cannot be copied.  */
+  if (recog_memoized (insn) == CODE_FOR_hwloop_cfg)
+    return true;

   return false;
 }

-
-/* Computing the Length of an Insn.
-   Modifies the length assigned to instruction INSN.
-   LEN is the initially computed length of the insn.  */
+/* Return alignment for the label.  */
 int
-nds32_adjust_insn_length (rtx_insn *insn, int length)
+nds32_target_alignment (rtx label)
 {
-  rtx src, dst;
+  rtx_insn *insn;

-  switch (recog_memoized (insn))
+  if (!NDS32_ALIGN_P ())
+    return 0;
+
+  insn = next_active_insn (label);
+
+  /* Always align to 4 bytes when the first instruction after the label is
+     a jump instruction, since its length might change; always align it to
+     make sure we don't lose any performance here.  */
+  if (insn == 0
+      || (get_attr_length (insn) == 2
+	  && !JUMP_P (insn) && !CALL_P (insn)))
+    return 0;
+  else
+    return 2;
+}
+
+/* Return alignment for data.  */
+unsigned int
+nds32_data_alignment (tree data,
+		      unsigned int basic_align)
+{
+  if ((basic_align < BITS_PER_WORD)
+      && (TREE_CODE (data) == ARRAY_TYPE
+	 || TREE_CODE (data) == UNION_TYPE
+	 || TREE_CODE (data) == RECORD_TYPE))
+    return BITS_PER_WORD;
+  else
+    return basic_align;
+}
+
+/* Return alignment for constant value.  */
+unsigned int
+nds32_constant_alignment (tree constant,
+			  unsigned int basic_align)
+{
+  /* Word-align string literals and constants built by constructors.  */
+  if (((TREE_CODE (constant) == STRING_CST
+	|| TREE_CODE (constant) == CONSTRUCTOR
+	|| TREE_CODE (constant) == UNION_TYPE
+	|| TREE_CODE (constant) == RECORD_TYPE
+	|| TREE_CODE (constant) == ARRAY_TYPE)
+       && basic_align < BITS_PER_WORD))
+    return BITS_PER_WORD;
+  else
+    return basic_align;
+}
+
+/* Return alignment for local variable.  */
+unsigned int
+nds32_local_alignment (tree local ATTRIBUTE_UNUSED,
+		       unsigned int basic_align)
+{
+  bool at_least_align_to_word = false;
+  /* Make local arrays, structs and unions at least word-aligned to make
+     sure memcpy can be unrolled when they are initialized by constants.  */
+  switch (TREE_CODE (local))
     {
-    case CODE_FOR_move_df:
-    case CODE_FOR_move_di:
-      /* Adjust length of movd44 to 2.  */
-      src = XEXP (PATTERN (insn), 1);
-      dst = XEXP (PATTERN (insn), 0);
-
-      if (REG_P (src)
-	  && REG_P (dst)
-	  && (REGNO (src) % 2) == 0
-	  && (REGNO (dst) % 2) == 0)
-	length = 2;
+    case ARRAY_TYPE:
+    case RECORD_TYPE:
+    case UNION_TYPE:
+      at_least_align_to_word = true;
       break;
-
     default:
+      at_least_align_to_word = false;
       break;
     }
-
-  return length;
+  if (at_least_align_to_word
+      && (basic_align < BITS_PER_WORD))
+    return BITS_PER_WORD;
+  else
+    return basic_align;
 }

-
-/* Return align 2 (log base 2) if the next instruction of LABEL is 4 byte.  */
-int
-nds32_target_alignment (rtx label)
+bool
+nds32_split_double_word_load_store_p(rtx *operands, bool load_p)
 {
-  rtx_insn *insn;
+  rtx mem = load_p ? operands[1] : operands[0];
+  /* Do split at split2 if -O0 or if schedule 2 is not enabled.  */
+  if (optimize == 0 || !flag_schedule_insns_after_reload)
+    return !satisfies_constraint_Da (mem) || MEM_VOLATILE_P (mem);

-  if (optimize_size)
-    return 0;
+  /* Split double word load store after copy propagation.  */
+  if (current_pass == NULL)
+    return false;

-  insn = next_active_insn (label);
+  const char *pass_name = current_pass->name;
+  if (pass_name && ((strcmp (pass_name, "split4") == 0)
+		     || (strcmp (pass_name, "split5") == 0)))
+    return !satisfies_constraint_Da (mem) || MEM_VOLATILE_P (mem);

-  if (insn == 0)
-    return 0;
-  else if ((get_attr_length (insn) % 4) == 0)
-    return 2;
+  return false;
+}
+
+static bool
+nds32_use_blocks_for_constant_p (enum machine_mode mode,
+				 const_rtx x ATTRIBUTE_UNUSED)
+{
+  if ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)
+      && (mode == DFmode || mode == SFmode))
+    return true;
   else
-    return 0;
+    return false;
 }

 /* ------------------------------------------------------------------------ */

-/* PART 5: Initialize target hook structure and definitions.  */
+/* PART 6: Initialize target hook structure and definitions.  */

 /* Controlling the Compilation Driver.  */

@@ -3525,6 +6184,9 @@ nds32_target_alignment (rtx label)
 #define TARGET_PROMOTE_FUNCTION_MODE \
   default_promote_function_mode_always_promote

+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK nds32_expand_to_rtl_hook
+

 /* Layout of Source Language Data Types.  */

@@ -3533,6 +6195,9 @@ nds32_target_alignment (rtx label)

 /* -- Basic Characteristics of Registers.  */

+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE nds32_conditional_register_usage
+
 /* -- Order of Allocation of Registers.  */

 /* -- How Values Fit in Registers.  */
@@ -3544,6 +6209,9 @@ nds32_target_alignment (rtx label)

 /* Register Classes.  */

+#undef TARGET_PREFERRED_RENAME_CLASS
+#define TARGET_PREFERRED_RENAME_CLASS nds32_preferred_rename_class
+
 #undef TARGET_CLASS_MAX_NREGS
 #define TARGET_CLASS_MAX_NREGS nds32_class_max_nregs

@@ -3591,6 +6259,9 @@ nds32_target_alignment (rtx label)
 #undef TARGET_FUNCTION_ARG_BOUNDARY
 #define TARGET_FUNCTION_ARG_BOUNDARY nds32_function_arg_boundary

+#undef TARGET_VECTOR_MODE_SUPPORTED_P
+#define TARGET_VECTOR_MODE_SUPPORTED_P nds32_vector_mode_supported_p
+
 /* -- How Scalar Function Values Are Returned.  */

 #undef TARGET_FUNCTION_VALUE
@@ -3604,6 +6275,9 @@ nds32_target_alignment (rtx label)

 /* -- How Large Values Are Returned.  */

+#undef TARGET_RETURN_IN_MEMORY
+#define TARGET_RETURN_IN_MEMORY nds32_return_in_memory
+
 /* -- Caller-Saves Register Allocation.  */

 /* -- Function Entry and Exit.  */
@@ -3630,6 +6304,9 @@ nds32_target_alignment (rtx label)

 /* -- Permitting tail calls.  */

+#undef TARGET_FUNCTION_OK_FOR_SIBCALL
+#define TARGET_FUNCTION_OK_FOR_SIBCALL nds32_function_ok_for_sibcall
+
 #undef TARGET_WARN_FUNC_RETURN
 #define TARGET_WARN_FUNC_RETURN nds32_warn_func_return

@@ -3662,6 +6339,21 @@ nds32_target_alignment (rtx label)
 #undef TARGET_LEGITIMATE_ADDRESS_P
 #define TARGET_LEGITIMATE_ADDRESS_P nds32_legitimate_address_p

+#undef TARGET_LEGITIMIZE_ADDRESS
+#define TARGET_LEGITIMIZE_ADDRESS nds32_legitimize_address
+
+#undef TARGET_LEGITIMATE_CONSTANT_P
+#define TARGET_LEGITIMATE_CONSTANT_P nds32_legitimate_constant_p
+
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE nds32_vectorize_preferred_simd_mode
+
+#undef TARGET_CANNOT_FORCE_CONST_MEM
+#define TARGET_CANNOT_FORCE_CONST_MEM nds32_cannot_force_const_mem
+
+#undef TARGET_DELEGITIMIZE_ADDRESS
+#define TARGET_DELEGITIMIZE_ADDRESS nds32_delegitimize_address
+

 /* Anchored Addresses.  */

@@ -3672,6 +6364,9 @@ nds32_target_alignment (rtx label)

 /* -- Representation of condition codes using registers.  */

+#undef TARGET_CANONICALIZE_COMPARISON
+#define TARGET_CANONICALIZE_COMPARISON nds32_canonicalize_comparison
+
 /* -- Macros to control conditional execution.  */


@@ -3692,6 +6387,15 @@ nds32_target_alignment (rtx label)

 /* Adjusting the Instruction Scheduler.  */

+#undef TARGET_SCHED_ISSUE_RATE
+#define TARGET_SCHED_ISSUE_RATE nds32_sched_issue_rate
+
+#undef  TARGET_SCHED_ADJUST_COST
+#define TARGET_SCHED_ADJUST_COST nds32_sched_adjust_cost
+
+#undef TARGET_SCHED_SET_SCHED_FLAGS
+#define TARGET_SCHED_SET_SCHED_FLAGS nds32_set_sched_flags
+

 /* Dividing the Output into Sections (Texts, Data, . . . ).  */

@@ -3719,6 +6423,9 @@ nds32_target_alignment (rtx label)
 #undef TARGET_ASM_ALIGNED_SI_OP
 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

+#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
+#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA nds32_asm_output_addr_const_extra
+
 /* -- Output of Uninitialized Variables.  */

 /* -- Output and Generation of Labels.  */
@@ -3741,6 +6448,9 @@ nds32_target_alignment (rtx label)

 /* -- Assembler Commands for Exception Regions.  */

+#undef TARGET_DWARF_REGISTER_SPAN
+#define TARGET_DWARF_REGISTER_SPAN nds32_dwarf_register_span
+
 /* -- Assembler Commands for Alignment.  */


@@ -3756,6 +6466,11 @@ nds32_target_alignment (rtx label)

 /* -- Macros for SDB and DWARF Output.  */

+/* Variable tracking should be run after all optimizations which
+   change order of insns.  It also needs a valid CFG.  */
+#undef TARGET_DELAY_VARTRACK
+#define TARGET_DELAY_VARTRACK true
+
 /* -- Macros for VMS Debug Format.  */


@@ -3785,6 +6500,9 @@ nds32_target_alignment (rtx label)

 /* Emulating TLS.  */

+#undef TARGET_HAVE_TLS
+#define TARGET_HAVE_TLS TARGET_LINUX_ABI
+

 /* Defining coprocessor specifics for MIPS targets.  */

@@ -3800,12 +6518,43 @@ nds32_target_alignment (rtx label)

 /* Miscellaneous Parameters.  */

+#undef TARGET_MD_ASM_ADJUST
+#define TARGET_MD_ASM_ADJUST nds32_md_asm_adjust
+
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG nds32_machine_dependent_reorg
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS nds32_init_builtins

+#undef  TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL nds32_builtin_decl
+
 #undef TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN nds32_expand_builtin

+#undef TARGET_HAVE_CONDITIONAL_EXECUTION
+#define TARGET_HAVE_CONDITIONAL_EXECUTION nds32_have_conditional_execution
+
+#undef TARGET_INIT_LIBFUNCS
+#define TARGET_INIT_LIBFUNCS nds32_init_libfuncs
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P nds32_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP nds32_invalid_within_doloop
+
+#undef  TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P nds32_cannot_copy_insn_p
+
+#undef TARGET_MIN_ANCHOR_OFFSET
+#define TARGET_MIN_ANCHOR_OFFSET -((long long int) 1 << 14)
+#undef TARGET_MAX_ANCHOR_OFFSET
+#define TARGET_MAX_ANCHOR_OFFSET (((long long int) 1 << 14) - 1)
+#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
+#define TARGET_USE_BLOCKS_FOR_CONSTANT_P nds32_use_blocks_for_constant_p
+

 /* ------------------------------------------------------------------------ */

diff --git a/gcc/config/nds32/nds32.h b/gcc/config/nds32/nds32.h
index eb4558c..a3e07cd 100644
--- a/gcc/config/nds32/nds32.h
+++ b/gcc/config/nds32/nds32.h
@@ -24,6 +24,9 @@
 /* The following are auxiliary macros or structure declarations
    that are used all over the nds32.c and nds32.h.  */

+#define ADJUST_INSN_LENGTH(INSN, LENGTH) \
+  (LENGTH = nds32_adjust_insn_length (INSN, LENGTH))
+
 /* Use SYMBOL_FLAG_MACH_DEP to define our own symbol_ref flag.
    It is used in nds32_encode_section_info() to store flag in symbol_ref
    in case the symbol should be placed in .rodata section.
@@ -33,68 +36,23 @@
 #define NDS32_SYMBOL_REF_RODATA_P(x) \
   ((SYMBOL_REF_FLAGS (x) & NDS32_SYMBOL_FLAG_RODATA) != 0)

-/* Computing the Length of an Insn.  */
-#define ADJUST_INSN_LENGTH(INSN, LENGTH) \
-  (LENGTH = nds32_adjust_insn_length (INSN, LENGTH))
+enum nds32_relax_insn_type
+{
+  RELAX_ORI,
+  RELAX_PLT_ADD,
+  RELAX_TLS_ADD_or_LW,
+  RELAX_TLS_ADD_LW,
+  RELAX_TLS_LW_JRAL,
+  RELAX_DONE
+};

-/* Check instruction LS-37-FP-implied form.
-   Note: actually its immediate range is imm9u
-         since it is used for lwi37/swi37 instructions.  */
-#define NDS32_LS_37_FP_P(rt, ra, imm)       \
-  (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS \
-   && REGNO (ra) == FP_REGNUM               \
-   && satisfies_constraint_Iu09 (imm))
-
-/* Check instruction LS-37-SP-implied form.
-   Note: actually its immediate range is imm9u
-         since it is used for lwi37/swi37 instructions.  */
-#define NDS32_LS_37_SP_P(rt, ra, imm)       \
-  (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS \
-   && REGNO (ra) == SP_REGNUM               \
-   && satisfies_constraint_Iu09 (imm))
-
-
-/* Check load/store instruction form : Rt3, Ra3, imm3u.  */
-#define NDS32_LS_333_P(rt, ra, imm, mode) nds32_ls_333_p (rt, ra, imm, mode)
-
-/* Check load/store instruction form : Rt4, Ra5, const_int_0.
-   Note: no need to check ra because Ra5 means it covers all registers.  */
-#define NDS32_LS_450_P(rt, ra, imm)                     \
-  ((imm == const0_rtx)                                  \
-   && (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS         \
-       || REGNO_REG_CLASS (REGNO (rt)) == MIDDLE_REGS))
-
-/* Check instruction RRI-333-form.  */
-#define NDS32_RRI_333_P(rt, ra, imm)           \
-  (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS    \
-   && REGNO_REG_CLASS (REGNO (ra)) == LOW_REGS \
-   && satisfies_constraint_Iu03 (imm))
-
-/* Check instruction RI-45-form.  */
-#define NDS32_RI_45_P(rt, ra, imm)                     \
-  (REGNO (rt) == REGNO (ra)                            \
-   && (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS        \
-       || REGNO_REG_CLASS (REGNO (rt)) == MIDDLE_REGS) \
-   && satisfies_constraint_Iu05 (imm))
-
-
-/* Check instruction RR-33-form.  */
-#define NDS32_RR_33_P(rt, ra)                   \
-  (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS     \
-   && REGNO_REG_CLASS (REGNO (ra)) == LOW_REGS)
-
-/* Check instruction RRR-333-form.  */
-#define NDS32_RRR_333_P(rt, ra, rb)             \
-  (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS     \
-   && REGNO_REG_CLASS (REGNO (ra)) == LOW_REGS  \
-   && REGNO_REG_CLASS (REGNO (rb)) == LOW_REGS)
-
-/* Check instruction RR-45-form.
-   Note: no need to check rb because Rb5 means it covers all registers.  */
-#define NDS32_RR_45_P(rt, ra, rb)               \
-  (REGNO (rt) == REGNO (ra)                     \
-   && (REGNO_REG_CLASS (REGNO (rt)) == LOW_REGS \
-       || REGNO_REG_CLASS (REGNO (rt)) == MIDDLE_REGS))
+/* Classifies expand result for expand helper function.  */
+enum nds32_expand_result_type
+{
+  EXPAND_DONE,
+  EXPAND_FAIL,
+  EXPAND_CREATE_TEMPLATE
+};

 /* Classifies address type to distinguish 16-bit/32-bit format.  */
 enum nds32_16bit_address_type
@@ -105,6 +63,10 @@ enum nds32_16bit_address_type
   ADDRESS_LO_REG_IMM3U,
   /* post_inc [lo_reg + imm3u]: 333 format address.  */
   ADDRESS_POST_INC_LO_REG_IMM3U,
+  /* post_modify [lo_reg + imm3u]: 333 format address.  */
+  ADDRESS_POST_MODIFY_LO_REG_IMM3U,
+  /* [$r8 + imm7u]: r8 imply address.  */
+  ADDRESS_R8_IMM7U,
   /* [$fp + imm7u]: fp imply address.  */
   ADDRESS_FP_IMM7U,
   /* [$sp + imm7u]: sp imply address.  */
@@ -113,23 +75,67 @@ enum nds32_16bit_address_type
   ADDRESS_NOT_16BIT_FORMAT
 };

-
 /* ------------------------------------------------------------------------ */

 /* Define maximum numbers of registers for passing arguments.  */
 #define NDS32_MAX_GPR_REGS_FOR_ARGS 6
+#define NDS32_MAX_FPR_REGS_FOR_ARGS 6

 /* Define the register number for first argument.  */
 #define NDS32_GPR_ARG_FIRST_REGNUM 0
+#define NDS32_FPR_ARG_FIRST_REGNUM 34

 /* Define the register number for return value.  */
 #define NDS32_GPR_RET_FIRST_REGNUM 0
+#define NDS32_FPR_RET_FIRST_REGNUM 34

 /* Define the first integer register number.  */
 #define NDS32_FIRST_GPR_REGNUM 0
 /* Define the last integer register number.  */
 #define NDS32_LAST_GPR_REGNUM 31

+#define NDS32_FIRST_CALLEE_SAVE_GPR_REGNUM 6
+#define NDS32_LAST_CALLEE_SAVE_GPR_REGNUM \
+  (TARGET_REDUCED_REGS ? 10 : 14)
+
+/* Define the floating-point number of registers.  */
+#define NDS32_FLOAT_REGISTER_NUMBER                           \
+ (((nds32_fp_regnum == NDS32_CONFIG_FPU_0)              \
+   || (nds32_fp_regnum == NDS32_CONFIG_FPU_4)) ? 8      \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_1)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_5)) ? 16    \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_2)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_6)) ? 32    \
+  : ((nds32_fp_regnum == NDS32_CONFIG_FPU_3)            \
+    || (nds32_fp_regnum == NDS32_CONFIG_FPU_7)) ? 64    \
+  : 32)
+
+#define NDS32_EXT_FPU_DOT_E (nds32_fp_regnum >= 4)
+
+/* Define the first floating-point register number.  */
+#define NDS32_FIRST_FPR_REGNUM 34
+/* Define the last floating-point register number.  */
+#define NDS32_LAST_FPR_REGNUM \
+  (NDS32_FIRST_FPR_REGNUM + NDS32_FLOAT_REGISTER_NUMBER - 1)
+
+
+#define NDS32_IS_EXT_FPR_REGNUM(regno) \
+  (((regno) >= NDS32_FIRST_FPR_REGNUM + 32) \
+   && ((regno) < NDS32_FIRST_FPR_REGNUM + 64))
+
+#define NDS32_IS_FPR_REGNUM(regno) \
+  (((regno) >= NDS32_FIRST_FPR_REGNUM) \
+   && ((regno) <= NDS32_LAST_FPR_REGNUM))
+
+#define NDS32_FPR_REGNO_OK_FOR_SINGLE(regno) \
+  ((regno) <= NDS32_LAST_FPR_REGNUM)
+
+#define NDS32_FPR_REGNO_OK_FOR_DOUBLE(regno) \
+  ((((regno) - NDS32_FIRST_FPR_REGNUM) & 1) == 0)
+
+#define NDS32_IS_GPR_REGNUM(regno) \
+  (((regno) <= NDS32_LAST_GPR_REGNUM))
+
 /* Define double word alignment bits.  */
 #define NDS32_DOUBLE_WORD_ALIGNMENT 64

@@ -138,6 +144,16 @@ enum nds32_16bit_address_type
 #define NDS32_SINGLE_WORD_ALIGN_P(value) (((value) & 0x03) == 0)
 #define NDS32_DOUBLE_WORD_ALIGN_P(value) (((value) & 0x07) == 0)

+/* Determine whether we would like to have code generation strictly aligned.
+   We set it strictly aligned when -malways-align is enabled.
+   Check gcc/common/config/nds32/nds32-common.c for the optimizations that
+   apply -malways-align.  */
+#define NDS32_ALIGN_P() (TARGET_ALWAYS_ALIGN)
+
+#define NDS32_HW_LOOP_P() (TARGET_HWLOOP && !TARGET_FORCE_NO_HWLOOP)
+
+#define NDS32_EXT_DSP_P() (TARGET_EXT_DSP && !TARGET_FORCE_NO_EXT_DSP)
+
 /* Get alignment according to mode or type information.
    When 'type' is nonnull, there is no need to look at 'mode'.  */
 #define NDS32_MODE_TYPE_ALIGN(mode, type) \
@@ -159,21 +175,28 @@ enum nds32_16bit_address_type
 /* This macro is used to return the register number for passing argument.
    We need to obey the following rules:
      1. If it is required MORE THAN one register,
-        we need to further check if it really needs to be
-        aligned on double words.
-          a) If double word alignment is necessary,
-             the register number must be even value.
-          b) Otherwise, the register number can be odd or even value.
+	we need to further check if it really needs to be
+	aligned on double words.
+	  a) If double word alignment is necessary,
+	     the register number must be even value.
+	  b) Otherwise, the register number can be odd or even value.
      2. If it is required ONLY one register,
-        the register number can be odd or even value.  */
-#define NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG(reg_offset, mode, type)  \
-  ((NDS32_NEED_N_REGS_FOR_ARG (mode, type) > 1)                     \
-   ? ((NDS32_MODE_TYPE_ALIGN (mode, type) > PARM_BOUNDARY)          \
-      ? (((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM + 1) & ~1)      \
-      : ((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM))                \
+	the register number can be odd or even value.  */
+#define NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG(reg_offset, mode, type) \
+  ((NDS32_NEED_N_REGS_FOR_ARG (mode, type) > 1)                    \
+   ? ((NDS32_MODE_TYPE_ALIGN (mode, type) > PARM_BOUNDARY)         \
+      ? (((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM + 1) & ~1)     \
+      : ((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM))               \
    : ((reg_offset) + NDS32_GPR_ARG_FIRST_REGNUM))

-/* This macro is to check if there are still available registers
+#define NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG(reg_offset, mode, type) \
+  ((NDS32_NEED_N_REGS_FOR_ARG (mode, type) > 1)                    \
+   ? ((NDS32_MODE_TYPE_ALIGN (mode, type) > PARM_BOUNDARY)         \
+      ? (((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM + 1) & ~1)     \
+      : ((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM))               \
+   : ((reg_offset) + NDS32_FPR_ARG_FIRST_REGNUM))
+
+/* These two macros are to check if there are still available registers
    for passing argument, which must be entirely in registers.  */
 #define NDS32_ARG_ENTIRE_IN_GPR_REG_P(reg_offset, mode, type)   \
   ((NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (reg_offset, mode, type) \
@@ -181,13 +204,23 @@ enum nds32_16bit_address_type
    <= (NDS32_GPR_ARG_FIRST_REGNUM                               \
        + NDS32_MAX_GPR_REGS_FOR_ARGS))

-/* This macro is to check if there are still available registers
+#define NDS32_ARG_ENTIRE_IN_FPR_REG_P(reg_offset, mode, type)   \
+  ((NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (reg_offset, mode, type) \
+    + NDS32_NEED_N_REGS_FOR_ARG (mode, type))                   \
+   <= (NDS32_FPR_ARG_FIRST_REGNUM                               \
+       + NDS32_MAX_FPR_REGS_FOR_ARGS))
+
+/* These two macros are to check if there are still available registers
    for passing argument, either entirely in registers or partially
    in registers.  */
 #define NDS32_ARG_PARTIAL_IN_GPR_REG_P(reg_offset, mode, type) \
   (NDS32_AVAILABLE_REGNUM_FOR_GPR_ARG (reg_offset, mode, type) \
    < NDS32_GPR_ARG_FIRST_REGNUM + NDS32_MAX_GPR_REGS_FOR_ARGS)

+#define NDS32_ARG_PARTIAL_IN_FPR_REG_P(reg_offset, mode, type) \
+  (NDS32_AVAILABLE_REGNUM_FOR_FPR_ARG (reg_offset, mode, type) \
+   < NDS32_FPR_ARG_FIRST_REGNUM + NDS32_MAX_FPR_REGS_FOR_ARGS)
+
 /* This macro is to check if the register is required to be saved on stack.
    If call_used_regs[regno] == 0, regno is the callee-saved register.
    If df_regs_ever_live_p(regno) == true, it is used in the current function.
@@ -196,6 +229,19 @@ enum nds32_16bit_address_type
 #define NDS32_REQUIRED_CALLEE_SAVED_P(regno)                  \
   ((!call_used_regs[regno]) && (df_regs_ever_live_p (regno)))

+/* This macro is to check if the push25/pop25 are available to be used
+   for code generation.  Because pop25 also performs return behavior,
+   the instructions may not be available for some cases.
+   If we want to use push25/pop25, all the following conditions must
+   be satisfied:
+     1. TARGET_V3PUSH is set.
+     2. Current function is not an ISR function.
+     3. Current function is not a variadic function.  */
+#define NDS32_V3PUSH_AVAILABLE_P  \
+  (TARGET_V3PUSH \
+   && !nds32_isr_function_p (current_function_decl) \
+   && (cfun->machine->va_args_size == 0))
+
 /* ------------------------------------------------------------------------ */

 /* A C structure for machine-specific, per-function data.
@@ -222,6 +268,10 @@ struct GTY(()) machine_function
      callee-saved registers.  */
   int callee_saved_gpr_regs_size;

+  /* Number of bytes on the stack for saving floating-point
+     callee-saved registers.  */
+  int callee_saved_fpr_regs_size;
+
   /* The padding bytes in callee-saved area may be required.  */
   int callee_saved_area_gpr_padding_bytes;

@@ -230,26 +280,57 @@ struct GTY(()) machine_function
   /* The last required general purpose callee-saved register.  */
   int callee_saved_last_gpr_regno;

+  /* The first required floating-point callee-saved register.  */
+  int callee_saved_first_fpr_regno;
+  /* The last required floating-point callee-saved register.  */
+  int callee_saved_last_fpr_regno;
+
   /* The padding bytes in varargs area may be required.  */
   int va_args_area_padding_bytes;
-
   /* The first required register that should be saved on stack for va_args.  */
   int va_args_first_regno;
   /* The last required register that should be saved on stack for va_args.  */
   int va_args_last_regno;

+  /* Number of bytes on the stack for saving exception handling registers.  */
+  int eh_return_data_regs_size;
+  /* The first register of passing exception handling information.  */
+  int eh_return_data_first_regno;
+  /* The last register of passing exception handling information.  */
+  int eh_return_data_last_regno;
+
+  /* Indicate whether this function
+     calls __builtin_eh_return.  */
+  int use_eh_return_p;
+
   /* Indicate that whether this function needs
      prologue/epilogue code generation.  */
   int naked_p;
   /* Indicate that whether this function
      uses fp_as_gp optimization.  */
   int fp_as_gp_p;
+  /* Indicate whether this function is in the strictly aligned
+     situation for legitimate address checking.  This flag informs
+     nds32_legitimate_address_p() how to treat offset alignment:
+       1. The IVOPT phase needs to detect available range for memory access,
+	  such as checking [base + 32767] ~ [base + (-32768)].
+	  For this case we do not want address to be strictly aligned.
+       2. The rtl lowering and optimization are close to target code.
+	  For this case we need address to be strictly aligned.  */
+  int strict_aligned_p;
+
+  /* Record the status of two similar attributes.  */
+  int attr_naked_p;
+  int attr_no_prologue_p;
+  /* Record the hwloop group, used in the reorg pass.  */
+  int hwloop_group_id;
 };

 /* A C structure that contains the arguments information.  */
 typedef struct
 {
   unsigned int gpr_offset;
+  unsigned int fpr_offset;
 } nds32_cumulative_args;

 /* ------------------------------------------------------------------------ */
@@ -288,7 +369,8 @@ enum nds32_isr_nested_type
 {
   NDS32_NESTED,
   NDS32_NOT_NESTED,
-  NDS32_NESTED_READY
+  NDS32_NESTED_READY,
+  NDS32_CRITICAL
 };

 /* Define structure to record isr information.
@@ -316,6 +398,13 @@ struct nds32_isr_info
      unless user specifies attribute to change it.  */
   enum nds32_isr_nested_type nested_type;

+  /* Secure ISR level.
+     Currently security levels 0-3 are supported.
+     It should be set to 0 by default.
+     For security processors, this is determined by the secure
+     attribute or compiler options.  */
+  unsigned int security_level;
+
   /* Total vectors.
      The total vectors = interrupt + exception numbers + reset.
      It should be set to 0 by default.
@@ -340,19 +429,477 @@ enum nds32_builtins
 {
   NDS32_BUILTIN_ISYNC,
   NDS32_BUILTIN_ISB,
+  NDS32_BUILTIN_DSB,
+  NDS32_BUILTIN_MSYNC_ALL,
+  NDS32_BUILTIN_MSYNC_STORE,
   NDS32_BUILTIN_MFSR,
   NDS32_BUILTIN_MFUSR,
   NDS32_BUILTIN_MTSR,
+  NDS32_BUILTIN_MTSR_ISB,
+  NDS32_BUILTIN_MTSR_DSB,
   NDS32_BUILTIN_MTUSR,
   NDS32_BUILTIN_SETGIE_EN,
-  NDS32_BUILTIN_SETGIE_DIS
+  NDS32_BUILTIN_SETGIE_DIS,
+  NDS32_BUILTIN_FMFCFG,
+  NDS32_BUILTIN_FMFCSR,
+  NDS32_BUILTIN_FMTCSR,
+  NDS32_BUILTIN_FCPYNSS,
+  NDS32_BUILTIN_FCPYSS,
+  NDS32_BUILTIN_FCPYNSD,
+  NDS32_BUILTIN_FCPYSD,
+  NDS32_BUILTIN_FABSS,
+  NDS32_BUILTIN_FABSD,
+  NDS32_BUILTIN_FSQRTS,
+  NDS32_BUILTIN_FSQRTD,
+  NDS32_BUILTIN_ABS,
+  NDS32_BUILTIN_AVE,
+  NDS32_BUILTIN_BCLR,
+  NDS32_BUILTIN_BSET,
+  NDS32_BUILTIN_BTGL,
+  NDS32_BUILTIN_BTST,
+  NDS32_BUILTIN_CLIP,
+  NDS32_BUILTIN_CLIPS,
+  NDS32_BUILTIN_CLZ,
+  NDS32_BUILTIN_CLO,
+  NDS32_BUILTIN_MAX,
+  NDS32_BUILTIN_MIN,
+  NDS32_BUILTIN_PBSAD,
+  NDS32_BUILTIN_PBSADA,
+  NDS32_BUILTIN_BSE,
+  NDS32_BUILTIN_BSP,
+  NDS32_BUILTIN_FFB,
+  NDS32_BUILTIN_FFMISM,
+  NDS32_BUILTIN_FLMISM,
+  NDS32_BUILTIN_KADDW,
+  NDS32_BUILTIN_KSUBW,
+  NDS32_BUILTIN_KADDH,
+  NDS32_BUILTIN_KSUBH,
+  NDS32_BUILTIN_KDMBB,
+  NDS32_BUILTIN_V_KDMBB,
+  NDS32_BUILTIN_KDMBT,
+  NDS32_BUILTIN_V_KDMBT,
+  NDS32_BUILTIN_KDMTB,
+  NDS32_BUILTIN_V_KDMTB,
+  NDS32_BUILTIN_KDMTT,
+  NDS32_BUILTIN_V_KDMTT,
+  NDS32_BUILTIN_KHMBB,
+  NDS32_BUILTIN_V_KHMBB,
+  NDS32_BUILTIN_KHMBT,
+  NDS32_BUILTIN_V_KHMBT,
+  NDS32_BUILTIN_KHMTB,
+  NDS32_BUILTIN_V_KHMTB,
+  NDS32_BUILTIN_KHMTT,
+  NDS32_BUILTIN_V_KHMTT,
+  NDS32_BUILTIN_KSLRAW,
+  NDS32_BUILTIN_KSLRAW_U,
+  NDS32_BUILTIN_RDOV,
+  NDS32_BUILTIN_CLROV,
+  NDS32_BUILTIN_ROTR,
+  NDS32_BUILTIN_SVA,
+  NDS32_BUILTIN_SVS,
+  NDS32_BUILTIN_WSBH,
+  NDS32_BUILTIN_JR_ITOFF,
+  NDS32_BUILTIN_JR_TOFF,
+  NDS32_BUILTIN_JRAL_ITON,
+  NDS32_BUILTIN_JRAL_TON,
+  NDS32_BUILTIN_RET_ITOFF,
+  NDS32_BUILTIN_RET_TOFF,
+  NDS32_BUILTIN_STANDBY_NO_WAKE_GRANT,
+  NDS32_BUILTIN_STANDBY_WAKE_GRANT,
+  NDS32_BUILTIN_STANDBY_WAKE_DONE,
+  NDS32_BUILTIN_TEQZ,
+  NDS32_BUILTIN_TNEZ,
+  NDS32_BUILTIN_TRAP,
+  NDS32_BUILTIN_SETEND_BIG,
+  NDS32_BUILTIN_SETEND_LITTLE,
+  NDS32_BUILTIN_SYSCALL,
+  NDS32_BUILTIN_BREAK,
+  NDS32_BUILTIN_NOP,
+  NDS32_BUILTIN_SCHE_BARRIER,
+  NDS32_BUILTIN_GET_CURRENT_SP,
+  NDS32_BUILTIN_SET_CURRENT_SP,
+  NDS32_BUILTIN_RETURN_ADDRESS,
+  NDS32_BUILTIN_LLW,
+  NDS32_BUILTIN_LWUP,
+  NDS32_BUILTIN_LBUP,
+  NDS32_BUILTIN_SCW,
+  NDS32_BUILTIN_SWUP,
+  NDS32_BUILTIN_SBUP,
+  NDS32_BUILTIN_CCTL_VA_LCK,
+  NDS32_BUILTIN_CCTL_IDX_WBINVAL,
+  NDS32_BUILTIN_CCTL_VA_WBINVAL_L1,
+  NDS32_BUILTIN_CCTL_VA_WBINVAL_LA,
+  NDS32_BUILTIN_CCTL_IDX_READ,
+  NDS32_BUILTIN_CCTL_IDX_WRITE,
+  NDS32_BUILTIN_CCTL_L1D_INVALALL,
+  NDS32_BUILTIN_CCTL_L1D_WBALL_ALVL,
+  NDS32_BUILTIN_CCTL_L1D_WBALL_ONE_LVL,
+  NDS32_BUILTIN_DPREF_QW,
+  NDS32_BUILTIN_DPREF_HW,
+  NDS32_BUILTIN_DPREF_W,
+  NDS32_BUILTIN_DPREF_DW,
+  NDS32_BUILTIN_TLBOP_TRD,
+  NDS32_BUILTIN_TLBOP_TWR,
+  NDS32_BUILTIN_TLBOP_RWR,
+  NDS32_BUILTIN_TLBOP_RWLK,
+  NDS32_BUILTIN_TLBOP_UNLK,
+  NDS32_BUILTIN_TLBOP_PB,
+  NDS32_BUILTIN_TLBOP_INV,
+  NDS32_BUILTIN_TLBOP_FLUA,
+  NDS32_BUILTIN_UALOAD_HW,
+  NDS32_BUILTIN_UALOAD_W,
+  NDS32_BUILTIN_UALOAD_DW,
+  NDS32_BUILTIN_UASTORE_HW,
+  NDS32_BUILTIN_UASTORE_W,
+  NDS32_BUILTIN_UASTORE_DW,
+  NDS32_BUILTIN_GIE_DIS,
+  NDS32_BUILTIN_GIE_EN,
+  NDS32_BUILTIN_ENABLE_INT,
+  NDS32_BUILTIN_DISABLE_INT,
+  NDS32_BUILTIN_SET_PENDING_SWINT,
+  NDS32_BUILTIN_CLR_PENDING_SWINT,
+  NDS32_BUILTIN_CLR_PENDING_HWINT,
+  NDS32_BUILTIN_GET_ALL_PENDING_INT,
+  NDS32_BUILTIN_GET_PENDING_INT,
+  NDS32_BUILTIN_SET_INT_PRIORITY,
+  NDS32_BUILTIN_GET_INT_PRIORITY,
+  NDS32_BUILTIN_SET_TRIG_LEVEL,
+  NDS32_BUILTIN_SET_TRIG_EDGE,
+  NDS32_BUILTIN_GET_TRIG_TYPE,
+  NDS32_BUILTIN_SIGNATURE_BEGIN,
+  NDS32_BUILTIN_SIGNATURE_END,
+  NDS32_BUILTIN_DSP_BEGIN,
+  NDS32_BUILTIN_ADD16,
+  NDS32_BUILTIN_V_UADD16,
+  NDS32_BUILTIN_V_SADD16,
+  NDS32_BUILTIN_RADD16,
+  NDS32_BUILTIN_V_RADD16,
+  NDS32_BUILTIN_URADD16,
+  NDS32_BUILTIN_V_URADD16,
+  NDS32_BUILTIN_KADD16,
+  NDS32_BUILTIN_V_KADD16,
+  NDS32_BUILTIN_UKADD16,
+  NDS32_BUILTIN_V_UKADD16,
+  NDS32_BUILTIN_SUB16,
+  NDS32_BUILTIN_V_USUB16,
+  NDS32_BUILTIN_V_SSUB16,
+  NDS32_BUILTIN_RSUB16,
+  NDS32_BUILTIN_V_RSUB16,
+  NDS32_BUILTIN_URSUB16,
+  NDS32_BUILTIN_V_URSUB16,
+  NDS32_BUILTIN_KSUB16,
+  NDS32_BUILTIN_V_KSUB16,
+  NDS32_BUILTIN_UKSUB16,
+  NDS32_BUILTIN_V_UKSUB16,
+  NDS32_BUILTIN_CRAS16,
+  NDS32_BUILTIN_V_UCRAS16,
+  NDS32_BUILTIN_V_SCRAS16,
+  NDS32_BUILTIN_RCRAS16,
+  NDS32_BUILTIN_V_RCRAS16,
+  NDS32_BUILTIN_URCRAS16,
+  NDS32_BUILTIN_V_URCRAS16,
+  NDS32_BUILTIN_KCRAS16,
+  NDS32_BUILTIN_V_KCRAS16,
+  NDS32_BUILTIN_UKCRAS16,
+  NDS32_BUILTIN_V_UKCRAS16,
+  NDS32_BUILTIN_CRSA16,
+  NDS32_BUILTIN_V_UCRSA16,
+  NDS32_BUILTIN_V_SCRSA16,
+  NDS32_BUILTIN_RCRSA16,
+  NDS32_BUILTIN_V_RCRSA16,
+  NDS32_BUILTIN_URCRSA16,
+  NDS32_BUILTIN_V_URCRSA16,
+  NDS32_BUILTIN_KCRSA16,
+  NDS32_BUILTIN_V_KCRSA16,
+  NDS32_BUILTIN_UKCRSA16,
+  NDS32_BUILTIN_V_UKCRSA16,
+  NDS32_BUILTIN_ADD8,
+  NDS32_BUILTIN_V_UADD8,
+  NDS32_BUILTIN_V_SADD8,
+  NDS32_BUILTIN_RADD8,
+  NDS32_BUILTIN_V_RADD8,
+  NDS32_BUILTIN_URADD8,
+  NDS32_BUILTIN_V_URADD8,
+  NDS32_BUILTIN_KADD8,
+  NDS32_BUILTIN_V_KADD8,
+  NDS32_BUILTIN_UKADD8,
+  NDS32_BUILTIN_V_UKADD8,
+  NDS32_BUILTIN_SUB8,
+  NDS32_BUILTIN_V_USUB8,
+  NDS32_BUILTIN_V_SSUB8,
+  NDS32_BUILTIN_RSUB8,
+  NDS32_BUILTIN_V_RSUB8,
+  NDS32_BUILTIN_URSUB8,
+  NDS32_BUILTIN_V_URSUB8,
+  NDS32_BUILTIN_KSUB8,
+  NDS32_BUILTIN_V_KSUB8,
+  NDS32_BUILTIN_UKSUB8,
+  NDS32_BUILTIN_V_UKSUB8,
+  NDS32_BUILTIN_SRA16,
+  NDS32_BUILTIN_V_SRA16,
+  NDS32_BUILTIN_SRA16_U,
+  NDS32_BUILTIN_V_SRA16_U,
+  NDS32_BUILTIN_SRL16,
+  NDS32_BUILTIN_V_SRL16,
+  NDS32_BUILTIN_SRL16_U,
+  NDS32_BUILTIN_V_SRL16_U,
+  NDS32_BUILTIN_SLL16,
+  NDS32_BUILTIN_V_SLL16,
+  NDS32_BUILTIN_KSLL16,
+  NDS32_BUILTIN_V_KSLL16,
+  NDS32_BUILTIN_KSLRA16,
+  NDS32_BUILTIN_V_KSLRA16,
+  NDS32_BUILTIN_KSLRA16_U,
+  NDS32_BUILTIN_V_KSLRA16_U,
+  NDS32_BUILTIN_CMPEQ16,
+  NDS32_BUILTIN_V_SCMPEQ16,
+  NDS32_BUILTIN_V_UCMPEQ16,
+  NDS32_BUILTIN_SCMPLT16,
+  NDS32_BUILTIN_V_SCMPLT16,
+  NDS32_BUILTIN_SCMPLE16,
+  NDS32_BUILTIN_V_SCMPLE16,
+  NDS32_BUILTIN_UCMPLT16,
+  NDS32_BUILTIN_V_UCMPLT16,
+  NDS32_BUILTIN_UCMPLE16,
+  NDS32_BUILTIN_V_UCMPLE16,
+  NDS32_BUILTIN_CMPEQ8,
+  NDS32_BUILTIN_V_SCMPEQ8,
+  NDS32_BUILTIN_V_UCMPEQ8,
+  NDS32_BUILTIN_SCMPLT8,
+  NDS32_BUILTIN_V_SCMPLT8,
+  NDS32_BUILTIN_SCMPLE8,
+  NDS32_BUILTIN_V_SCMPLE8,
+  NDS32_BUILTIN_UCMPLT8,
+  NDS32_BUILTIN_V_UCMPLT8,
+  NDS32_BUILTIN_UCMPLE8,
+  NDS32_BUILTIN_V_UCMPLE8,
+  NDS32_BUILTIN_SMIN16,
+  NDS32_BUILTIN_V_SMIN16,
+  NDS32_BUILTIN_UMIN16,
+  NDS32_BUILTIN_V_UMIN16,
+  NDS32_BUILTIN_SMAX16,
+  NDS32_BUILTIN_V_SMAX16,
+  NDS32_BUILTIN_UMAX16,
+  NDS32_BUILTIN_V_UMAX16,
+  NDS32_BUILTIN_SCLIP16,
+  NDS32_BUILTIN_V_SCLIP16,
+  NDS32_BUILTIN_UCLIP16,
+  NDS32_BUILTIN_V_UCLIP16,
+  NDS32_BUILTIN_KHM16,
+  NDS32_BUILTIN_V_KHM16,
+  NDS32_BUILTIN_KHMX16,
+  NDS32_BUILTIN_V_KHMX16,
+  NDS32_BUILTIN_KABS16,
+  NDS32_BUILTIN_V_KABS16,
+  NDS32_BUILTIN_SMIN8,
+  NDS32_BUILTIN_V_SMIN8,
+  NDS32_BUILTIN_UMIN8,
+  NDS32_BUILTIN_V_UMIN8,
+  NDS32_BUILTIN_SMAX8,
+  NDS32_BUILTIN_V_SMAX8,
+  NDS32_BUILTIN_UMAX8,
+  NDS32_BUILTIN_V_UMAX8,
+  NDS32_BUILTIN_KABS8,
+  NDS32_BUILTIN_V_KABS8,
+  NDS32_BUILTIN_SUNPKD810,
+  NDS32_BUILTIN_V_SUNPKD810,
+  NDS32_BUILTIN_SUNPKD820,
+  NDS32_BUILTIN_V_SUNPKD820,
+  NDS32_BUILTIN_SUNPKD830,
+  NDS32_BUILTIN_V_SUNPKD830,
+  NDS32_BUILTIN_SUNPKD831,
+  NDS32_BUILTIN_V_SUNPKD831,
+  NDS32_BUILTIN_ZUNPKD810,
+  NDS32_BUILTIN_V_ZUNPKD810,
+  NDS32_BUILTIN_ZUNPKD820,
+  NDS32_BUILTIN_V_ZUNPKD820,
+  NDS32_BUILTIN_ZUNPKD830,
+  NDS32_BUILTIN_V_ZUNPKD830,
+  NDS32_BUILTIN_ZUNPKD831,
+  NDS32_BUILTIN_V_ZUNPKD831,
+  NDS32_BUILTIN_RADDW,
+  NDS32_BUILTIN_URADDW,
+  NDS32_BUILTIN_RSUBW,
+  NDS32_BUILTIN_URSUBW,
+  NDS32_BUILTIN_SRA_U,
+  NDS32_BUILTIN_KSLL,
+  NDS32_BUILTIN_PKBB16,
+  NDS32_BUILTIN_V_PKBB16,
+  NDS32_BUILTIN_PKBT16,
+  NDS32_BUILTIN_V_PKBT16,
+  NDS32_BUILTIN_PKTB16,
+  NDS32_BUILTIN_V_PKTB16,
+  NDS32_BUILTIN_PKTT16,
+  NDS32_BUILTIN_V_PKTT16,
+  NDS32_BUILTIN_SMMUL,
+  NDS32_BUILTIN_SMMUL_U,
+  NDS32_BUILTIN_KMMAC,
+  NDS32_BUILTIN_KMMAC_U,
+  NDS32_BUILTIN_KMMSB,
+  NDS32_BUILTIN_KMMSB_U,
+  NDS32_BUILTIN_KWMMUL,
+  NDS32_BUILTIN_KWMMUL_U,
+  NDS32_BUILTIN_SMMWB,
+  NDS32_BUILTIN_V_SMMWB,
+  NDS32_BUILTIN_SMMWB_U,
+  NDS32_BUILTIN_V_SMMWB_U,
+  NDS32_BUILTIN_SMMWT,
+  NDS32_BUILTIN_V_SMMWT,
+  NDS32_BUILTIN_SMMWT_U,
+  NDS32_BUILTIN_V_SMMWT_U,
+  NDS32_BUILTIN_KMMAWB,
+  NDS32_BUILTIN_V_KMMAWB,
+  NDS32_BUILTIN_KMMAWB_U,
+  NDS32_BUILTIN_V_KMMAWB_U,
+  NDS32_BUILTIN_KMMAWT,
+  NDS32_BUILTIN_V_KMMAWT,
+  NDS32_BUILTIN_KMMAWT_U,
+  NDS32_BUILTIN_V_KMMAWT_U,
+  NDS32_BUILTIN_SMBB,
+  NDS32_BUILTIN_V_SMBB,
+  NDS32_BUILTIN_SMBT,
+  NDS32_BUILTIN_V_SMBT,
+  NDS32_BUILTIN_SMTT,
+  NDS32_BUILTIN_V_SMTT,
+  NDS32_BUILTIN_KMDA,
+  NDS32_BUILTIN_V_KMDA,
+  NDS32_BUILTIN_KMXDA,
+  NDS32_BUILTIN_V_KMXDA,
+  NDS32_BUILTIN_SMDS,
+  NDS32_BUILTIN_V_SMDS,
+  NDS32_BUILTIN_SMDRS,
+  NDS32_BUILTIN_V_SMDRS,
+  NDS32_BUILTIN_SMXDS,
+  NDS32_BUILTIN_V_SMXDS,
+  NDS32_BUILTIN_KMABB,
+  NDS32_BUILTIN_V_KMABB,
+  NDS32_BUILTIN_KMABT,
+  NDS32_BUILTIN_V_KMABT,
+  NDS32_BUILTIN_KMATT,
+  NDS32_BUILTIN_V_KMATT,
+  NDS32_BUILTIN_KMADA,
+  NDS32_BUILTIN_V_KMADA,
+  NDS32_BUILTIN_KMAXDA,
+  NDS32_BUILTIN_V_KMAXDA,
+  NDS32_BUILTIN_KMADS,
+  NDS32_BUILTIN_V_KMADS,
+  NDS32_BUILTIN_KMADRS,
+  NDS32_BUILTIN_V_KMADRS,
+  NDS32_BUILTIN_KMAXDS,
+  NDS32_BUILTIN_V_KMAXDS,
+  NDS32_BUILTIN_KMSDA,
+  NDS32_BUILTIN_V_KMSDA,
+  NDS32_BUILTIN_KMSXDA,
+  NDS32_BUILTIN_V_KMSXDA,
+  NDS32_BUILTIN_SMAL,
+  NDS32_BUILTIN_V_SMAL,
+  NDS32_BUILTIN_BITREV,
+  NDS32_BUILTIN_WEXT,
+  NDS32_BUILTIN_BPICK,
+  NDS32_BUILTIN_INSB,
+  NDS32_BUILTIN_SADD64,
+  NDS32_BUILTIN_UADD64,
+  NDS32_BUILTIN_RADD64,
+  NDS32_BUILTIN_URADD64,
+  NDS32_BUILTIN_KADD64,
+  NDS32_BUILTIN_UKADD64,
+  NDS32_BUILTIN_SSUB64,
+  NDS32_BUILTIN_USUB64,
+  NDS32_BUILTIN_RSUB64,
+  NDS32_BUILTIN_URSUB64,
+  NDS32_BUILTIN_KSUB64,
+  NDS32_BUILTIN_UKSUB64,
+  NDS32_BUILTIN_SMAR64,
+  NDS32_BUILTIN_SMSR64,
+  NDS32_BUILTIN_UMAR64,
+  NDS32_BUILTIN_UMSR64,
+  NDS32_BUILTIN_KMAR64,
+  NDS32_BUILTIN_KMSR64,
+  NDS32_BUILTIN_UKMAR64,
+  NDS32_BUILTIN_UKMSR64,
+  NDS32_BUILTIN_SMALBB,
+  NDS32_BUILTIN_V_SMALBB,
+  NDS32_BUILTIN_SMALBT,
+  NDS32_BUILTIN_V_SMALBT,
+  NDS32_BUILTIN_SMALTT,
+  NDS32_BUILTIN_V_SMALTT,
+  NDS32_BUILTIN_SMALDA,
+  NDS32_BUILTIN_V_SMALDA,
+  NDS32_BUILTIN_SMALXDA,
+  NDS32_BUILTIN_V_SMALXDA,
+  NDS32_BUILTIN_SMALDS,
+  NDS32_BUILTIN_V_SMALDS,
+  NDS32_BUILTIN_SMALDRS,
+  NDS32_BUILTIN_V_SMALDRS,
+  NDS32_BUILTIN_SMALXDS,
+  NDS32_BUILTIN_V_SMALXDS,
+  NDS32_BUILTIN_SMUL16,
+  NDS32_BUILTIN_V_SMUL16,
+  NDS32_BUILTIN_SMULX16,
+  NDS32_BUILTIN_V_SMULX16,
+  NDS32_BUILTIN_UMUL16,
+  NDS32_BUILTIN_V_UMUL16,
+  NDS32_BUILTIN_UMULX16,
+  NDS32_BUILTIN_V_UMULX16,
+  NDS32_BUILTIN_SMSLDA,
+  NDS32_BUILTIN_V_SMSLDA,
+  NDS32_BUILTIN_SMSLXDA,
+  NDS32_BUILTIN_V_SMSLXDA,
+  NDS32_BUILTIN_UCLIP32,
+  NDS32_BUILTIN_SCLIP32,
+  NDS32_BUILTIN_KABS,
+  NDS32_BUILTIN_UALOAD_U16,
+  NDS32_BUILTIN_UALOAD_S16,
+  NDS32_BUILTIN_UALOAD_U8,
+  NDS32_BUILTIN_UALOAD_S8,
+  NDS32_BUILTIN_UASTORE_U16,
+  NDS32_BUILTIN_UASTORE_S16,
+  NDS32_BUILTIN_UASTORE_U8,
+  NDS32_BUILTIN_UASTORE_S8,
+  NDS32_BUILTIN_DSP_END,
+  NDS32_BUILTIN_NO_HWLOOP,
+  NDS32_BUILTIN_UNALIGNED_FEATURE,
+  NDS32_BUILTIN_ENABLE_UNALIGNED,
+  NDS32_BUILTIN_DISABLE_UNALIGNED,
+  NDS32_BUILTIN_COUNT
 };

 /* ------------------------------------------------------------------------ */

-#define TARGET_ISA_V2   (nds32_arch_option == ARCH_V2)
-#define TARGET_ISA_V3   (nds32_arch_option == ARCH_V3)
-#define TARGET_ISA_V3M  (nds32_arch_option == ARCH_V3M)
+#define TARGET_ISR_VECTOR_SIZE_4_BYTE \
+  (nds32_isr_vector_size == 4)
+
+#define TARGET_ISA_V2 \
+  (nds32_arch_option == ARCH_V2 || nds32_arch_option == ARCH_V2J)
+#define TARGET_ISA_V3 \
+  (nds32_arch_option == ARCH_V3 \
+   || nds32_arch_option == ARCH_V3J \
+   || nds32_arch_option == ARCH_V3F \
+   || nds32_arch_option == ARCH_V3S)
+#define TARGET_ISA_V3M \
+  (nds32_arch_option == ARCH_V3M \
+   || nds32_arch_option == ARCH_V3M_PLUS)
+
+#define TARGET_ISA_V3M_PLUS \
+  (nds32_arch_option == ARCH_V3M_PLUS)
+
+#define TARGET_PIPELINE_N7 \
+  (nds32_cpu_option == CPU_N7)
+#define TARGET_PIPELINE_N8 \
+  (nds32_cpu_option == CPU_N6 \
+   || nds32_cpu_option == CPU_N8)
+#define TARGET_PIPELINE_N9 \
+  (nds32_cpu_option == CPU_N9)
+#define TARGET_PIPELINE_N10 \
+  (nds32_cpu_option == CPU_N10)
+#define TARGET_PIPELINE_N13 \
+  (nds32_cpu_option == CPU_N12 || nds32_cpu_option == CPU_N13)
+#define TARGET_PIPELINE_GRAYWOLF \
+  (nds32_cpu_option == CPU_GRAYWOLF)
+#define TARGET_PIPELINE_PANTHER \
+  (nds32_cpu_option == CPU_PANTHER)
+#define TARGET_PIPELINE_SIMPLE \
+  (nds32_cpu_option == CPU_SIMPLE)

 #define TARGET_CMODEL_SMALL \
    (nds32_cmodel_option == CMODEL_SMALL)
@@ -361,55 +908,153 @@ enum nds32_builtins
 #define TARGET_CMODEL_LARGE \
    (nds32_cmodel_option == CMODEL_LARGE)

+#define TARGET_ICT_MODEL_SMALL \
+   (nds32_ict_model == ICT_MODEL_SMALL)
+
+#define TARGET_ICT_MODEL_LARGE \
+   (nds32_ict_model == ICT_MODEL_LARGE)
+
 /* When -mcmodel=small or -mcmodel=medium,
    compiler may generate gp-base instruction directly.  */
 #define TARGET_GP_DIRECT \
    (nds32_cmodel_option == CMODEL_SMALL\
     || nds32_cmodel_option == CMODEL_MEDIUM)

-#define TARGET_SOFT_FLOAT 1
-#define TARGET_HARD_FLOAT 0
+/* There are three kinds of mul configurations:
+   1-cycle fast mul, 2-cycle fast mul, and slow mul operation.  */
+#define TARGET_MUL_FAST_1 \
+  (nds32_mul_config == MUL_TYPE_FAST_1)
+#define TARGET_MUL_FAST_2 \
+  (nds32_mul_config == MUL_TYPE_FAST_2)
+#define TARGET_MUL_SLOW \
+  (nds32_mul_config == MUL_TYPE_SLOW)
+
+/* Run-time Target Specification.  */
+#define TARGET_SOFT_FLOAT (nds32_abi == NDS32_ABI_V2)
+/* Use hardware floating point calling convention.  */
+#define TARGET_HARD_FLOAT (nds32_abi == NDS32_ABI_V2_FP_PLUS)
+
+/* Record arch version in TARGET_ARCH_DEFAULT.  0 means soft ABI,
+   1 means hard ABI and using full floating-point instructions,
+   2 means hard ABI and only using single-precision floating-point
+   instructions.  */
+#if TARGET_ARCH_DEFAULT == 1
+#  define TARGET_DEFAULT_ABI NDS32_ABI_V2_FP_PLUS
+#  define TARGET_DEFAULT_FPU_ISA (MASK_FPU_DOUBLE | MASK_FPU_SINGLE)
+#  define TARGET_DEFAULT_FPU_FMA 0
+#else
+#  if TARGET_ARCH_DEFAULT == 2
+#    define TARGET_DEFAULT_ABI NDS32_ABI_V2_FP_PLUS
+#    define TARGET_DEFAULT_FPU_ISA MASK_FPU_SINGLE
+#    define TARGET_DEFAULT_FPU_FMA 0
+#  else
+#    define TARGET_DEFAULT_ABI NDS32_ABI_V2
+#    define TARGET_DEFAULT_FPU_ISA 0
+#    define TARGET_DEFAULT_FPU_FMA 0
+#  endif
+#endif
+
+#define TARGET_CONFIG_FPU_DEFAULT NDS32_CONFIG_FPU_2
+
+#define TARGET_LMWSMW_OPT_AUTO \
+   (flag_lmwsmw_cost == LMWSMW_OPT_AUTO)
+
+#define TARGET_LMWSMW_OPT_SIZE \
+   (flag_lmwsmw_cost == LMWSMW_OPT_SIZE)
+
+#define TARGET_LMWSMW_OPT_SPEED \
+   (flag_lmwsmw_cost == LMWSMW_OPT_SPEED)
+
+#define TARGET_LMWSMW_OPT_ALL \
+   (flag_lmwsmw_cost == LMWSMW_OPT_ALL)
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef TARGET_DEFAULT_RELAX
+#  define NDS32_RELAX_SPEC " %{!mno-relax:--relax}"
+#else
+#  define NDS32_RELAX_SPEC " %{mrelax:--relax}"
+#endif
+
+#ifdef TARGET_OS_DEFAULT_IFC
+#  define NDS32_IFC_SPEC " %{Os3|Os|mifc:%{!mno-ifc:--mifc}}"
+#else
+#  define NDS32_IFC_SPEC " %{mifc:--mifc}"
+#endif
+#define NDS32_IFC_V3M_PLUS_SPEC " %{march=v3m+:%{Os3|Os|mifc:%{!mno-ifc:-mifc}}}"
+
+#ifdef TARGET_OS_DEFAULT_EX9
+#  define NDS32_EX9_SPEC " %{Os3|Os|mex9:%{!mno-ex9:--mex9}}"
+#else
+#  define NDS32_EX9_SPEC " %{mex9:--mex9}"
+#endif
+#define NDS32_EX9_V3M_PLUS_SPEC " %{march=v3m+:%{Os3|Os|mex9:%{!mno-ex9:-mex9}}}"
+
+#ifdef TARGET_DEFAULT_EXT_DSP
+#  define NDS32_EXT_DSP_SPEC " %{!mno-ext-dsp:-mext-dsp}"
+#else
+#  define NDS32_EXT_DSP_SPEC ""
+#endif
+
+#ifdef TARGET_DEFAULT_HWLOOP
+#  define NDS32_HWLOOP_SPEC " %{!mno-ext-zol:-mext-zol}"
+#else
+#  define NDS32_HWLOOP_SPEC ""
+#endif
+
+#ifdef TARGET_DEFAULT_16BIT
+#  define NDS32_16BIT_SPEC " %{!mno-16-bit:%{!mno-16bit:-m16bit}}"
+#else
+#  define NDS32_16BIT_SPEC " %{!m16-bit:%{!m16bit:-mno-16bit}}"
+#endif

 /* ------------------------------------------------------------------------ */

 /* Controlling the Compilation Driver.  */

+#define DRIVER_SELF_SPECS \
+  " %{mno-16bit|mno-16-bit:-mno-ifc -mno-ex9}" \
+  NDS32_IFC_V3M_PLUS_SPEC \
+  NDS32_EX9_V3M_PLUS_SPEC \
+  NDS32_16BIT_SPEC
+
 #define OPTION_DEFAULT_SPECS \
-  {"arch", "%{!march=*:-march=%(VALUE)}" }
+  {"arch", " %{!march=*:-march=%(VALUE)}" \
+	   " %{march=v3f:%{!mfloat-abi=*:-mfloat-abi=hard}" \
+	   " %{!mno-ext-fpu-sp:%{!mext-fpu-sp:-mext-fpu-sp}}" \
+	   " %{!mno-ext-fpu-dp:%{!mext-fpu-dp:-mext-fpu-dp}}}" \
+	   " %{march=v3s:%{!mfloat-abi=*:-mfloat-abi=hard}" \
+	   " %{!mno-ext-fpu-sp:%{!mext-fpu-sp:-mext-fpu-sp}}}" }, \
+  {"cpu",  "%{!mcpu=*:-mcpu=%(VALUE)}" },   \
+  {"memory_model", "%{!mmemory-model=*:-mmemory-model=%(VALUE)}"}, \
+  {"float", "%{!mfloat-abi=*:-mfloat-abi=%(VALUE)}" }

 #define CC1_SPEC \
-  ""
+  " %{Os1:-Os -mno-ifc -mno-ex9;" \
+     "Os2:-Os -minnermost-loop;" \
+     "Os3:-Os}" \
+  " %{ffast-math:%{!mno-soft-fp-arith-comm:-msoft-fp-arith-comm}}" \
+  NDS32_EXT_DSP_SPEC \
+  NDS32_HWLOOP_SPEC

 #define ASM_SPEC \
-  " %{mbig-endian:-EB} %{mlittle-endian:-EL}"
-
-/* If user issues -mrelax, we need to pass '--relax' to linker.  */
-#define LINK_SPEC \
   " %{mbig-endian:-EB} %{mlittle-endian:-EL}" \
-  " %{mrelax:--relax}"
-
-#define LIB_SPEC \
-  " -lc -lgloss"
-
-/* The option -mno-ctor-dtor can disable constructor/destructor feature
-   by applying different crt stuff.  In the convention, crt0.o is the
-   startup file without constructor/destructor;
-   crt1.o, crti.o, crtbegin.o, crtend.o, and crtn.o are the
-   startup files with constructor/destructor.
-   Note that crt0.o, crt1.o, crti.o, and crtn.o are provided
-   by newlib/mculib/glibc/ublic, while crtbegin.o and crtend.o are
-   currently provided by GCC for nds32 target.
-
-   For nds32 target so far:
-   If -mno-ctor-dtor, we are going to link
-   "crt0.o [user objects]".
-   If general cases, we are going to link
-   "crt1.o crtbegin1.o [user objects] crtend1.o".  */
-#define STARTFILE_SPEC \
-  " %{!mno-ctor-dtor:crt1.o%s;:crt0.o%s}" \
-  " %{!mno-ctor-dtor:crtbegin1.o%s}"
-#define ENDFILE_SPEC \
-  " %{!mno-ctor-dtor:crtend1.o%s}"
+  " %{march=*:-march=%*}" \
+  " %{mno-16-bit|mno-16bit:-mno-16bit-ext}" \
+  " %{march=v3m:%{!mfull-regs:%{!mreduced-regs:-mreduced-regs}}}" \
+  " %{mfull-regs:-mno-reduced-regs}" \
+  " %{mreduced-regs:-mreduced-regs}" \
+  " %{mabi=*:-mabi=v%*}" \
+  " %{mconfig-fpu=*:-mfpu-freg=%*}" \
+  " %{mext-fpu-mac:-mmac}" \
+  " %{mno-ext-fpu-mac:-mno-mac}" \
+  " %{mext-fpu-sp:-mfpu-sp-ext}" \
+  " %{mno-ext-fpu-sp:-mno-fpu-sp-ext}" \
+  " %{mext-fpu-dp:-mfpu-dp-ext}" \
+  " %{mno-ext-fpu-dp:-mno-fpu-dp-ext}" \
+  " %{mext-dsp:-mdsp-ext}" \
+  " %{mext-zol:-mzol-ext}" \
+  " %{O|O1|O2|O3|Ofast:-O1;:-Os}"

 /* The TARGET_BIG_ENDIAN_DEFAULT is defined if we
    configure gcc with --target=nds32be-* setting.
@@ -422,7 +1067,11 @@ enum nds32_builtins

 /* Currently we only have elf toolchain,
    where -mcmodel=medium is always the default.  */
-#define NDS32_CMODEL_DEFAULT "mcmodel=medium"
+#if TARGET_ELF
+#  define NDS32_CMODEL_DEFAULT "mcmodel=medium"
+#else
+#  define NDS32_CMODEL_DEFAULT "mcmodel=medium"
+#endif

 #define MULTILIB_DEFAULTS \
   { NDS32_ENDIAN_DEFAULT, NDS32_CMODEL_DEFAULT }
@@ -430,34 +1079,8 @@ enum nds32_builtins

 /* Run-time Target Specification.  */

-#define TARGET_CPU_CPP_BUILTINS()                     \
-  do                                                  \
-    {                                                 \
-      builtin_define ("__nds32__");                   \
-                                                      \
-      if (TARGET_ISA_V2)                              \
-        builtin_define ("__NDS32_ISA_V2__");          \
-      if (TARGET_ISA_V3)                              \
-        builtin_define ("__NDS32_ISA_V3__");          \
-      if (TARGET_ISA_V3M)                             \
-        builtin_define ("__NDS32_ISA_V3M__");         \
-                                                      \
-      if (TARGET_BIG_ENDIAN)                          \
-        builtin_define ("__big_endian__");            \
-      if (TARGET_REDUCED_REGS)                        \
-        builtin_define ("__NDS32_REDUCED_REGS__");    \
-      if (TARGET_CMOV)                                \
-        builtin_define ("__NDS32_CMOV__");            \
-      if (TARGET_PERF_EXT)                            \
-        builtin_define ("__NDS32_PERF_EXT__");        \
-      if (TARGET_16_BIT)                              \
-        builtin_define ("__NDS32_16_BIT__");          \
-      if (TARGET_GP_DIRECT)                           \
-        builtin_define ("__NDS32_GP_DIRECT__");       \
-                                                      \
-      builtin_assert ("cpu=nds32");                   \
-      builtin_assert ("machine=nds32");               \
-    } while (0)
+#define TARGET_CPU_CPP_BUILTINS() \
+  nds32_cpu_cpp_builtins (pfile)


 /* Defining Data Structures for Per-function Information.  */
@@ -487,10 +1110,20 @@ enum nds32_builtins

 #define STACK_BOUNDARY 64

-#define FUNCTION_BOUNDARY 32
+#define FUNCTION_BOUNDARY \
+  ((NDS32_ALIGN_P () || TARGET_ALIGN_FUNCTION) ? (TARGET_PIPELINE_PANTHER ? 64 : 32) : 16)

 #define BIGGEST_ALIGNMENT 64

+#define DATA_ALIGNMENT(constant, basic_align) \
+  nds32_data_alignment (constant, basic_align)
+
+#define CONSTANT_ALIGNMENT(constant, basic_align) \
+  nds32_constant_alignment (constant, basic_align)
+
+#define LOCAL_ALIGNMENT(type, basic_align) \
+  nds32_local_alignment (type, basic_align)
+
 #define EMPTY_FIELD_BOUNDARY 32

 #define STRUCTURE_SIZE_BOUNDARY 8
@@ -515,8 +1148,8 @@ enum nds32_builtins

 #define SIZE_TYPE "long unsigned int"
 #define PTRDIFF_TYPE "long int"
-#define WCHAR_TYPE "short unsigned int"
-#define WCHAR_TYPE_SIZE 16
+#define WCHAR_TYPE "unsigned int"
+#define WCHAR_TYPE_SIZE 32


 /* Register Usage.  */
@@ -526,7 +1159,7 @@ enum nds32_builtins
    from 0 to just below FIRST_PSEUDO_REGISTER.
    All registers that the compiler knows about must be given numbers,
    even those that are not normally considered general registers.  */
-#define FIRST_PSEUDO_REGISTER 34
+#define FIRST_PSEUDO_REGISTER 101

 /* An initializer that says which registers are used for fixed
    purposes all throughout the compiled code and are therefore
@@ -537,24 +1170,38 @@ enum nds32_builtins
    $r30 : $lp
    $r31 : $sp

-   caller-save registers: $r0 ~ $r5, $r16 ~ $r23
-   callee-save registers: $r6 ~ $r10, $r11 ~ $r14
+   caller-save registers: $r0 ~ $r5, $r16 ~ $r23, $fs0 ~ $fs5, $fs22 ~ $fs47
+   callee-save registers: $r6 ~ $r10, $r11 ~ $r14, $fs6 ~ $fs21, $fs48 ~ $fs63

    reserved for assembler : $r15
    reserved for other use : $r24, $r25, $r26, $r27 */
-#define FIXED_REGISTERS                 \
-{ /* r0  r1  r2  r3  r4  r5  r6  r7  */ \
-      0,  0,  0,  0,  0,  0,  0,  0,    \
-  /* r8  r9  r10 r11 r12 r13 r14 r15 */ \
-      0,  0,  0,  0,  0,  0,  0,  1,    \
-  /* r16 r17 r18 r19 r20 r21 r22 r23 */ \
-      0,  0,  0,  0,  0,  0,  0,  0,    \
-  /* r24 r25 r26 r27 r28 r29 r30 r31 */ \
-      1,  1,  1,  1,  0,  1,  0,  1,    \
-  /* ARG_POINTER:32 */                  \
-      1,                                \
-  /* FRAME_POINTER:33 */                \
-      1                                 \
+#define FIXED_REGISTERS \
+{ /* r0   r1   r2   r3   r4   r5   r6   r7   */ \
+      0,   0,   0,   0,   0,   0,   0,   0,     \
+  /* r8   r9   r10  r11  r12  r13  r14  r15  */ \
+      0,   0,   0,   0,   0,   0,   0,   0,     \
+  /* r16  r17  r18  r19  r20  r21  r22  r23  */ \
+      0,   0,   0,   0,   0,   0,   0,   0,     \
+  /* r24  r25  r26  r27  r28  r29  r30  r31  */ \
+      0,   0,   1,   1,   0,   1,   0,   1,     \
+  /* AP   FP   fs0  fs1  fs2  fs3  fs4  fs5  */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs6  fs7  fs8  fs9  fs10 fs11 fs12 fs13 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs14 fs15 fs16 fs17 fs18 fs19 fs20 fs21 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs22 fs23 fs24 fs25 fs26 fs27 fs28 fs29 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs30 fs31 fd16      fd17      fd18      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd19      fd20      fd21      fd22      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd23      fd24      fd25      fd26      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd27      fd28      fd29      fd30      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd31      LB   LE   LC                  */ \
+      1,   1,   1,   1,   1                     \
 }

 /* Identifies the registers that are not available for
@@ -563,35 +1210,59 @@ enum nds32_builtins

    0 : callee-save registers
    1 : caller-save registers */
-#define CALL_USED_REGISTERS             \
-{ /* r0  r1  r2  r3  r4  r5  r6  r7  */ \
-      1,  1,  1,  1,  1,  1,  0,  0,    \
-  /* r8  r9  r10 r11 r12 r13 r14 r15 */ \
-      0,  0,  0,  0,  0,  0,  0,  1,    \
-  /* r16 r17 r18 r19 r20 r21 r22 r23 */ \
-      1,  1,  1,  1,  1,  1,  1,  1,    \
-  /* r24 r25 r26 r27 r28 r29 r30 r31 */ \
-      1,  1,  1,  1,  0,  1,  0,  1,    \
-  /* ARG_POINTER:32 */                  \
-      1,                                \
-  /* FRAME_POINTER:33 */                \
-      1                                 \
+#define CALL_USED_REGISTERS \
+{ /* r0   r1   r2   r3   r4   r5   r6   r7   */ \
+      1,   1,   1,   1,   1,   1,   0,   0,     \
+  /* r8   r9   r10  r11  r12  r13  r14  r15  */ \
+      0,   0,   0,   0,   0,   0,   0,   1,     \
+  /* r16  r17  r18  r19  r20  r21  r22  r23  */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* r24  r25  r26  r27  r28  r29  r30  r31  */ \
+      1,   1,   1,   1,   0,   1,   0,   1,     \
+  /* AP   FP   fs0  fs1  fs2  fs3  fs4  fs5  */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs6  fs7  fs8  fs9  fs10 fs11 fs12 fs13 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs14 fs15 fs16 fs17 fs18 fs19 fs20 fs21 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs22 fs23 fs24 fs25 fs26 fs27 fs28 fs29 */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fs30 fs31 fd16      fd17      fd18      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd19      fd20      fd21      fd22      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd23      fd24      fd25      fd26      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd27      fd28      fd29      fd30      */ \
+      1,   1,   1,   1,   1,   1,   1,   1,     \
+  /* fd31      LB   LE   LC                  */ \
+      1,   1,   1,   1,   1                     \
 }

 /* In nds32 target, we have three levels of registers:
      LOW_COST_REGS    : $r0 ~ $r7
      MIDDLE_COST_REGS : $r8 ~ $r11, $r16 ~ $r19
      HIGH_COST_REGS   : $r12 ~ $r14, $r20 ~ $r31 */
-#define REG_ALLOC_ORDER           \
-{                                 \
-   0,  1,  2,  3,  4,  5,  6,  7, \
-   8,  9, 10, 11, 16, 17, 18, 19, \
-  12, 13, 14, 15, 20, 21, 22, 23, \
-  24, 25, 26, 27, 28, 29, 30, 31, \
-  32,                             \
-  33                              \
+#define REG_ALLOC_ORDER \
+{   0,   1,   2,   3,   4,   5,   6,   7, \
+   16,  17,  18,  19,   9,  10,  11,  12, \
+   13,  14,  8,   15,  20,  21,  22,  23, \
+   24,  25,  26,  27,  28,  29,  30,  31, \
+   32,  33,  34,  35,  36,  37,  38,  39, \
+   40,  41,  42,  43,  44,  45,  46,  47, \
+   48,  49,  50,  51,  52,  53,  54,  55, \
+   56,  57,  58,  59,  60,  61,  62,  63, \
+   64,  65,  66,  67,  68,  69,  70,  71, \
+   72,  73,  74,  75,  76,  77,  78,  79, \
+   80,  81,  82,  83,  84,  85,  86,  87, \
+   88,  89,  90,  91,  92,  93,  94,  95, \
+   96,  97,  98,  99, 100,                \
 }

+/* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
+   to be rearranged based on optimizing for speed or size.  */
+#define ADJUST_REG_ALLOC_ORDER nds32_adjust_reg_alloc_order ()
+
 /* Tell IRA to use the order we define rather than messing it up with its
    own cost calculations.  */
 #define HONOR_REG_ALLOC_ORDER optimize_size
@@ -609,11 +1280,7 @@ enum nds32_builtins
    Define this macro to return nonzero in as many cases as possible
    since doing so will allow GCC to perform better register allocation.
    We can use general registers to tie QI/HI/SI modes together.  */
-#define MODES_TIEABLE_P(mode1, mode2)          \
-  (GET_MODE_CLASS (mode1) == MODE_INT          \
-   && GET_MODE_CLASS (mode2) == MODE_INT       \
-   && GET_MODE_SIZE (mode1) <= UNITS_PER_WORD  \
-   && GET_MODE_SIZE (mode2) <= UNITS_PER_WORD)
+#define MODES_TIEABLE_P(mode1, mode2) nds32_modes_tieable_p (mode1, mode2)


 /* Register Classes.  */
@@ -628,13 +1295,18 @@ enum nds32_builtins
 enum reg_class
 {
   NO_REGS,
+  R5_REG,
+  R8_REG,
   R15_TA_REG,
   STACK_REG,
+  FRAME_POINTER_REG,
   LOW_REGS,
   MIDDLE_REGS,
   HIGH_REGS,
   GENERAL_REGS,
   FRAME_REGS,
+  FP_REGS,
+  LOOP_REGS,
   ALL_REGS,
   LIM_REG_CLASSES
 };
@@ -644,27 +1316,50 @@ enum reg_class
 #define REG_CLASS_NAMES \
 {                       \
   "NO_REGS",            \
+  "R5_REG",             \
+  "R8_REG",             \
   "R15_TA_REG",         \
   "STACK_REG",          \
+  "FRAME_POINTER_REG",  \
   "LOW_REGS",           \
   "MIDDLE_REGS",        \
   "HIGH_REGS",          \
   "GENERAL_REGS",       \
   "FRAME_REGS",         \
+  "FP_REGS",            \
+  "LOOP_REGS",          \
   "ALL_REGS"            \
 }

 #define REG_CLASS_CONTENTS \
-{                                                            \
-  {0x00000000, 0x00000000}, /* NO_REGS     :              */ \
-  {0x00008000, 0x00000000}, /* R15_TA_REG  : 15           */ \
-  {0x80000000, 0x00000000}, /* STACK_REG   : 31           */ \
-  {0x000000ff, 0x00000000}, /* LOW_REGS    : 0-7          */ \
-  {0x000f0fff, 0x00000000}, /* MIDDLE_REGS : 0-11, 16-19  */ \
-  {0xfff07000, 0x00000000}, /* HIGH_REGS   : 12-14, 20-31 */ \
-  {0xffffffff, 0x00000000}, /* GENERAL_REGS: 0-31         */ \
-  {0x00000000, 0x00000003}, /* FRAME_REGS  : 32, 33       */ \
-  {0xffffffff, 0x00000003}  /* ALL_REGS    : 0-31, 32, 33 */ \
+{ /* NO_REGS                                    */  \
+  {0x00000000, 0x00000000, 0x00000000, 0x00000000}, \
+  /* R5_REG              : 5                    */  \
+  {0x00000020, 0x00000000, 0x00000000, 0x00000000}, \
+  /* R8_REG              : 8                    */  \
+  {0x00000100, 0x00000000, 0x00000000, 0x00000000}, \
+  /* R15_TA_REG          : 15                   */  \
+  {0x00008000, 0x00000000, 0x00000000, 0x00000000}, \
+  /* STACK_REG           : 31                   */  \
+  {0x80000000, 0x00000000, 0x00000000, 0x00000000}, \
+  /* FRAME_POINTER_REG   : 28                   */  \
+  {0x10000000, 0x00000000, 0x00000000, 0x00000000}, \
+  /* LOW_REGS            : 0-7                  */  \
+  {0x000000ff, 0x00000000, 0x00000000, 0x00000000}, \
+  /* MIDDLE_REGS         : 0-11, 16-19          */  \
+  {0x000f0fff, 0x00000000, 0x00000000, 0x00000000}, \
+  /* HIGH_REGS           : 12-14, 20-31         */  \
+  {0xfff07000, 0x00000000, 0x00000000, 0x00000000}, \
+  /* GENERAL_REGS        : 0-31                 */  \
+  {0xffffffff, 0x00000000, 0x00000000, 0x00000000}, \
+  /* FRAME_REGS          : 32, 33               */  \
+  {0x00000000, 0x00000003, 0x00000000, 0x00000000}, \
+  /* FP_REGS             : 34-97                */  \
+  {0x00000000, 0xfffffffc, 0xffffffff, 0x00000003}, \
+  /* LOOP_REGS           : 98-100               */  \
+  {0x00000000, 0x00000000, 0x00000000, 0x0000001c}, \
+  /* ALL_REGS            : 0-100                */  \
+  {0xffffffff, 0xffffffff, 0xffffffff, 0x0000001f}  \
 }

 #define REGNO_REG_CLASS(regno) nds32_regno_reg_class (regno)
@@ -672,13 +1367,18 @@ enum reg_class
 #define BASE_REG_CLASS GENERAL_REGS
 #define INDEX_REG_CLASS GENERAL_REGS

+#define TEST_REGNO(R, TEST, VALUE) \
+  ((R TEST VALUE) || ((unsigned) reg_renumber[R] TEST VALUE))
+
 /* Return nonzero if it is suitable for use as a
    base register in operand addresses.
    So far, we return nonzero only if "num" is a hard reg
    of the suitable class or a pseudo register which is
    allocated to a suitable hard reg.  */
 #define REGNO_OK_FOR_BASE_P(num) \
-  ((num) < 32 || (unsigned) reg_renumber[num] < 32)
+  (TEST_REGNO (num, <, 32) \
+   || TEST_REGNO (num, ==, FRAME_POINTER_REGNUM) \
+   || TEST_REGNO (num, ==, ARG_POINTER_REGNUM))

 /* Return nonzero if it is suitable for use as a
    index register in operand addresses.
@@ -688,7 +1388,15 @@ enum reg_class
    The difference between an index register and a base register is that
    the index register may be scaled.  */
 #define REGNO_OK_FOR_INDEX_P(num) \
-  ((num) < 32 || (unsigned) reg_renumber[num] < 32)
+  (TEST_REGNO (num, <, 32) \
+   || TEST_REGNO (num, ==, FRAME_POINTER_REGNUM) \
+   || TEST_REGNO (num, ==, ARG_POINTER_REGNUM))
+
+/* Don't spill double-precision register to two single-precision registers.  */
+#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \
+ ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)        \
+  && GET_MODE_SIZE (FROM) != GET_MODE_SIZE (TO)   \
+  ? reg_classes_intersect_p (CLASS, FP_REGS) : 0)


 /* Obsolete Macros for Defining Constraints.  */
@@ -707,6 +1415,11 @@ enum reg_class
 #define FIRST_PARM_OFFSET(fundecl) \
   (NDS32_DOUBLE_WORD_ALIGN_P (crtl->args.pretend_args_size) ? 0 : 4)

+/* A C expression whose value is RTL representing the address in a stack frame
+   where the pointer to the caller's frame is stored.  */
+#define DYNAMIC_CHAIN_ADDRESS(frameaddr) \
+  nds32_dynamic_chain_address (frameaddr)
+
 #define RETURN_ADDR_RTX(count, frameaddr) \
   nds32_return_addr_rtx (count, frameaddr)

@@ -718,6 +1431,15 @@ enum reg_class
 #define INCOMING_RETURN_ADDR_RTX    gen_rtx_REG (Pmode, LP_REGNUM)
 #define DWARF_FRAME_RETURN_COLUMN   DWARF_FRAME_REGNUM (LP_REGNUM)

+/* Use $r0 $r1 to pass exception handling information.  */
+#define EH_RETURN_DATA_REGNO(N) (((N) < 2) ? (N) : INVALID_REGNUM)
+/* The register $r2 that represents a location in which to store a stack
+   adjustment to be applied before function return.
+   This is used to unwind the stack to an exception handler's call frame.  */
+#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, 2)
+
+#define DBX_REGISTER_NUMBER(REGNO) nds32_dbx_register_number (REGNO)
+
 #define STACK_POINTER_REGNUM SP_REGNUM

 #define FRAME_POINTER_REGNUM 33
@@ -746,12 +1468,11 @@ enum reg_class
 #define INIT_CUMULATIVE_ARGS(cum, fntype, libname, fndecl, n_named_args) \
   nds32_init_cumulative_args (&cum, fntype, libname, fndecl, n_named_args)

-/* The REGNO is an unsigned integer but NDS32_GPR_ARG_FIRST_REGNUM may be 0.
-   We better cast REGNO into signed integer so that we can avoid
-   'comparison of unsigned expression >= 0 is always true' warning.  */
-#define FUNCTION_ARG_REGNO_P(regno)                                        \
-  (((int) regno - NDS32_GPR_ARG_FIRST_REGNUM >= 0)                         \
-   && ((int) regno - NDS32_GPR_ARG_FIRST_REGNUM < NDS32_MAX_GPR_REGS_FOR_ARGS))
+#define FUNCTION_ARG_REGNO_P(regno)                                           \
+ (IN_RANGE ((regno), NDS32_FIRST_GPR_REGNUM, NDS32_MAX_GPR_REGS_FOR_ARGS - 1) \
+  || ((TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE)                                \
+      && IN_RANGE ((regno), NDS32_FPR_ARG_FIRST_REGNUM,                       \
+		   NDS32_FIRST_FPR_REGNUM + NDS32_MAX_FPR_REGS_FOR_ARGS - 1)))

 #define DEFAULT_PCC_STRUCT_RETURN 0

@@ -763,7 +1484,15 @@ enum reg_class
 #define EXIT_IGNORE_STACK 1

 #define FUNCTION_PROFILER(file, labelno) \
-  fprintf (file, "/* profiler %d */", (labelno))
+  fprintf (file, "/* profiler %d */\n", (labelno))
+
+#define PROFILE_HOOK(LABEL)                                             \
+  {                                                                     \
+    rtx fun, lp;                                                        \
+    lp = get_hard_reg_initial_val (Pmode, LP_REGNUM);                   \
+    fun = gen_rtx_SYMBOL_REF (Pmode, "_mcount");                        \
+    emit_library_call (fun, LCT_NORMAL, VOIDmode, 1, lp, Pmode);        \
+  }


 /* Implementing the Varargs Macros.  */
@@ -780,13 +1509,13 @@ enum reg_class
    The trampoline code for nds32 target must contains following parts:

      1. instructions (4 * 4 = 16 bytes):
-          get $pc first
-          load chain_value to static chain register via $pc
-          load nested function address to $r15 via $pc
-          jump to desired nested function via $r15
+	  get $pc first
+	  load chain_value to static chain register via $pc
+	  load nested function address to $r15 via $pc
+	  jump to desired nested function via $r15
      2. data (4 * 2 = 8 bytes):
-          chain_value
-          nested function address
+	  chain_value
+	  nested function address

    Please check nds32.c implementation for more information.  */
 #define TRAMPOLINE_SIZE 24
@@ -811,9 +1540,22 @@ enum reg_class
 /* We have "LW.bi   Rt, [Ra], Rb" instruction form.  */
 #define HAVE_POST_MODIFY_REG  1

-#define CONSTANT_ADDRESS_P(x) (CONSTANT_P (x) && GET_CODE (x) != CONST_DOUBLE)
+#define USE_LOAD_POST_INCREMENT(mode) \
+  (GET_MODE_SIZE (mode) <= GET_MODE_SIZE(DImode))
+#define USE_LOAD_POST_DECREMENT(mode) \
+  (GET_MODE_SIZE (mode) <= GET_MODE_SIZE(DImode))
+#define USE_STORE_POST_DECREMENT(mode) USE_LOAD_POST_DECREMENT(mode)
+#define USE_STORE_POST_INCREMENT(mode) USE_LOAD_POST_INCREMENT(mode)
+
+#define CONSTANT_ADDRESS_P(x) \
+  (CONSTANT_P (x) && memory_address_p (GET_MODE (x), x))

-#define MAX_REGS_PER_ADDRESS 2
+/* CONST_DOUBLE is legal without TARGET_FPU in legitimate_constant_p.
+   Therefore, let it be a legal PIC operand and split it later.  */
+#define LEGITIMATE_PIC_OPERAND_P(x) \
+  (GET_CODE (x) != CONST_DOUBLE || !(TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE))
+
+#define MAX_REGS_PER_ADDRESS 3


 /* Anchored Addresses.  */
@@ -827,7 +1569,11 @@ enum reg_class
 /* A C expression for the cost of a branch instruction.
    A value of 1 is the default;
    other values are interpreted relative to that.  */
-#define BRANCH_COST(speed_p, predictable_p) ((speed_p) ? 2 : 0)
+#define BRANCH_COST(speed_p, predictable_p) ((speed_p) ? 2 : 1)
+
+/* Override BRANCH_COST heuristic which empirically produces worse
+   performance for removing short circuiting from the logical ops.  */
+#define LOGICAL_OP_NON_SHORT_CIRCUIT 0

 #define SLOW_BYTE_ACCESS 1

@@ -857,12 +1603,17 @@ enum reg_class

 #define PIC_OFFSET_TABLE_REGNUM GP_REGNUM

+#define SYMBOLIC_CONST_P(X)	\
+(GET_CODE (X) == SYMBOL_REF						\
+ || GET_CODE (X) == LABEL_REF						\
+ || (GET_CODE (X) == CONST && symbolic_reference_mentioned_p (X)))
+

 /* Defining the Output Assembler Language.  */

 #define ASM_COMMENT_START "!"

-#define ASM_APP_ON "! #APP"
+#define ASM_APP_ON "! #APP\n"

 #define ASM_APP_OFF "! #NO_APP\n"

@@ -877,14 +1628,77 @@ enum reg_class

 #define LOCAL_LABEL_PREFIX "."

-#define REGISTER_NAMES                                            \
-{                                                                 \
-  "$r0",  "$r1",  "$r2",  "$r3",  "$r4",  "$r5",  "$r6",  "$r7",  \
+#define REGISTER_NAMES \
+{ "$r0",  "$r1",  "$r2",  "$r3",  "$r4",  "$r5",  "$r6",  "$r7",  \
   "$r8",  "$r9",  "$r10", "$r11", "$r12", "$r13", "$r14", "$ta",  \
   "$r16", "$r17", "$r18", "$r19", "$r20", "$r21", "$r22", "$r23", \
   "$r24", "$r25", "$r26", "$r27", "$fp",  "$gp",  "$lp",  "$sp",  \
-  "$AP",                                                          \
-  "$SFP"                                                          \
+  "$AP",  "$SFP", "$fs0", "$fs1", "$fs2", "$fs3", "$fs4", "$fs5", \
+  "$fs6", "$fs7", "$fs8", "$fs9", "$fs10","$fs11","$fs12","$fs13",\
+  "$fs14","$fs15","$fs16","$fs17","$fs18","$fs19","$fs20","$fs21",\
+  "$fs22","$fs23","$fs24","$fs25","$fs26","$fs27","$fs28","$fs29",\
+  "$fs30","$fs31","$fs32","$fs33","$fs34","$fs35","$fs36","$fs37",\
+  "$fs38","$fs39","$fs40","$fs41","$fs42","$fs43","$fs44","$fs45",\
+  "$fs46","$fs47","$fs48","$fs49","$fs50","$fs51","$fs52","$fs53",\
+  "$fs54","$fs55","$fs56","$fs57","$fs58","$fs59","$fs60","$fs61",\
+  "$fs62","$fs63",   "LB",   "LE",   "LC"                         \
+}
+
+#define ADDITIONAL_REGISTER_NAMES				\
+{								\
+  {"$r15", 15},							\
+  {"$r28", 28},	{"$r29", 29},	{"$r30", 30},	{"$r31", 31},	\
+  {"$a0", 0},	{"$a1", 1},	{"$a2", 2},			\
+  {"$a3", 3},	{"$a4", 4},	{"$a5", 5},			\
+  {"$s0", 6},	{"$s1", 7},	{"$s2", 8},	{"$s3", 9},	\
+  {"$s4", 10},	{"$s5", 11},	{"$s6", 12},	{"$s7", 13},	\
+  {"$s8", 14},							\
+  {"$t0", 16},	{"$t1", 17},	{"$t2", 18},	{"$t3", 19},	\
+  {"$t4", 20},	{"$t5", 21},	{"$t6", 22},	{"$t7", 23},	\
+  {"$t8", 24},	{"$t9", 25},					\
+  {"$p0", 26},	{"$p1", 27},					\
+  {"$h0", 0},	{"$h1", 1},	{"$h2", 2},	{"$h3", 3},	\
+  {"$h4", 4},	{"$h5", 5},	{"$h6", 6},	{"$h7", 7},	\
+  {"$h8", 8},	{"$h9", 9},	{"$h10", 10},	{"$h11", 11},	\
+  {"$h12", 16},	{"$h13", 17},	{"$h14", 18},	{"$h15", 19},	\
+  {"$o0", 0},	{"$o1", 1},	{"$o2", 2},	{"$o3", 3},	\
+  {"$o4", 4},	{"$o5", 5},	{"$o6", 6},	{"$o7", 7},	\
+}
+
+#define OVERLAPPING_REGISTER_NAMES		\
+{						\
+  {"$fd0",  NDS32_FIRST_FPR_REGNUM + 0,  2},	\
+  {"$fd1",  NDS32_FIRST_FPR_REGNUM + 2,  2},	\
+  {"$fd2",  NDS32_FIRST_FPR_REGNUM + 4,  2},	\
+  {"$fd3",  NDS32_FIRST_FPR_REGNUM + 6,  2},	\
+  {"$fd4",  NDS32_FIRST_FPR_REGNUM + 8,  2},	\
+  {"$fd5",  NDS32_FIRST_FPR_REGNUM + 10, 2},	\
+  {"$fd6",  NDS32_FIRST_FPR_REGNUM + 12, 2},	\
+  {"$fd7",  NDS32_FIRST_FPR_REGNUM + 14, 2},	\
+  {"$fd8",  NDS32_FIRST_FPR_REGNUM + 16, 2},	\
+  {"$fd9",  NDS32_FIRST_FPR_REGNUM + 18, 2},	\
+  {"$fd10", NDS32_FIRST_FPR_REGNUM + 20, 2},	\
+  {"$fd11", NDS32_FIRST_FPR_REGNUM + 22, 2},	\
+  {"$fd12", NDS32_FIRST_FPR_REGNUM + 24, 2},	\
+  {"$fd13", NDS32_FIRST_FPR_REGNUM + 26, 2},	\
+  {"$fd14", NDS32_FIRST_FPR_REGNUM + 28, 2},	\
+  {"$fd15", NDS32_FIRST_FPR_REGNUM + 30, 2},	\
+  {"$fd16", NDS32_FIRST_FPR_REGNUM + 32, 2},	\
+  {"$fd17", NDS32_FIRST_FPR_REGNUM + 34, 2},	\
+  {"$fd18", NDS32_FIRST_FPR_REGNUM + 36, 2},	\
+  {"$fd19", NDS32_FIRST_FPR_REGNUM + 38, 2},	\
+  {"$fd20", NDS32_FIRST_FPR_REGNUM + 40, 2},	\
+  {"$fd21", NDS32_FIRST_FPR_REGNUM + 42, 2},	\
+  {"$fd22", NDS32_FIRST_FPR_REGNUM + 44, 2},	\
+  {"$fd23", NDS32_FIRST_FPR_REGNUM + 46, 2},	\
+  {"$fd24", NDS32_FIRST_FPR_REGNUM + 48, 2},	\
+  {"$fd25", NDS32_FIRST_FPR_REGNUM + 50, 2},	\
+  {"$fd26", NDS32_FIRST_FPR_REGNUM + 52, 2},	\
+  {"$fd27", NDS32_FIRST_FPR_REGNUM + 54, 2},	\
+  {"$fd28", NDS32_FIRST_FPR_REGNUM + 56, 2},	\
+  {"$fd29", NDS32_FIRST_FPR_REGNUM + 58, 2},	\
+  {"$fd30", NDS32_FIRST_FPR_REGNUM + 60, 2},	\
+  {"$fd31", NDS32_FIRST_FPR_REGNUM + 62, 2},	\
 }

 /* Output normal jump table entry.  */
@@ -896,19 +1710,19 @@ enum reg_class
   do                                                                    \
     {                                                                   \
       switch (GET_MODE (body))                                          \
-        {                                                               \
-        case QImode:                                                    \
-          asm_fprintf (stream, "\t.byte\t.L%d-.L%d\n", value, rel);     \
-          break;                                                        \
-        case HImode:                                                    \
-          asm_fprintf (stream, "\t.short\t.L%d-.L%d\n", value, rel);    \
-          break;                                                        \
-        case SImode:                                                    \
-          asm_fprintf (stream, "\t.word\t.L%d-.L%d\n", value, rel);     \
-          break;                                                        \
-        default:                                                        \
-          gcc_unreachable();                                            \
-        }                                                               \
+	{                                                               \
+	case QImode:                                                    \
+	  asm_fprintf (stream, "\t.byte\t.L%d-.L%d\n", value, rel);     \
+	  break;                                                        \
+	case HImode:                                                    \
+	  asm_fprintf (stream, "\t.short\t.L%d-.L%d\n", value, rel);    \
+	  break;                                                        \
+	case SImode:                                                    \
+	  asm_fprintf (stream, "\t.word\t.L%d-.L%d\n", value, rel);     \
+	  break;                                                        \
+	default:                                                        \
+	  gcc_unreachable();                                            \
+	}                                                               \
     } while (0)

 /* We have to undef it first because elfos.h formerly define it
@@ -925,10 +1739,10 @@ enum reg_class
   do                                                   \
     {                                                  \
       /* Because our jump table is in text section,    \
-         we need to make sure 2-byte alignment after   \
-         the jump table for instructions fetch.  */    \
+	 we need to make sure 2-byte alignment after   \
+	 the jump table for instructions fetch.  */    \
       if (GET_MODE (PATTERN (table)) == QImode)        \
-        ASM_OUTPUT_ALIGN (stream, 1);                  \
+	ASM_OUTPUT_ALIGN (stream, 1);                  \
       asm_fprintf (stream, "\t! Jump Table End\n");    \
     }  while (0)

@@ -992,9 +1806,7 @@ enum reg_class
 /* Return the preferred mode for and addr_diff_vec when the mininum
    and maximum offset are known.  */
 #define CASE_VECTOR_SHORTEN_MODE(min_offset, max_offset, body)  \
-   ((min_offset < 0 || max_offset >= 0x2000 ) ? SImode          \
-   : (max_offset >= 100) ? HImode                               \
-   : QImode)
+  nds32_case_vector_shorten_mode (min_offset, max_offset, body)

 /* Generate pc relative jump table when -fpic or -Os.  */
 #define CASE_VECTOR_PC_RELATIVE (flag_pic || optimize_size)
@@ -1027,6 +1839,11 @@ enum reg_class
    when the condition is true.  */
 #define STORE_FLAG_VALUE 1

+/* A C expression that indicates whether the architecture defines a value for
+   clz with a zero operand.  The nds32 ISA spec defines the result of clz
+   for a zero operand to be 32.  */
+#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  ((VALUE) = 32, 1)
+
 /* An alias for the machine mode for pointers.  */
 #define Pmode SImode

diff --git a/gcc/config/nds32/nds32.md b/gcc/config/nds32/nds32.md
index 5cdd8b2..557c466 100644
--- a/gcc/config/nds32/nds32.md
+++ b/gcc/config/nds32/nds32.md
@@ -46,58 +46,144 @@
 ;; Include DImode/DFmode operations.
 (include "nds32-doubleword.md")

+;; Include floating-point patterns.
+(include "nds32-fpu.md")
+
 ;; Include peephole patterns.
 (include "nds32-peephole2.md")


+;; ------------------------------------------------------------------------
+
+;; CPU pipeline model.
+(define_attr "pipeline_model" "n7,n8,e8,n9,n10,graywolf,n13,panther,simple"
+  (const
+    (cond [(match_test "nds32_cpu_option == CPU_N7")  (const_string "n7")
+	   (match_test "nds32_cpu_option == CPU_N6 || nds32_cpu_option == CPU_N8")  (const_string "n8")
+	   (match_test "nds32_cpu_option == CPU_E8")  (const_string "e8")
+	   (match_test "nds32_cpu_option == CPU_N9")  (const_string "n9")
+	   (match_test "nds32_cpu_option == CPU_N10") (const_string "n10")
+	   (match_test "nds32_cpu_option == CPU_GRAYWOLF") (const_string "graywolf")
+	   (match_test "nds32_cpu_option == CPU_N12") (const_string "n13")
+	   (match_test "nds32_cpu_option == CPU_N13") (const_string "n13")
+	   (match_test "nds32_cpu_option == CPU_PANTHER") (const_string "panther")
+	   (match_test "nds32_cpu_option == CPU_SIMPLE") (const_string "simple")]
+	  (const_string "n9"))))
+
 ;; Insn type, it is used to default other attribute values.
 (define_attr "type"
-  "unknown,move,load,store,alu,compare,branch,call,misc"
+  "unknown,load,store,load_multiple,store_multiple,alu,alu_shift,pbsad,pbsada,mul,mac,div,branch,mmu,misc,\
+   falu,fmuls,fmuld,fmacs,fmacd,fdivs,fdivd,fsqrts,fsqrtd,fcmp,fabs,fcpy,fcmov,fmfsr,fmfdr,fmtsr,fmtdr,fload,fstore,\
+   dalu,dalu64,daluround,dcmp,dclip,dmul,dmac,dinsb,dpack,dbpick,dwext"
   (const_string "unknown"))

+;; Insn sub-type
+(define_attr "subtype"
+  "simple,shift,saturation"
+  (const_string "simple"))

 ;; Length, in bytes, default is 4-bytes.
 (define_attr "length" "" (const_int 4))

+;; Indicate the amount of micro instructions.
+(define_attr "combo"
+  "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25"
+  (const_string "1"))
+
+;; Feature set that an insn belongs to; used to enable/disable insn alternatives.
+;; v1  : Baseline Instructions
+;; v2  : Baseline Version 2 Instructions
+;; v3m : Baseline Version 3m Instructions
+;; v3  : Baseline Version 3 Instructions
+;; pe1 : Performance Extension Instructions
+;; pe2 : Performance Extension Version 2 Instructions
+;; se  : String Extension instructions
+(define_attr "feature"
+  "v1,v2,v3m,v3,pe1,pe2,se,fpu"
+  (const_string "v1"))

 ;; Enabled, which is used to enable/disable insn alternatives.
 ;; Note that we use length and TARGET_16_BIT here as criteria.
-;; If the instruction pattern already check TARGET_16_BIT to
-;; determine the length by itself, its enabled attribute should be
-;; always 1 to avoid the conflict with the settings here.
-(define_attr "enabled" ""
-  (cond [(and (eq_attr "length" "2")
-	      (match_test "!TARGET_16_BIT"))
-	 (const_int 0)]
-	(const_int 1)))
+;; If the instruction pattern already check TARGET_16_BIT to determine
+;; the length by itself, its enabled attribute should be customized to
+;; avoid the conflict between length attribute and this default setting.
+(define_attr "enabled" "no,yes"
+  (if_then_else
+    (and (eq_attr "length" "2")
+	 (match_test "!TARGET_16_BIT"))
+    (const_string "no")
+    (cond [(eq_attr "feature" "v1")   (const_string "yes")
+	   (eq_attr "feature" "v2")   (if_then_else (match_test "TARGET_ISA_V2 || TARGET_ISA_V3 || TARGET_ISA_V3M")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "v3")   (if_then_else (match_test "TARGET_ISA_V3")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "v3m")  (if_then_else (match_test "TARGET_ISA_V3 || TARGET_ISA_V3M")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "pe1")  (if_then_else (match_test "TARGET_EXT_PERF")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "pe2")  (if_then_else (match_test "TARGET_EXT_PERF2")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "se")   (if_then_else (match_test "TARGET_EXT_STRING")
+						    (const_string "yes")
+						    (const_string "no"))
+	   (eq_attr "feature" "fpu")  (if_then_else (match_test "TARGET_FPU_SINGLE || TARGET_FPU_DOUBLE")
+						    (const_string "yes")
+						    (const_string "no"))]
+	   (const_string "yes"))))


 ;; ----------------------------------------------------------------------------

+(include "nds32-dspext.md")

 ;; Move instructions.

 ;; For QImode and HImode, the immediate value can be fit in imm20s.
 ;; So there is no need to split rtx for QI and HI patterns.

-(define_expand "movqi"
-  [(set (match_operand:QI 0 "general_operand" "")
-	(match_operand:QI 1 "general_operand" ""))]
+(define_expand "mov<mode>"
+  [(set (match_operand:QIHI 0 "general_operand" "")
+	(match_operand:QIHI 1 "general_operand" ""))]
   ""
 {
   /* Need to force register if mem <- !reg.  */
   if (MEM_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (QImode, operands[1]);
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+
+  if (MEM_P (operands[1]) && optimize > 0)
+    {
+      rtx reg = gen_reg_rtx (SImode);
+
+      emit_insn (gen_zero_extend<mode>si2 (reg, operands[1]));
+      operands[1] = gen_lowpart (<MODE>mode, reg);
+    }
 })

-(define_expand "movhi"
-  [(set (match_operand:HI 0 "general_operand" "")
-	(match_operand:HI 1 "general_operand" ""))]
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:SIDI 0 "general_operand" "")
+	(match_operand:SIDI 1 "general_operand" ""))]
   ""
 {
-  /* Need to force register if mem <- !reg.  */
+  rtx addr;
   if (MEM_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (HImode, operands[1]);
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+
+  if (MEM_P (operands[0]))
+    {
+      addr = force_reg (Pmode, XEXP (operands[0], 0));
+      emit_insn (gen_unaligned_store<mode> (addr, operands[1]));
+    }
+  else
+    {
+      addr = force_reg (Pmode, XEXP (operands[1], 0));
+      emit_insn (gen_unaligned_load<mode> (operands[0], addr));
+    }
+  DONE;
 })

 (define_expand "movsi"
@@ -130,12 +216,33 @@
 						  low12_int));
       DONE;
     }
+
+  if (REG_P (operands[0]) && SYMBOLIC_CONST_P (operands[1]))
+    {
+      if (TARGET_ICT_MODEL_LARGE
+	  && nds32_indirect_call_referenced_p (operands[1]))
+	{
+	  nds32_expand_ict_move (operands);
+	  DONE;
+	}
+      else if (nds32_tls_referenced_p (operands [1]))
+	{
+	  nds32_expand_tls_move (operands);
+	  DONE;
+	}
+      else if (flag_pic)
+	{
+	  nds32_expand_pic_move (operands);
+	  DONE;
+	}
+    }
 })

 (define_insn "*mov<mode>"
-  [(set (match_operand:QIHISI 0 "nonimmediate_operand" "=r, r, U45, U33, U37, U45, m,   l,   l,   l,   d, r,    d,    r,    r,    r")
-	(match_operand:QIHISI 1 "nds32_move_operand"   " r, r,   l,   l,   l,   d, r, U45, U33, U37, U45, m, Ip05, Is05, Is20, Ihig"))]
-  ""
+  [(set (match_operand:QIHISI 0 "nonimmediate_operand" "=r, r,U45,U33,U37,U45, m,  l,  l,  l,  d,  d, r,   d,    r,    r,    r, *f, *f,  r, *f,  Q, A")
+	(match_operand:QIHISI 1 "nds32_move_operand"   " r, r,  l,  l,  l,  d, r,U45,U33,U37,U45,Ufe, m,Ip05, Is05, Is20, Ihig, *f,  r, *f,  Q, *f, r"))]
+  "register_operand(operands[0], <MODE>mode)
+   || register_operand(operands[1], <MODE>mode)"
 {
   switch (which_alternative)
     {
@@ -154,37 +261,54 @@
     case 8:
     case 9:
     case 10:
-      return nds32_output_16bit_load (operands, <byte>);
     case 11:
-      return nds32_output_32bit_load (operands, <byte>);
+      return nds32_output_16bit_load (operands, <byte>);
     case 12:
-      return "movpi45\t%0, %1";
+      return nds32_output_32bit_load (operands, <byte>);
     case 13:
-      return "movi55\t%0, %1";
+      return "movpi45\t%0, %1";
     case 14:
-      return "movi\t%0, %1";
+      return "movi55\t%0, %1";
     case 15:
+      return "movi\t%0, %1";
+    case 16:
       return "sethi\t%0, hi20(%1)";
+    case 17:
+      if (TARGET_FPU_SINGLE)
+	return "fcpyss\t%0, %1, %1";
+      else
+	return "#";
+    case 18:
+      return "fmtsr\t%1, %0";
+    case 19:
+      return "fmfsr\t%0, %1";
+    case 20:
+      return nds32_output_float_load (operands);
+    case 21:
+      return nds32_output_float_store (operands);
+    case 22:
+      return "mtusr\t%1, %0";
     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "alu,alu,store,store,store,store,store,load,load,load,load,load,alu,alu,alu,alu")
-   (set_attr "length" "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   4,  2,  2,  4,  4")])
+  [(set_attr "type"    "alu,alu,store,store,store,store,store,load,load,load,load,load,load,alu,alu,alu,alu,fcpy,fmtsr,fmfsr,fload,fstore,alu")
+   (set_attr "length"  "  2,  4,    2,    2,    2,    2,    4,   2,   2,   2,   2,   2,   4,  2,  2,  4,  4,   4,    4,    4,    4,     4,  4")
+   (set_attr "feature" " v1, v1,   v1,   v1,   v1,   v1,   v1,  v1,  v1,  v1,  v1, v3m,  v1, v1, v1, v1, v1, fpu,  fpu,  fpu,  fpu,   fpu, v1")])


 ;; We use nds32_symbolic_operand to limit that only CONST/SYMBOL_REF/LABEL_REF
 ;; are able to match such instruction template.
-(define_insn "*move_addr"
-  [(set (match_operand:SI 0 "register_operand"       "=l, r")
-	(match_operand:SI 1 "nds32_symbolic_operand" " i, i"))]
+(define_insn "move_addr"
+  [(set (match_operand:SI 0 "nds32_general_register_operand"   "=l, r")
+	(match_operand:SI 1 "nds32_nonunspec_symbolic_operand" " i, i"))]
   ""
   "la\t%0, %1"
-  [(set_attr "type" "move")
+  [(set_attr "type"  "alu")
    (set_attr "length"  "8")])


-(define_insn "*sethi"
+(define_insn "sethi"
   [(set (match_operand:SI 0 "register_operand"                "=r")
 	(high:SI (match_operand:SI 1 "nds32_symbolic_operand" " i")))]
   ""
@@ -193,7 +317,7 @@
    (set_attr "length" "4")])


-(define_insn "*lo_sum"
+(define_insn "lo_sum"
   [(set (match_operand:SI 0 "register_operand"                  "=r")
 	(lo_sum:SI (match_operand:SI 1 "register_operand"       " r")
 		   (match_operand:SI 2 "nds32_symbolic_operand" " i")))]
@@ -208,8 +332,8 @@
 ;; Zero extension instructions.

 (define_insn "zero_extend<mode>si2"
-  [(set (match_operand:SI 0 "register_operand"                       "=l, r,   l, *r")
-	(zero_extend:SI (match_operand:QIHI 1 "nonimmediate_operand" " l, r, U33,  m")))]
+  [(set (match_operand:SI 0 "register_operand"                       "=l, r,  l, *r")
+	(zero_extend:SI (match_operand:QIHI 1 "nonimmediate_operand" " l, r,U33,  m")))]
   ""
 {
   switch (which_alternative)
@@ -245,7 +369,7 @@
     case 1:
       return "se<size>\t%0, %1";
     case 2:
-      return nds32_output_32bit_load_s (operands, <byte>);
+      return nds32_output_32bit_load_se (operands, <byte>);

     default:
       gcc_unreachable ();
@@ -256,25 +380,70 @@


 ;; ----------------------------------------------------------------------------
+(define_expand "extv"
+  [(set (match_operand 0 "register_operand" "")
+        (sign_extract (match_operand 1 "nonimmediate_operand" "")
+                      (match_operand 2 "const_int_operand" "")
+                      (match_operand 3 "const_int_operand" "")))]
+  ""
+{
+  enum nds32_expand_result_type result = nds32_expand_extv (operands);
+  switch (result)
+    {
+    case EXPAND_DONE:
+      DONE;
+      break;
+    case EXPAND_FAIL:
+      FAIL;
+      break;
+    case EXPAND_CREATE_TEMPLATE:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+})
+
+(define_expand "insv"
+  [(set (zero_extract (match_operand 0 "nonimmediate_operand" "")
+                      (match_operand 1 "const_int_operand" "")
+                      (match_operand 2 "const_int_operand" ""))
+        (match_operand 3 "register_operand" ""))]
+  ""
+{
+  enum nds32_expand_result_type result = nds32_expand_insv (operands);
+  switch (result)
+    {
+    case EXPAND_DONE:
+      DONE;
+      break;
+    case EXPAND_FAIL:
+      FAIL;
+      break;
+    case EXPAND_CREATE_TEMPLATE:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+})

 ;; Arithmetic instructions.

-(define_insn "add<mode>3"
-  [(set (match_operand:QIHISI 0 "register_operand"                   "=   d,    l,    d,    l,  d, l,    k,    l,    r, r")
-	(plus:QIHISI (match_operand:QIHISI 1 "register_operand"      "%   0,    l,    0,    l,  0, l,    0,    k,    r, r")
-		     (match_operand:QIHISI 2 "nds32_rimm15s_operand" " In05, In03, Iu05, Iu03,  r, l, Is10, Iu06, Is15, r")))]
+(define_insn "addsi3"
+  [(set (match_operand:SI 0 "register_operand"               "=   d,   l,   d,   l, d,l,   k,   l,    r, r")
+	(plus:SI (match_operand:SI 1 "register_operand"      "%   0,   l,   0,   l, 0,l,   0,   k,    r, r")
+		 (match_operand:SI 2 "nds32_rimm15s_operand" " In05,In03,Iu05,Iu03, r,l,Is10,IU06, Is15, r")))]
   ""
 {
   switch (which_alternative)
     {
     case 0:
       /* addi Rt4,Rt4,-x  ==>  subi45 Rt4,x
-         where 0 <= x <= 31 */
+	 where 0 <= x <= 31 */
       operands[2] = gen_int_mode (-INTVAL (operands[2]), SImode);
       return "subi45\t%0, %2";
     case 1:
       /* addi Rt3,Ra3,-x  ==>  subi333 Rt3,Ra3,x
-         where 0 <= x <= 7 */
+	 where 0 <= x <= 7 */
       operands[2] = gen_int_mode (-INTVAL (operands[2]), SImode);
       return "subi333\t%0, %1, %2";
     case 2:
@@ -298,19 +467,20 @@
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "alu,alu,alu,alu,alu,alu,alu,alu,alu,alu")
-   (set_attr "length" "  2,  2,  2,  2,  2,  2,  2,  2,  4,  4")])
-
-(define_insn "sub<mode>3"
-  [(set (match_operand:QIHISI 0 "register_operand"                    "=d, l,    r, r")
-	(minus:QIHISI (match_operand:QIHISI 1 "nds32_rimm15s_operand" " 0, l, Is15, r")
-		      (match_operand:QIHISI 2 "register_operand"      " r, l,    r, r")))]
+  [(set_attr "type"    "alu,alu,alu,alu,alu,alu,alu,alu,alu,alu")
+   (set_attr "length"  "  2,  2,  2,  2,  2,  2,  2,  2,  4,  4")
+   (set_attr "feature" " v1, v1, v1, v1, v1, v1, v2, v1, v1, v1")])
+
+(define_insn "subsi3"
+  [(set (match_operand:SI 0 "register_operand"                "=d, l,    r, r")
+	(minus:SI (match_operand:SI 1 "nds32_rimm15s_operand" " 0, l, Is15, r")
+		  (match_operand:SI 2 "register_operand"      " r, l,    r, r")))]
   ""
   "@
-  sub45\t%0, %2
-  sub333\t%0, %1, %2
-  subri\t%0, %2, %1
-  sub\t%0, %1, %2"
+   sub45\t%0, %2
+   sub333\t%0, %1, %2
+   subri\t%0, %2, %1
+   sub\t%0, %1, %2"
   [(set_attr "type"   "alu,alu,alu,alu")
    (set_attr "length" "  2,  2,  4,  4")])

@@ -320,10 +490,10 @@
 ;; and needs to ensure it is exact_log2 value.
 (define_insn "*add_slli"
   [(set (match_operand:SI 0 "register_operand"                    "=r")
-        (plus:SI (mult:SI (match_operand:SI 1 "register_operand"  " r")
+	(plus:SI (mult:SI (match_operand:SI 1 "register_operand"  " r")
 			  (match_operand:SI 2 "immediate_operand" " i"))
 		 (match_operand:SI 3 "register_operand"           " r")))]
-  "TARGET_ISA_V3
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)
    && (exact_log2 (INTVAL (operands[2])) != -1)
    && (exact_log2 (INTVAL (operands[2])) <= 31)"
 {
@@ -333,18 +503,20 @@

   return "add_slli\t%0, %3, %1, %2";
 }
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "combo"        "2")
+   (set_attr "length"       "4")])

 (define_insn "*add_srli"
-  [(set (match_operand:SI 0 "register_operand"                        "=   r")
-	(plus:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"  "    r")
-			      (match_operand:SI 2 "immediate_operand" " Iu05"))
-		 (match_operand:SI 3 "register_operand"               "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                          "=   r")
+	(plus:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"    "    r")
+			      (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		 (match_operand:SI 3 "register_operand"                 "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "add_srli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "combo"        "2")
+   (set_attr "length"       "4")])


 ;; GCC intends to simplify (minus (reg) (ashift ...))
@@ -355,7 +527,7 @@
 	(minus:SI (match_operand:SI 1 "register_operand"           " r")
 		  (mult:SI (match_operand:SI 2 "register_operand"  " r")
 			   (match_operand:SI 3 "immediate_operand" " i"))))]
-  "TARGET_ISA_V3
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)
    && (exact_log2 (INTVAL (operands[3])) != -1)
    && (exact_log2 (INTVAL (operands[3])) <= 31)"
 {
@@ -365,32 +537,35 @@

   return "sub_slli\t%0, %1, %2, %3";
 }
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "combo"        "2")
+   (set_attr "length"       "4")])

 (define_insn "*sub_srli"
-  [(set (match_operand:SI 0 "register_operand"                         "=   r")
-	(minus:SI (match_operand:SI 1 "register_operand"               "    r")
-		  (lshiftrt:SI (match_operand:SI 2 "register_operand"  "    r")
-			       (match_operand:SI 3 "immediate_operand" " Iu05"))))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                           "=   r")
+	(minus:SI (match_operand:SI 1 "register_operand"                 "    r")
+		  (lshiftrt:SI (match_operand:SI 2 "register_operand"    "    r")
+			       (match_operand:SI 3 "nds32_imm5u_operand" " Iu05"))))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "sub_srli\t%0, %1, %2, %3"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "combo"        "2")
+   (set_attr "length"       "4")])


 ;; Multiplication instructions.

 (define_insn "mulsi3"
-  [(set (match_operand:SI 0 "register_operand"          "=w, r")
+  [(set (match_operand:SI 0 "register_operand"          "=l, r")
 	(mult:SI (match_operand:SI 1 "register_operand" "%0, r")
-		 (match_operand:SI 2 "register_operand" " w, r")))]
+		 (match_operand:SI 2 "register_operand" " l, r")))]
   ""
   "@
-  mul33\t%0, %2
-  mul\t%0, %1, %2"
-  [(set_attr "type"   "alu,alu")
-   (set_attr "length" "  2,  4")])
+   mul33\t%0, %2
+   mul\t%0, %1, %2"
+  [(set_attr "type"    "mul,mul")
+   (set_attr "length"  "  2,  4")
+   (set_attr "feature" "v3m, v1")])

 (define_insn "mulsidi3"
   [(set (match_operand:DI 0 "register_operand"                          "=r")
@@ -398,7 +573,7 @@
 		 (sign_extend:DI (match_operand:SI 2 "register_operand" " r"))))]
   "TARGET_ISA_V2 || TARGET_ISA_V3"
   "mulsr64\t%0, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "mul")
    (set_attr "length"   "4")])

 (define_insn "umulsidi3"
@@ -407,7 +582,7 @@
 		 (zero_extend:DI (match_operand:SI 2 "register_operand" " r"))))]
   "TARGET_ISA_V2 || TARGET_ISA_V3"
   "mulr64\t%0, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "mul")
    (set_attr "length"   "4")])


@@ -415,32 +590,32 @@

 (define_insn "*maddr32_0"
   [(set (match_operand:SI 0 "register_operand"                   "=r")
-        (plus:SI (match_operand:SI 3 "register_operand"          " 0")
-                 (mult:SI (match_operand:SI 1 "register_operand" " r")
-                          (match_operand:SI 2 "register_operand" " r"))))]
+	(plus:SI (match_operand:SI 3 "register_operand"          " 0")
+		 (mult:SI (match_operand:SI 1 "register_operand" " r")
+			  (match_operand:SI 2 "register_operand" " r"))))]
   ""
   "maddr32\t%0, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "mac")
    (set_attr "length"   "4")])

 (define_insn "*maddr32_1"
   [(set (match_operand:SI 0 "register_operand"                   "=r")
-        (plus:SI (mult:SI (match_operand:SI 1 "register_operand" " r")
-                          (match_operand:SI 2 "register_operand" " r"))
-                 (match_operand:SI 3 "register_operand"          " 0")))]
+	(plus:SI (mult:SI (match_operand:SI 1 "register_operand" " r")
+			  (match_operand:SI 2 "register_operand" " r"))
+		 (match_operand:SI 3 "register_operand"          " 0")))]
   ""
   "maddr32\t%0, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "mac")
    (set_attr "length"   "4")])

 (define_insn "*msubr32"
   [(set (match_operand:SI 0 "register_operand"                    "=r")
-        (minus:SI (match_operand:SI 3 "register_operand"          " 0")
-                  (mult:SI (match_operand:SI 1 "register_operand" " r")
-                           (match_operand:SI 2 "register_operand" " r"))))]
+	(minus:SI (match_operand:SI 3 "register_operand"          " 0")
+		  (mult:SI (match_operand:SI 1 "register_operand" " r")
+			   (match_operand:SI 2 "register_operand" " r"))))]
   ""
   "msubr32\t%0, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "mac")
    (set_attr "length"   "4")])


@@ -448,26 +623,46 @@

 (define_insn "divmodsi4"
   [(set (match_operand:SI 0 "register_operand"         "=r")
-        (div:SI (match_operand:SI 1 "register_operand" " r")
-                (match_operand:SI 2 "register_operand" " r")))
+	(div:SI (match_operand:SI 1 "register_operand" " r")
+		(match_operand:SI 2 "register_operand" " r")))
    (set (match_operand:SI 3 "register_operand"         "=r")
-        (mod:SI (match_dup 1) (match_dup 2)))]
+	(mod:SI (match_dup 1) (match_dup 2)))]
   ""
   "divsr\t%0, %3, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "div")
    (set_attr "length"   "4")])

 (define_insn "udivmodsi4"
   [(set (match_operand:SI 0 "register_operand"          "=r")
-        (udiv:SI (match_operand:SI 1 "register_operand" " r")
-                (match_operand:SI 2 "register_operand"  " r")))
+	(udiv:SI (match_operand:SI 1 "register_operand" " r")
+		 (match_operand:SI 2 "register_operand"  " r")))
    (set (match_operand:SI 3 "register_operand"          "=r")
-        (umod:SI (match_dup 1) (match_dup 2)))]
+	(umod:SI (match_dup 1) (match_dup 2)))]
   ""
   "divr\t%0, %3, %1, %2"
-  [(set_attr "type"   "alu")
+  [(set_attr "type"   "div")
+   (set_attr "length"   "4")])
+
+;; Per our ISA spec, divsr/divr keep only the quotient when the quotient and
+;; remainder are the same register; this reduces register pressure by one
+;; register when the remainder is not needed.
+(define_insn "divsi4"  ;; Signed divide, quotient only: %0 is passed for both divsr result operands.
+  [(set (match_operand:SI 0 "register_operand"         "=r")
+	(div:SI (match_operand:SI 1 "register_operand" " r")
+		(match_operand:SI 2 "register_operand" " r")))]
+  ""
+  "divsr\t%0, %0, %1, %2"
+  [(set_attr "type"   "div")
    (set_attr "length"   "4")])

+(define_insn "udivsi4"  ;; Unsigned divide, quotient only: %0 is passed for both divr result operands.
+  [(set (match_operand:SI 0 "register_operand"          "=r")
+	(udiv:SI (match_operand:SI 1 "register_operand" " r")
+		 (match_operand:SI 2 "register_operand"  " r")))]
+  ""
+  "divr\t%0, %0, %1, %2"
+  [(set_attr "type"   "div")
+   (set_attr "length"   "4")])

 ;; ----------------------------------------------------------------------------

@@ -488,14 +683,28 @@
    (set_attr "length" "4")]
 )

-(define_insn "andsi3"
-  [(set (match_operand:SI 0 "register_operand"         "=w, r,    l,    l,    l,    l,    l,    l,    r,   r,     r,    r,    r")
-	(and:SI (match_operand:SI 1 "register_operand" "%0, r,    l,    l,    l,    l,    0,    0,    r,   r,     r,    r,    r")
-		(match_operand:SI 2 "general_operand"  " w, r, Izeb, Izeh, Ixls, Ix11, Ibms, Ifex, Izeb, Izeh, Iu15, Ii15, Ic15")))]
+(define_expand "andsi3"  ;; SI AND; constants failing nds32_and_operand are expanded by a helper.
+  [(set (match_operand:SI 0 "register_operand" "")
+	(and:SI (match_operand:SI 1 "register_operand" "")
+		(match_operand:SI 2 "nds32_reg_constant_operand" "")))]
+  ""
+{
+  if (CONST_INT_P (operands[2])
+      && !nds32_and_operand (operands[2], SImode))  /* Constant the *andsi3 insn would reject.  */
+    {  /* NOTE(review): nds32_expand_constant presumably emits the complete AND, since we end with DONE -- confirm.  */
+      nds32_expand_constant (SImode, INTVAL (operands[2]),
+			     operands[0], operands[1]);
+      DONE;
+    }
+})  ;; Registers and accepted constants fall through to the template above.
+
+(define_insn "*andsi3"
+  [(set (match_operand:SI 0 "register_operand"          "=l, r,   l,   l,   l,   l,   l,   l,    r,   r,     r,    r,    r")
+	(and:SI (match_operand:SI 1 "register_operand"  "%0, r,   l,   l,   l,   l,   0,   0,    r,   r,     r,    r,    r")
+		(match_operand:SI 2 "nds32_and_operand" " l, r,Izeb,Izeh,Ixls,Ix11,Ibms,Ifex, Izeb, Izeh, Iu15, Ii15, Ic15")))]
   ""
 {
   HOST_WIDE_INT mask = INTVAL (operands[2]);
-  int zero_position;

   /* 16-bit andi instructions:
      andi Rt3,Ra3,0xff   -> zeb33  Rt3,Ra3
@@ -520,8 +729,7 @@
     case 5:
       return "x11b33\t%0, %1";
     case 6:
-      operands[2] = GEN_INT (floor_log2 (mask));
-      return "bmski33\t%0, %2";
+      return "bmski33\t%0, %B2";
     case 7:
       operands[2] = GEN_INT (floor_log2 (mask + 1) - 1);
       return "fexti33\t%0, %2";
@@ -535,47 +743,35 @@
       operands[2] = GEN_INT (~mask);
       return "bitci\t%0, %1, %2";
     case 12:
-      /* If we reach this alternative,
-         it must pass the nds32_can_use_bclr_p() test,
-         so that we can guarantee there is only one 0-bit
-         within the immediate value.  */
-      for (zero_position = 31; zero_position >= 0; zero_position--)
-	{
-	  if ((INTVAL (operands[2]) & (1 << zero_position)) == 0)
-	    {
-	      /* Found the 0-bit position.  */
-	      operands[2] = GEN_INT (zero_position);
-	      break;
-	    }
-	}
-      return "bclr\t%0, %1, %2";
+      return "bclr\t%0, %1, %b2";

     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type"   "alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu")
-   (set_attr "length" "  2,  4,  2,  2,  2,  2,  2,  2,  4,  4,  4,  4,  4")])
+  [(set_attr "type"    "alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu,alu")
+   (set_attr "length"  "  2,  4,  2,  2,  2,  2,  2,  2,  4,  4,  4,  4,  4")
+   (set_attr "feature" "v3m, v1, v1, v1, v1, v1,v3m,v3m, v1, v1, v1, v3,pe1")])

 (define_insn "*and_slli"
-  [(set (match_operand:SI 0 "register_operand"                      "=   r")
-	(and:SI (ashift:SI (match_operand:SI 1 "register_operand"   "    r")
-			    (match_operand:SI 2 "immediate_operand" " Iu05"))
-		(match_operand:SI 3 "register_operand"              "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                        "=   r")
+	(and:SI (ashift:SI (match_operand:SI 1 "register_operand"     "    r")
+			    (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		(match_operand:SI 3 "register_operand"                "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "and_slli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])

 (define_insn "*and_srli"
-  [(set (match_operand:SI 0 "register_operand"                       "=   r")
-	(and:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"  "    r")
-			     (match_operand:SI 2 "immediate_operand" " Iu05"))
-		(match_operand:SI 3 "register_operand"               "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                         "=   r")
+	(and:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"    "    r")
+			     (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		(match_operand:SI 3 "register_operand"                 "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "and_srli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])


 ;; ----------------------------------------------------------------------------
@@ -584,58 +780,50 @@

 ;; For V3/V3M ISA, we have 'or33' instruction.
 ;; So we can identify 'or Rt3,Rt3,Ra3' case and set its length to be 2.
-(define_insn "iorsi3"
-  [(set (match_operand:SI 0 "register_operand"         "=w, r,    r,    r")
-	(ior:SI (match_operand:SI 1 "register_operand" "%0, r,    r,    r")
-		(match_operand:SI 2 "general_operand"  " w, r, Iu15, Ie15")))]
+
+(define_expand "iorsi3"
+  [(set (match_operand:SI 0 "register_operand"         "")
+	(ior:SI (match_operand:SI 1 "register_operand" "")
+		(match_operand:SI 2 "general_operand"  "")))]
   ""
 {
-  int one_position;
-
-  switch (which_alternative)
-    {
-    case 0:
-      return "or33\t%0, %2";
-    case 1:
-      return "or\t%0, %1, %2";
-    case 2:
-      return "ori\t%0, %1, %2";
-    case 3:
-      /* If we reach this alternative,
-         it must pass the nds32_can_use_bset_p() test,
-         so that we can guarantee there is only one 1-bit
-         within the immediate value.  */
-      /* Use exact_log2() to search the 1-bit position.  */
-      one_position = exact_log2 (INTVAL (operands[2]));
-      operands[2] = GEN_INT (one_position);
-      return "bset\t%0, %1, %2";
+  if (!nds32_ior_operand (operands[2], SImode))
+    operands[2] = force_reg (SImode, operands[2]);
+})

-    default:
-      gcc_unreachable ();
-    }
-}
-  [(set_attr "type"   "alu,alu,alu,alu")
-   (set_attr "length" "  2,  4,  4,  4")])
+(define_insn "*iorsi3"
+  [(set (match_operand:SI 0 "register_operand"          "=l, r,    r,    r")
+	(ior:SI (match_operand:SI 1 "register_operand"  "%0, r,    r,    r")
+		(match_operand:SI 2 "nds32_ior_operand" " l, r, Iu15, Ie15")))]
+  ""
+  "@
+   or33\t%0, %2
+   or\t%0, %1, %2
+   ori\t%0, %1, %2
+   bset\t%0, %1, %B2"
+  [(set_attr "type"    "alu,alu,alu,alu")
+   (set_attr "length"  "  2,  4,  4,  4")
+   (set_attr "feature" "v3m, v1, v1,pe1")])

 (define_insn "*or_slli"
-  [(set (match_operand:SI 0 "register_operand"                     "=   r")
-	(ior:SI (ashift:SI (match_operand:SI 1 "register_operand"  "    r")
-			   (match_operand:SI 2 "immediate_operand" " Iu05"))
-		(match_operand:SI 3 "register_operand"             "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                       "=   r")
+	(ior:SI (ashift:SI (match_operand:SI 1 "register_operand"    "    r")
+			   (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		(match_operand:SI 3 "register_operand"               "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "or_slli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])

 (define_insn "*or_srli"
-  [(set (match_operand:SI 0 "register_operand"                       "=   r")
-	(ior:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"  "    r")
-			     (match_operand:SI 2 "immediate_operand" " Iu05"))
-		(match_operand:SI 3 "register_operand"               "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                         "=   r")
+	(ior:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"    "    r")
+			     (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		(match_operand:SI 3 "register_operand"                 "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "or_srli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])


 ;; ----------------------------------------------------------------------------
@@ -644,71 +832,64 @@

 ;; For V3/V3M ISA, we have 'xor33' instruction.
 ;; So we can identify 'xor Rt3,Rt3,Ra3' case and set its length to be 2.
-(define_insn "xorsi3"
-  [(set (match_operand:SI 0 "register_operand"         "=w, r,    r,    r")
-	(xor:SI (match_operand:SI 1 "register_operand" "%0, r,    r,    r")
-		(match_operand:SI 2 "general_operand"  " w, r, Iu15, It15")))]
+
+(define_expand "xorsi3"
+  [(set (match_operand:SI 0 "register_operand"         "")
+	(xor:SI (match_operand:SI 1 "register_operand" "")
+		(match_operand:SI 2 "general_operand"  "")))]
   ""
 {
-  int one_position;
-
-  switch (which_alternative)
-    {
-    case 0:
-      return "xor33\t%0, %2";
-    case 1:
-      return "xor\t%0, %1, %2";
-    case 2:
-      return "xori\t%0, %1, %2";
-    case 3:
-      /* If we reach this alternative,
-         it must pass the nds32_can_use_btgl_p() test,
-         so that we can guarantee there is only one 1-bit
-         within the immediate value.  */
-      /* Use exact_log2() to search the 1-bit position.  */
-      one_position = exact_log2 (INTVAL (operands[2]));
-      operands[2] = GEN_INT (one_position);
-      return "btgl\t%0, %1, %2";
+  if (!nds32_xor_operand (operands[2], SImode))
+    operands[2] = force_reg (SImode, operands[2]);
+})

-    default:
-      gcc_unreachable ();
-    }
-}
-  [(set_attr "type"   "alu,alu,alu,alu")
-   (set_attr "length" "  2,  4,  4,  4")])
+(define_insn "*xorsi3"
+  [(set (match_operand:SI 0 "register_operand"          "=l, r,    r,    r")
+	(xor:SI (match_operand:SI 1 "register_operand"  "%0, r,    r,    r")
+		(match_operand:SI 2 "nds32_xor_operand" " l, r, Iu15, It15")))]
+  ""
+  "@
+   xor33\t%0, %2
+   xor\t%0, %1, %2
+   xori\t%0, %1, %2
+   btgl\t%0, %1, %B2"
+  [(set_attr "type"    "alu,alu,alu,alu")
+   (set_attr "length"  "  2,  4,  4,  4")
+   (set_attr "feature" "v3m, v1, v1,pe1")])

 (define_insn "*xor_slli"
   [(set (match_operand:SI 0 "register_operand"                     "=   r")
 	(xor:SI (ashift:SI (match_operand:SI 1 "register_operand"  "    r")
-			   (match_operand:SI 2 "immediate_operand" " Iu05"))
+			   (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
 		(match_operand:SI 3 "register_operand"             "    r")))]
-  "TARGET_ISA_V3"
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "xor_slli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])

 (define_insn "*xor_srli"
-  [(set (match_operand:SI 0 "register_operand"                       "=   r")
-	(xor:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"  "    r")
-			     (match_operand:SI 2 "immediate_operand" " Iu05"))
-		(match_operand:SI 3 "register_operand"               "    r")))]
-  "TARGET_ISA_V3"
+  [(set (match_operand:SI 0 "register_operand"                         "=   r")
+	(xor:SI (lshiftrt:SI (match_operand:SI 1 "register_operand"    "    r")
+			     (match_operand:SI 2 "nds32_imm5u_operand" " Iu05"))
+		(match_operand:SI 3 "register_operand"                 "    r")))]
+  "TARGET_ISA_V3 && (TARGET_PIPELINE_PANTHER || optimize_size)"
   "xor_srli\t%0, %3, %1, %2"
-  [(set_attr "type" "alu")
-   (set_attr "length" "4")])
+  [(set_attr "type" "alu_shift")
+   (set_attr "length"       "4")])

 ;; Rotate Right Instructions.

-(define_insn "rotrsi3"
-  [(set (match_operand:SI 0 "register_operand"                 "=   r, r")
-	  (rotatert:SI (match_operand:SI 1 "register_operand"  "    r, r")
-		       (match_operand:SI 2 "nonmemory_operand" " Iu05, r")))]
+(define_insn "*rotrsi3"
+  [(set (match_operand:SI 0 "register_operand"                    "=   r, r")
+	  (rotatert:SI (match_operand:SI 1 "register_operand"     "    r, r")
+		       (match_operand:SI 2 "nds32_rimm5u_operand" " Iu05, r")))]
   ""
   "@
-  rotri\t%0, %1, %2
-  rotr\t%0, %1, %2"
-  [(set_attr "type"   "alu,alu")
-   (set_attr "length" "  4,  4")])
+   rotri\t%0, %1, %2
+   rotr\t%0, %1, %2"
+  [(set_attr "type"    "  alu,  alu")
+   (set_attr "subtype" "shift,shift")
+   (set_attr "length"  "    4,    4")])


 ;; ----------------------------------------------------------------------------
@@ -720,14 +901,95 @@
 ;; And for V2 ISA, there is NO 'neg33' instruction.
 ;; The only option is to use 'subri A,B,0' (its semantic is 'A = 0 - B').
 (define_insn "negsi2"
-  [(set (match_operand:SI 0 "register_operand"         "=w, r")
-	(neg:SI (match_operand:SI 1 "register_operand" " w, r")))]
+  [(set (match_operand:SI 0 "register_operand"         "=l, r")
+	(neg:SI (match_operand:SI 1 "register_operand" " l, r")))]
   ""
   "@
    neg33\t%0, %1
    subri\t%0, %1, 0"
-  [(set_attr "type"   "alu,alu")
-   (set_attr "length" "  2,  4")])
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")
+   (set_attr "feature" "v3m, v1")])
+
+(define_expand "negsf2"  ;; SF negate; done as an integer XOR when neither FPU-single nor EXT_PERF is on.
+  [(set (match_operand:SF 0 "register_operand" "")
+	(neg:SF (match_operand:SF 1 "register_operand" "")))]
+  ""
+{
+  if (!TARGET_FPU_SINGLE && !TARGET_EXT_PERF)
+    {
+      rtx new_dst = simplify_gen_subreg (SImode, operands[0], SFmode, 0);  /* SImode view of the result.  */
+      rtx new_src = simplify_gen_subreg (SImode, operands[1], SFmode, 0);  /* SImode view of the source.  */
+
+      emit_insn (gen_xorsi3 (new_dst,
+			     new_src,
+			     gen_int_mode (0x80000000, SImode)));  /* Toggles bit 31 only.  */
+
+      DONE;
+    }
+})  ;; Otherwise the (neg:SF ...) template above is emitted unchanged.
+
+(define_expand "negdf2"  ;; DF negate; no preparation code, so the template is always emitted.
+  [(set (match_operand:DF 0 "register_operand" "")
+	(neg:DF (match_operand:DF 1 "register_operand" "")))]
+  ""
+{
+})
+
+(define_insn_and_split "soft_negdf2"  ;; DF negate without a double FPU: XOR bit 31 of the high word.
+  [(set (match_operand:DF 0 "register_operand" "")
+	(neg:DF (match_operand:DF 1 "register_operand" "")))]
+  "!TARGET_FPU_DOUBLE"
+  "#"
+  "!TARGET_FPU_DOUBLE"
+  [(const_int 1)]
+{
+    rtx src = operands[1];
+    rtx dst = operands[0];
+    rtx ori_dst = operands[0];
+    /* Must start false: only set below for an FP hard register DST, but read unconditionally.  */
+    bool need_extra_move_for_dst_p = false;
+    /* An FPU register cannot change mode to SI directly and cannot be used
+       by `xor' or btgl, so such operands are routed through a temporary.  */
+    if (HARD_REGISTER_P (src)
+	&& TEST_HARD_REG_BIT (reg_class_contents[FP_REGS], REGNO (src)))
+      {
+	rtx tmp = gen_reg_rtx (DFmode);
+	emit_move_insn (tmp, src);
+	src = tmp;
+      }
+    /* For an FP hard register DST, compute into a fresh pseudo and copy back at the end.  */
+    if (HARD_REGISTER_P (dst)
+	&& TEST_HARD_REG_BIT (reg_class_contents[FP_REGS], REGNO (dst)))
+      {
+	need_extra_move_for_dst_p = true;
+	rtx tmp = gen_reg_rtx (DFmode);
+	dst = tmp;
+      }
+    /* SImode views of the high and low words of DST and SRC.  */
+    rtx dst_high_part = simplify_gen_subreg (
+			  SImode, dst,
+			  DFmode, subreg_highpart_offset (SImode, DFmode));
+    rtx dst_low_part = simplify_gen_subreg (
+			  SImode, dst,
+			  DFmode, subreg_lowpart_offset (SImode, DFmode));
+    rtx src_high_part = simplify_gen_subreg (
+			  SImode, src,
+			  DFmode, subreg_highpart_offset (SImode, DFmode));
+    rtx src_low_part = simplify_gen_subreg (
+			  SImode, src,
+			  DFmode, subreg_lowpart_offset (SImode, DFmode));
+    /* Toggle bit 31 of the high word; the low word is copied unchanged.  */
+    emit_insn (gen_xorsi3 (dst_high_part,
+			   src_high_part,
+			   gen_int_mode (0x80000000, SImode)));
+    emit_move_insn (dst_low_part, src_low_part);
+    /* Copy the temporary result back into the original destination.  */
+    if (need_extra_move_for_dst_p)
+      emit_move_insn (ori_dst, dst);
+
+    DONE;
+})


 ;; ----------------------------------------------------------------------------
@@ -737,55 +999,72 @@
 ;; For V3/V3M ISA, we have 'not33' instruction.
 ;; So we can identify 'not Rt3,Ra3' case and set its length to be 2.
 (define_insn "one_cmplsi2"
-  [(set (match_operand:SI 0 "register_operand"         "=w, r")
-	(not:SI (match_operand:SI 1 "register_operand" " w, r")))]
+  [(set (match_operand:SI 0 "register_operand"         "=l, r")
+	(not:SI (match_operand:SI 1 "register_operand" " l, r")))]
   ""
   "@
    not33\t%0, %1
    nor\t%0, %1, %1"
-  [(set_attr "type"   "alu,alu")
-   (set_attr "length" "  2,  4")])
+  [(set_attr "type"    "alu,alu")
+   (set_attr "length"  "  2,  4")
+   (set_attr "feature" "v3m, v1")])


 ;; ----------------------------------------------------------------------------

 ;; Shift instructions.

-(define_insn "ashlsi3"
-  [(set (match_operand:SI 0 "register_operand"             "=   l,    r, r")
-	(ashift:SI (match_operand:SI 1 "register_operand"  "    l,    r, r")
-		   (match_operand:SI 2 "nonmemory_operand" " Iu03, Iu05, r")))]
+(define_expand "<shift>si3"
+  [(set (match_operand:SI 0 "register_operand"                      "")
+	(shift_rotate:SI (match_operand:SI 1 "register_operand"     "")
+			 (match_operand:SI 2 "nds32_rimm5u_operand" "")))]
   ""
-  "@
-  slli333\t%0, %1, %2
-  slli\t%0, %1, %2
-  sll\t%0, %1, %2"
-  [(set_attr "type"   "alu,alu,alu")
-   (set_attr "length" "  2,  4,  4")])
+{
+  if (operands[2] == const0_rtx)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+})

-(define_insn "ashrsi3"
-  [(set (match_operand:SI 0 "register_operand"               "=   d,    r, r")
-	(ashiftrt:SI (match_operand:SI 1 "register_operand"  "    0,    r, r")
-		     (match_operand:SI 2 "nonmemory_operand" " Iu05, Iu05, r")))]
+(define_insn "*ashlsi3"
+  [(set (match_operand:SI 0 "register_operand"                "=   l,    r, r")
+	(ashift:SI (match_operand:SI 1 "register_operand"     "    l,    r, r")
+		   (match_operand:SI 2 "nds32_rimm5u_operand" " Iu03, Iu05, r")))]
   ""
   "@
-  srai45\t%0, %2
-  srai\t%0, %1, %2
-  sra\t%0, %1, %2"
-  [(set_attr "type"   "alu,alu,alu")
-   (set_attr "length" "  2,  4,  4")])
-
-(define_insn "lshrsi3"
-  [(set (match_operand:SI 0 "register_operand"               "=   d,    r, r")
-	(lshiftrt:SI (match_operand:SI 1 "register_operand"  "    0,    r, r")
-		     (match_operand:SI 2 "nonmemory_operand" " Iu05, Iu05, r")))]
+   slli333\t%0, %1, %2
+   slli\t%0, %1, %2
+   sll\t%0, %1, %2"
+  [(set_attr "type"    "  alu,  alu,  alu")
+   (set_attr "subtype" "shift,shift,shift")
+   (set_attr "length"  "    2,    4,    4")])
+
+(define_insn "*ashrsi3"
+  [(set (match_operand:SI 0 "register_operand"                  "=   d,    r, r")
+	(ashiftrt:SI (match_operand:SI 1 "register_operand"     "    0,    r, r")
+		     (match_operand:SI 2 "nds32_rimm5u_operand" " Iu05, Iu05, r")))]
+  ""
+  "@
+   srai45\t%0, %2
+   srai\t%0, %1, %2
+   sra\t%0, %1, %2"
+  [(set_attr "type"    "  alu,  alu,  alu")
+   (set_attr "subtype" "shift,shift,shift")
+   (set_attr "length"  "    2,    4,    4")])
+
+(define_insn "*lshrsi3"
+  [(set (match_operand:SI 0 "register_operand"                  "=   d,    r, r")
+	(lshiftrt:SI (match_operand:SI 1 "register_operand"     "    0,    r, r")
+		     (match_operand:SI 2 "nds32_rimm5u_operand" " Iu05, Iu05, r")))]
   ""
   "@
-  srli45\t%0, %2
-  srli\t%0, %1, %2
-  srl\t%0, %1, %2"
-  [(set_attr "type"   "alu,alu,alu")
-   (set_attr "length" "  2,  4,  4")])
+   srli45\t%0, %2
+   srli\t%0, %1, %2
+   srl\t%0, %1, %2"
+  [(set_attr "type"    "  alu,  alu,  alu")
+   (set_attr "subtype" "shift,shift,shift")
+   (set_attr "length"  "    2,    4,    4")])


 ;; ----------------------------------------------------------------------------
@@ -794,148 +1073,65 @@
 ;; Conditional Move patterns
 ;; ----------------------------------------------------------------------------

-(define_expand "movsicc"
-  [(set (match_operand:SI 0 "register_operand" "")
-	(if_then_else:SI (match_operand 1 "comparison_operator" "")
-			 (match_operand:SI 2 "register_operand" "")
-			 (match_operand:SI 3 "register_operand" "")))]
-  "TARGET_CMOV"
+(define_expand "mov<mode>cc"
+  [(set (match_operand:QIHISI 0 "register_operand" "")
+	(if_then_else:QIHISI (match_operand 1 "nds32_movecc_comparison_operator" "")
+			 (match_operand:QIHISI 2 "register_operand" "")
+			 (match_operand:QIHISI 3 "register_operand" "")))]
+  "TARGET_CMOV && !optimize_size"
 {
-  if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
-      && GET_MODE (XEXP (operands[1], 0)) == SImode
-      && XEXP (operands[1], 1) == const0_rtx)
-    {
-      /* If the operands[1] rtx is already (eq X 0) or (ne X 0),
-         we have gcc generate original template rtx.  */
-      goto create_template;
-    }
-  else
+  enum nds32_expand_result_type result = nds32_expand_movcc (operands);
+  switch (result)
     {
-      /* Since there is only 'slt'(Set when Less Than) instruction for
-         comparison in Andes ISA, the major strategy we use here is to
-         convert conditional move into 'LT + EQ' or 'LT + NE' rtx combination.
-         We design constraints properly so that the reload phase will assist
-         to make one source operand to use same register as result operand.
-         Then we can use cmovz/cmovn to catch the other source operand
-         which has different register.  */
-      enum rtx_code code = GET_CODE (operands[1]);
-      enum rtx_code new_code = code;
-      rtx cmp_op0 = XEXP (operands[1], 0);
-      rtx cmp_op1 = XEXP (operands[1], 1);
-      rtx tmp;
-      int reverse = 0;
-
-      /* Main Goal: Use 'LT + EQ' or 'LT + NE' to target "then" part
-         Strategy : Reverse condition and swap comparison operands
-
-         For example:
-
-             a <= b ? P : Q   (LE or LEU)
-         --> a >  b ? Q : P   (reverse condition)
-         --> b <  a ? Q : P   (swap comparison operands to achieve 'LT/LTU')
-
-             a >= b ? P : Q   (GE or GEU)
-         --> a <  b ? Q : P   (reverse condition to achieve 'LT/LTU')
-
-             a <  b ? P : Q   (LT or LTU)
-         --> (NO NEED TO CHANGE, it is already 'LT/LTU')
-
-             a >  b ? P : Q   (GT or GTU)
-         --> b <  a ? P : Q   (swap comparison operands to achieve 'LT/LTU') */
-      switch (code)
-	{
-	case NE:
-	  /*   (a != b ? P : Q)
-	     can be expressed as
-	       (a == b ? Q : P)
-	     so, fall through to reverse condition */
-	case GE: case GEU: case LE: case LEU:
-	  new_code = reverse_condition (code);
-	  reverse = 1;
-	  break;
-	case EQ: case GT: case GTU: case LT: case LTU:
-	  /* no need to reverse condition */
-	  break;
-	default:
-	  FAIL;
-	}
-
-      /* For '>' comparison operator, we swap operands
-         so that we can have 'LT/LTU' operator.  */
-      if (new_code == GT || new_code == GTU)
-	{
-	  tmp     = cmp_op0;
-	  cmp_op0 = cmp_op1;
-	  cmp_op1 = tmp;
-
-	  new_code = swap_condition (new_code);
-	}
-
-      /* Use a temporary register to store slt/slts result.  */
-      tmp = gen_reg_rtx (SImode);
-
-      /* Split EQ and NE because we don't have direct comparison of EQ and NE.
-         If we don't split it, the conditional move transformation will fail
-         when producing (SET A (EQ B C)) or (SET A (NE B C)).  */
-      if (new_code == EQ)
-	{
-	  emit_insn (gen_xorsi3 (tmp, cmp_op0, cmp_op1));
-	  emit_insn (gen_slt_compare (tmp, tmp, GEN_INT (1)));
-	}
-      else if (new_code == NE)
-	{
-	  emit_insn (gen_xorsi3 (tmp, cmp_op0, cmp_op1));
-	  emit_insn (gen_slt_compare (tmp, GEN_INT (0), tmp));
-        }
-      else
-	/* This emit_insn will create corresponding 'slt/slts' insturction.  */
-	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_ee (new_code, SImode,
-						     cmp_op0, cmp_op1)));
-
-      /* Change comparison semantic into (eq X 0) or (ne X 0) behavior
-         so that cmovz or cmovn will be matched later.
-
-         For reverse condition cases, we want to create a semantic that:
-           (eq X 0) --> pick up "else" part
-         For normal cases, we want to create a semantic that:
-           (ne X 0) --> pick up "then" part
-
-         Later we will have cmovz/cmovn instruction pattern to
-         match corresponding behavior and output instruction.  */
-      operands[1] = gen_rtx_fmt_ee (reverse ? EQ : NE,
-				    VOIDmode, tmp, const0_rtx);
+    case EXPAND_DONE:
+      DONE;
+      break;
+    case EXPAND_FAIL:
+      FAIL;
+      break;
+    case EXPAND_CREATE_TEMPLATE:
+      break;
+    default:
+      gcc_unreachable ();
     }
-
-create_template:
-  do {} while(0); /* dummy line */
 })

-(define_insn "cmovz"
-  [(set (match_operand:SI 0 "register_operand"                      "=r, r")
-        (if_then_else:SI (eq (match_operand:SI 1 "register_operand" " r, r")
+(define_insn "cmovz<mode>"
+  [(set (match_operand:QIHISI 0 "register_operand"                      "=r, r")
+	(if_then_else:QIHISI (eq (match_operand:SI 1 "register_operand" " r, r")
 			     (const_int 0))
-			 (match_operand:SI 2 "register_operand"     " r, 0")
-			 (match_operand:SI 3 "register_operand"     " 0, r")))]
+			 (match_operand:QIHISI 2 "register_operand"     " r, 0")
+			 (match_operand:QIHISI 3 "register_operand"     " 0, r")))]
   "TARGET_CMOV"
   "@
    cmovz\t%0, %2, %1
    cmovn\t%0, %3, %1"
-  [(set_attr "type" "move")
+  [(set_attr "type"  "alu")
    (set_attr "length"  "4")])

-(define_insn "cmovn"
-  [(set (match_operand:SI 0 "register_operand"                      "=r, r")
-	(if_then_else:SI (ne (match_operand:SI 1 "register_operand" " r, r")
+(define_insn "cmovn<mode>"
+  [(set (match_operand:QIHISI 0 "register_operand"                      "=r, r")
+	(if_then_else:QIHISI (ne (match_operand:SI 1 "register_operand" " r, r")
 			     (const_int 0))
-			 (match_operand:SI 2 "register_operand"     " r, 0")
-			 (match_operand:SI 3 "register_operand"     " 0, r")))]
+			 (match_operand:QIHISI 2 "register_operand"     " r, 0")
+			 (match_operand:QIHISI 3 "register_operand"     " 0, r")))]
   "TARGET_CMOV"
   "@
    cmovn\t%0, %2, %1
    cmovz\t%0, %3, %1"
-  [(set_attr "type" "move")
+  [(set_attr "type"  "alu")
    (set_attr "length"  "4")])

+;; A hotfix to help the RTL combiner merge a cmovn insn with a zero_extend insn.
+;; It should be removed once the expansion form of cmovn has been changed.
+(define_insn "*cmovn_simplified_<mode>"
+  [(set (match_operand:QIHISI 0 "register_operand" "=r")
+	(if_then_else:QIHISI (match_operand:SI 1 "register_operand" "r")
+			 (match_operand:QIHISI 2 "register_operand" "r")
+			 (match_operand:QIHISI 3 "register_operand" "0")))]
+  ""
+  "cmovn\t%0, %2, %1"
+  [(set_attr "type" "alu")])

 ;; ----------------------------------------------------------------------------
 ;; Conditional Branch patterns
@@ -950,573 +1146,188 @@ create_template:
 		      (pc)))]
   ""
 {
-  rtx tmp_reg;
-  enum rtx_code code;
-
-  code = GET_CODE (operands[0]);
-
-  /* If operands[2] is (const_int 0),
-     we can use beqz,bnez,bgtz,bgez,bltz,or blez instructions.
-     So we have gcc generate original template rtx.  */
-  if (GET_CODE (operands[2]) == CONST_INT)
-    if (INTVAL (operands[2]) == 0)
-      if ((code != GTU)
-	  && (code != GEU)
-	  && (code != LTU)
-	  && (code != LEU))
-	goto create_template;
-
-  /* For other comparison, NDS32 ISA only has slt (Set-on-Less-Than)
-     behavior for the comparison, we might need to generate other
-     rtx patterns to achieve same semantic.  */
-  switch (code)
+  enum nds32_expand_result_type result = nds32_expand_cbranch (operands);
+  switch (result)
     {
-    case GT:
-    case GTU:
-      if (GET_CODE (operands[2]) == CONST_INT)
-	{
-	  /* GT  reg_A, const_int  =>  !(LT  reg_A, const_int + 1) */
-	  tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-	  /* We want to plus 1 into the integer value
-	     of operands[2] to create 'slt' instruction.
-	     This caculation is performed on the host machine,
-	     which may be 64-bit integer.
-	     So the meaning of caculation result may be
-	     different from the 32-bit nds32 target.
-
-	     For example:
-	       0x7fffffff + 0x1 -> 0x80000000,
-	       this value is POSITIVE on 64-bit machine,
-	       but the expected value on 32-bit nds32 target
-	       should be NEGATIVE value.
-
-	     Hence, instead of using GEN_INT(), we use gen_int_mode() to
-	     explicitly create SImode constant rtx.  */
-	  operands[2] = gen_int_mode (INTVAL (operands[2]) + 1, SImode);
-
-	  if (code == GT)
-	    {
-	      /* GT, use slts instruction */
-	      emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
-	    }
-	  else
-	    {
-	      /* GTU, use slt instruction */
-	      emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
-	    }
-
-	  PUT_CODE (operands[0], EQ);
-	  operands[1] = tmp_reg;
-	  operands[2] = const0_rtx;
-	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				     operands[2], operands[3]));
-
-	  DONE;
-	}
-      else
-	{
-	  /* GT  reg_A, reg_B  =>  LT  reg_B, reg_A */
-	  tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-	  if (code == GT)
-	    {
-	      /* GT, use slts instruction */
-	      emit_insn (gen_slts_compare (tmp_reg, operands[2], operands[1]));
-	    }
-	  else
-	    {
-	      /* GTU, use slt instruction */
-	      emit_insn (gen_slt_compare  (tmp_reg, operands[2], operands[1]));
-	    }
-
-	  PUT_CODE (operands[0], NE);
-	  operands[1] = tmp_reg;
-	  operands[2] = const0_rtx;
-	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				     operands[2], operands[3]));
-
-	  DONE;
-	}
-
-    case GE:
-    case GEU:
-      /* GE  reg_A, reg_B      =>  !(LT  reg_A, reg_B) */
-      /* GE  reg_A, const_int  =>  !(LT  reg_A, const_int) */
-      tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-      if (code == GE)
-	{
-	  /* GE, use slts instruction */
-	  emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
-	}
-      else
-	{
-	  /* GEU, use slt instruction */
-	  emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
-	}
-
-      PUT_CODE (operands[0], EQ);
-      operands[1] = tmp_reg;
-      operands[2] = const0_rtx;
-      emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				 operands[2], operands[3]));
-
+    case EXPAND_DONE:
       DONE;
-
-    case LT:
-    case LTU:
-      /* LT  reg_A, reg_B      =>  LT  reg_A, reg_B */
-      /* LT  reg_A, const_int  =>  LT  reg_A, const_int */
-      tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-      if (code == LT)
-	{
-	  /* LT, use slts instruction */
-	  emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
-	}
-      else
-	{
-	  /* LTU, use slt instruction */
-	  emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
-	}
-
-      PUT_CODE (operands[0], NE);
-      operands[1] = tmp_reg;
-      operands[2] = const0_rtx;
-      emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				 operands[2], operands[3]));
-
-      DONE;
-
-    case LE:
-    case LEU:
-      if (GET_CODE (operands[2]) == CONST_INT)
-	{
-	  /* LE  reg_A, const_int  =>  LT  reg_A, const_int + 1 */
-	  tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-	  /* Note that (le:SI X INT_MAX) is not the same as (lt:SI X INT_MIN).
-	     We better have an assert here in case GCC does not properly
-	     optimize it away.  The INT_MAX here is 0x7fffffff for target.  */
-	  gcc_assert (code != LE || INTVAL (operands[2]) != 0x7fffffff);
-	  operands[2] = gen_int_mode (INTVAL (operands[2]) + 1, SImode);
-
-	  if (code == LE)
-	    {
-	      /* LE, use slts instruction */
-	      emit_insn (gen_slts_compare (tmp_reg, operands[1], operands[2]));
-	    }
-	  else
-	    {
-	      /* LEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (tmp_reg, operands[1], operands[2]));
-	    }
-
-	  PUT_CODE (operands[0], NE);
-	  operands[1] = tmp_reg;
-	  operands[2] = const0_rtx;
-	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				     operands[2], operands[3]));
-
-	  DONE;
-	}
-      else
-	{
-	  /* LE  reg_A, reg_B  =>  !(LT  reg_B, reg_A) */
-	  tmp_reg = gen_rtx_REG (SImode, TA_REGNUM);
-
-	  if (code == LE)
-	    {
-	      /* LE, use slts instruction */
-	      emit_insn (gen_slts_compare (tmp_reg, operands[2], operands[1]));
-	    }
-	  else
-	    {
-	      /* LEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (tmp_reg, operands[2], operands[1]));
-	    }
-
-	  PUT_CODE (operands[0], EQ);
-	  operands[1] = tmp_reg;
-	  operands[2] = const0_rtx;
-	  emit_insn (gen_cbranchsi4 (operands[0], operands[1],
-				     operands[2], operands[3]));
-
-	  DONE;
-	}
-
-    case EQ:
-    case NE:
-      /* NDS32 ISA has various form for eq/ne behavior no matter
-         what kind of the operand is.
-         So just generate original template rtx.  */
-      goto create_template;
-
-    default:
+      break;
+    case EXPAND_FAIL:
       FAIL;
+      break;
+    case EXPAND_CREATE_TEMPLATE:
+      break;
+    default:
+      gcc_unreachable ();
     }
-
-create_template:
-  do {} while(0); /* dummy line */
 })


-(define_insn "*cbranchsi4_equality_zero"
+(define_insn "cbranchsi4_equality_zero"
   [(set (pc)
 	(if_then_else (match_operator 0 "nds32_equality_comparison_operator"
-			[(match_operand:SI 1 "register_operand"  "t, l, r")
+			[(match_operand:SI 1 "register_operand"  "t,l, r")
 			 (const_int 0)])
 		      (label_ref (match_operand 2 "" ""))
 		      (pc)))]
   ""
 {
-  enum rtx_code code;
-
-  code = GET_CODE (operands[0]);
-
-  /* This zero-comparison conditional branch has two forms:
-       32-bit instruction =>          beqz/bnez           imm16s << 1
-       16-bit instruction => beqzs8/bnezs8/beqz38/bnez38  imm8s << 1
-
-     For 32-bit case,
-     we assume it is always reachable. (but check range -65500 ~ 65500)
-
-     For 16-bit case,
-     it must satisfy { 255 >= (label - pc) >= -256 } condition.
-     However, since the $pc for nds32 is at the beginning of the instruction,
-     we should leave some length space for current insn.
-     So we use range -250 ~ 250.  */
-
-  switch (get_attr_length (insn))
-    {
-    case 2:
-      if (which_alternative == 0)
-	{
-	  /* constraint: t */
-	  return (code == EQ) ? "beqzs8\t%2" : "bnezs8\t%2";
-	}
-      else if (which_alternative == 1)
-	{
-	  /* constraint: l */
-	  return (code == EQ) ? "beqz38\t%1, %2" : "bnez38\t%1, %2";
-	}
-      else
-	{
-	  /* constraint: r */
-	  /* For which_alternative==2, it should not be here.  */
-	  gcc_unreachable ();
-	}
-    case 4:
-      /* including constraints: t, l, and r */
-      return (code == EQ) ? "beqz\t%1, %2" : "bnez\t%1, %2";
-    case 6:
-      if (which_alternative == 0)
-	{
-	  /* constraint: t */
-	  if (code == EQ)
-	    {
-	      /*    beqzs8  .L0
-	          =>
-	            bnezs8  .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "bnezs8\t.LCB%=\;j\t%2\n.LCB%=:";
-	    }
-	  else
-	    {
-	      /*    bnezs8  .L0
-	          =>
-	            beqzs8  .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "beqzs8\t.LCB%=\;j\t%2\n.LCB%=:";
-	    }
-	}
-      else if (which_alternative == 1)
-	{
-	  /* constraint: l */
-	  if (code == EQ)
-	    {
-	      /*    beqz38  $r0, .L0
-	          =>
-	            bnez38  $r0, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "bnez38\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	    }
-	  else
-	    {
-	      /*    bnez38  $r0, .L0
-	          =>
-	            beqz38  $r0, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "beqz38\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	    }
-	}
-      else
-	{
-	  /* constraint: r */
-	  /* For which_alternative==2, it should not be here.  */
-	  gcc_unreachable ();
-	}
-    case 8:
-      /* constraint: t, l, r.  */
-      if (code == EQ)
-	{
-	  /*    beqz  $r8, .L0
-	      =>
-	        bnez  $r8, .LCB0
-	        j  .L0
-	      .LCB0:
-	   */
-	  return "bnez\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	}
-      else
-	{
-	  /*    bnez  $r8, .L0
-	      =>
-	        beqz  $r8, .LCB0
-	        j  .L0
-	      .LCB0:
-	   */
-	  return "beqz\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	}
-    default:
-      gcc_unreachable ();
-    }
+  return nds32_output_cbranchsi4_equality_zero (insn, operands);
 }
   [(set_attr "type" "branch")
-   (set_attr "enabled" "1")
+   (set_attr_alternative "enabled"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_string "yes")
+		     (const_string "no"))
+       ;; Alternative 1
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_string "yes")
+		     (const_string "no"))
+       ;; Alternative 2
+       (const_string "yes")
+     ])
    (set_attr_alternative "length"
      [
        ;; Alternative 0
-       (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -250))
-			  (le (minus (match_dup 2) (pc)) (const_int  250)))
-		     (if_then_else (match_test "TARGET_16_BIT")
-				   (const_int 2)
-				   (const_int 4))
-		     (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
-					(le (minus (match_dup 2) (pc)) (const_int  65500)))
-				   (const_int 4)
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -250))
+					(le (minus (match_dup 2) (pc)) (const_int  250)))
 				   (if_then_else (match_test "TARGET_16_BIT")
-						 (const_int 6)
-						 (const_int 8))))
+						 (const_int 2)
+						 (const_int 4))
+				   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
+						      (le (minus (match_dup 2) (pc)) (const_int  65500)))
+						 (const_int 4)
+						 (if_then_else (match_test "TARGET_16_BIT")
+							       (const_int 8)
+							       (const_int 10))))
+		     (const_int 10))
        ;; Alternative 1
-       (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -250))
-			  (le (minus (match_dup 2) (pc)) (const_int  250)))
-		     (if_then_else (match_test "TARGET_16_BIT")
-				   (const_int 2)
-				   (const_int 4))
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -250))
+					(le (minus (match_dup 2) (pc)) (const_int  250)))
+				   (if_then_else (match_test "TARGET_16_BIT")
+						 (const_int 2)
+						 (const_int 4))
+				   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
+						      (le (minus (match_dup 2) (pc)) (const_int  65500)))
+						 (const_int 4)
+						 (if_then_else (match_test "TARGET_16_BIT")
+							       (const_int 8)
+							       (const_int 10))))
+		     (const_int 10))
+       ;; Alternative 2
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
 		     (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
 					(le (minus (match_dup 2) (pc)) (const_int  65500)))
 				   (const_int 4)
-				   (if_then_else (match_test "TARGET_16_BIT")
-						 (const_int 6)
-						 (const_int 8))))
-       ;; Alternative 2
-       (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
-			  (le (minus (match_dup 2) (pc)) (const_int  65500)))
-		     (const_int 4)
-		     (const_int 8))
+				   (const_int 10))
+		     (const_int 10))
      ])])


 ;; This pattern is dedicated to V2 ISA,
 ;; because V2 DOES NOT HAVE beqc/bnec instruction.
-(define_insn "*cbranchsi4_equality_reg"
+(define_insn "cbranchsi4_equality_reg"
   [(set (pc)
 	(if_then_else (match_operator 0 "nds32_equality_comparison_operator"
-			[(match_operand:SI 1 "register_operand"           "r")
-			 (match_operand:SI 2 "nds32_reg_constant_operand" "r")])
+			[(match_operand:SI 1 "register_operand" "v, r")
+			 (match_operand:SI 2 "register_operand" "l, r")])
 		      (label_ref (match_operand 3 "" ""))
 		      (pc)))]
   "TARGET_ISA_V2"
 {
-  enum rtx_code code;
-
-  code = GET_CODE (operands[0]);
-
-  /* This register-comparison conditional branch has one form:
-       32-bit instruction =>          beq/bne           imm14s << 1
-
-     For 32-bit case,
-     we assume it is always reachable. (but check range -16350 ~ 16350).  */
-
-  switch (code)
-    {
-    case EQ:
-      /* r, r */
-      switch (get_attr_length (insn))
-	{
-	case 4:
-	  return "beq\t%1, %2, %3";
-	case 8:
-	  /*    beq  $r0, $r1, .L0
-	      =>
-	        bne  $r0, $r1, .LCB0
-	        j  .L0
-	      .LCB0:
-	   */
-	  return "bne\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	default:
-	  gcc_unreachable ();
-	}
-
-    case NE:
-      /* r, r */
-      switch (get_attr_length (insn))
-	{
-	case 4:
-	  return "bne\t%1, %2, %3";
-	case 8:
-	  /*    bne  $r0, $r1, .L0
-	      =>
-	        beq  $r0, $r1, .LCB0
-	        j  .L0
-	      .LCB0:
-	   */
-	  return "beq\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	default:
-	  gcc_unreachable ();
-	}
-
-    default:
-      gcc_unreachable ();
-    }
+  return nds32_output_cbranchsi4_equality_reg (insn, operands);
 }
   [(set_attr "type"   "branch")
-   (set (attr "length")
-	(if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -16350))
-			   (le (minus (match_dup 3) (pc)) (const_int  16350)))
-		      (const_int 4)
-		      (const_int 8)))])
+   (set_attr_alternative "enabled"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_string "yes")
+		     (const_string "no"))
+       ;; Alternative 1
+       (const_string "yes")
+     ])
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -250))
+					(le (minus (match_dup 3) (pc)) (const_int  250)))
+				   (const_int 2)
+				   (if_then_else (and (ge (minus (match_dup 3) (pc))
+							  (const_int -16350))
+						      (le (minus (match_dup 3) (pc))
+							  (const_int  16350)))
+						 (const_int 4)
+						 (const_int 8)))
+		     (const_int 8))
+       ;; Alternative 1
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -16350))
+					(le (minus (match_dup 3) (pc)) (const_int  16350)))
+				   (const_int 4)
+				   (const_int 10))
+		     (const_int 10))
+     ])])


 ;; This pattern is dedicated to V3/V3M,
 ;; because V3/V3M DO HAVE beqc/bnec instruction.
-(define_insn "*cbranchsi4_equality_reg_or_const_int"
+(define_insn "cbranchsi4_equality_reg_or_const_int"
   [(set (pc)
 	(if_then_else (match_operator 0 "nds32_equality_comparison_operator"
-			[(match_operand:SI 1 "register_operand"           "r,    r")
-			 (match_operand:SI 2 "nds32_reg_constant_operand" "r, Is11")])
+			[(match_operand:SI 1 "register_operand"      "v, r,    r")
+			 (match_operand:SI 2 "nds32_rimm11s_operand" "l, r, Is11")])
 		      (label_ref (match_operand 3 "" ""))
 		      (pc)))]
   "TARGET_ISA_V3 || TARGET_ISA_V3M"
 {
-  enum rtx_code code;
-
-  code = GET_CODE (operands[0]);
-
-  /* This register-comparison conditional branch has one form:
-       32-bit instruction =>          beq/bne           imm14s << 1
-       32-bit instruction =>         beqc/bnec          imm8s << 1
-
-     For 32-bit case, we assume it is always reachable.
-     (but check range -16350 ~ 16350 and -250 ~ 250).  */
-
-  switch (code)
-    {
-    case EQ:
-      if (which_alternative == 0)
-	{
-	  /* r, r */
-	  switch (get_attr_length (insn))
-	    {
-	    case 4:
-	      return "beq\t%1, %2, %3";
-	    case 8:
-	      /*    beq  $r0, $r1, .L0
-	          =>
-	            bne  $r0, $r1, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "bne\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-      else
-	{
-	  /* r, Is11 */
-	  switch (get_attr_length (insn))
-	    {
-	    case 4:
-	      return "beqc\t%1, %2, %3";
-	    case 8:
-	      /*    beqc  $r0, constant, .L0
-	          =>
-	            bnec  $r0, constant, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "bnec\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-    case NE:
-      if (which_alternative == 0)
-	{
-	  /* r, r */
-	  switch (get_attr_length (insn))
-	    {
-	    case 4:
-	      return "bne\t%1, %2, %3";
-	    case 8:
-	      /*    bne  $r0, $r1, .L0
-	          =>
-	            beq  $r0, $r1, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "beq\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-      else
-	{
-	  /* r, Is11 */
-	  switch (get_attr_length (insn))
-	    {
-	    case 4:
-	      return "bnec\t%1, %2, %3";
-	    case 8:
-	      /*    bnec  $r0, constant, .L0
-	          =>
-	            beqc  $r0, constant, .LCB0
-	            j  .L0
-	          .LCB0:
-	       */
-	      return "beqc\t%1, %2, .LCB%=\;j\t%3\n.LCB%=:";
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-    default:
-      gcc_unreachable ();
-    }
+  return nds32_output_cbranchsi4_equality_reg_or_const_int (insn, operands);
 }
   [(set_attr "type"   "branch")
+   (set_attr_alternative "enabled"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_string "yes")
+		     (const_string "no"))
+       ;; Alternative 1
+       (const_string "yes")
+       ;; Alternative 2
+       (const_string "yes")
+     ])
    (set_attr_alternative "length"
      [
        ;; Alternative 0
-       (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -16350))
-			  (le (minus (match_dup 3) (pc)) (const_int  16350)))
-		     (const_int 4)
-		     (const_int 8))
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -250))
+					(le (minus (match_dup 3) (pc)) (const_int  250)))
+				   (const_int 2)
+				   (if_then_else (and (ge (minus (match_dup 3) (pc))
+							  (const_int -16350))
+						      (le (minus (match_dup 3) (pc))
+							  (const_int  16350)))
+						 (const_int 4)
+						 (const_int 8)))
+		    (const_int 8))
        ;; Alternative 1
-       (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -250))
-			  (le (minus (match_dup 3) (pc)) (const_int  250)))
-		     (const_int 4)
-		     (const_int 8))
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -16350))
+					(le (minus (match_dup 3) (pc)) (const_int  16350)))
+				   (const_int 4)
+				   (const_int 10))
+		    (const_int 10))
+       ;; Alternative 2
+       (if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		     (if_then_else (and (ge (minus (match_dup 3) (pc)) (const_int -250))
+					(le (minus (match_dup 3) (pc)) (const_int  250)))
+				   (const_int 4)
+				   (const_int 10))
+		    (const_int 10))
      ])])


@@ -1529,80 +1340,16 @@ create_template:
 		      (pc)))]
   ""
 {
-  enum rtx_code code;
-
-  code = GET_CODE (operands[0]);
-
-  /* This zero-greater-less-comparison conditional branch has one form:
-       32-bit instruction =>      bgtz/bgez/bltz/blez     imm16s << 1
-
-     For 32-bit case, we assume it is always reachable.
-     (but check range -65500 ~ 65500).  */
-
-  if (get_attr_length (insn) == 8)
-    {
-      /* The branch target is too far to simply use one
-         bgtz/bgez/bltz/blez instruction.
-         We need to reverse condition and use 'j' to jump to the target.  */
-      switch (code)
-	{
-	case GT:
-	  /*   bgtz  $r8, .L0
-	     =>
-	       blez  $r8, .LCB0
-	       j  .L0
-	     .LCB0:
-	   */
-	  return "blez\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	case GE:
-	  /*   bgez  $r8, .L0
-	     =>
-	       bltz  $r8, .LCB0
-	       j  .L0
-	     .LCB0:
-	   */
-	  return "bltz\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	case LT:
-	  /*   bltz  $r8, .L0
-	     =>
-	       bgez  $r8, .LCB0
-	       j  .L0
-	     .LCB0:
-	   */
-	  return "bgez\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	case LE:
-	  /*   blez  $r8, .L0
-	     =>
-	       bgtz  $r8, .LCB0
-	       j  .L0
-	     .LCB0:
-	   */
-	  return "bgtz\t%1, .LCB%=\;j\t%2\n.LCB%=:";
-	default:
-	  gcc_unreachable ();
-	}
-    }
-
-  switch (code)
-    {
-    case GT:
-      return "bgtz\t%1, %2";
-    case GE:
-      return "bgez\t%1, %2";
-    case LT:
-      return "bltz\t%1, %2";
-    case LE:
-      return "blez\t%1, %2";
-    default:
-      gcc_unreachable ();
-    }
+  return nds32_output_cbranchsi4_greater_less_zero (insn, operands);
 }
   [(set_attr "type"   "branch")
    (set (attr "length")
-        (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
-			   (le (minus (match_dup 2) (pc)) (const_int  65500)))
-		      (const_int 4)
-		      (const_int 8)))])
+	(if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		      (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -65500))
+					 (le (minus (match_dup 2) (pc)) (const_int  65500)))
+				    (const_int 4)
+				    (const_int 10))
+		      (const_int 10)))])


 (define_expand "cstoresi4"
@@ -1612,237 +1359,85 @@ create_template:
 	   (match_operand:SI 3 "nonmemory_operand" "")]))]
   ""
 {
-  rtx tmp_reg;
-  enum rtx_code code;
-
-  code = GET_CODE (operands[1]);
-
-  switch (code)
+  enum nds32_expand_result_type result = nds32_expand_cstore (operands);
+  switch (result)
     {
-    case EQ:
-      if (GET_CODE (operands[3]) == CONST_INT)
-	{
-	  /* reg_R = (reg_A == const_int_B)
-	     --> addi reg_C, reg_A, -const_int_B
-	         slti reg_R, reg_C, const_int_1 */
-	  tmp_reg = gen_reg_rtx (SImode);
-	  operands[3] = gen_int_mode (-INTVAL (operands[3]), SImode);
-	  /* If the integer value is not in the range of imm15s,
-	     we need to force register first because our addsi3 pattern
-	     only accept nds32_rimm15s_operand predicate.  */
-	  if (!satisfies_constraint_Is15 (operands[3]))
-	    operands[3] = force_reg (SImode, operands[3]);
-	  emit_insn (gen_addsi3 (tmp_reg, operands[2], operands[3]));
-	  emit_insn (gen_slt_compare (operands[0], tmp_reg, const1_rtx));
-
-	  DONE;
-	}
-      else
-	{
-	  /* reg_R = (reg_A == reg_B)
-	     --> xor  reg_C, reg_A, reg_B
-	         slti reg_R, reg_C, const_int_1 */
-	  tmp_reg = gen_reg_rtx (SImode);
-	  emit_insn (gen_xorsi3 (tmp_reg, operands[2], operands[3]));
-	  emit_insn (gen_slt_compare (operands[0], tmp_reg, const1_rtx));
-
-	  DONE;
-	}
-
-    case NE:
-      if (GET_CODE (operands[3]) == CONST_INT)
-	{
-	  /* reg_R = (reg_A != const_int_B)
-	     --> addi reg_C, reg_A, -const_int_B
-	         slti reg_R, const_int_0, reg_C */
-	  tmp_reg = gen_reg_rtx (SImode);
-	  operands[3] = gen_int_mode (-INTVAL (operands[3]), SImode);
-	  /* If the integer value is not in the range of imm15s,
-	     we need to force register first because our addsi3 pattern
-	     only accept nds32_rimm15s_operand predicate.  */
-	  if (!satisfies_constraint_Is15 (operands[3]))
-	    operands[3] = force_reg (SImode, operands[3]);
-	  emit_insn (gen_addsi3 (tmp_reg, operands[2], operands[3]));
-	  emit_insn (gen_slt_compare (operands[0], const0_rtx, tmp_reg));
-
-	  DONE;
-	}
-      else
-	{
-	  /* reg_R = (reg_A != reg_B)
-	     --> xor  reg_C, reg_A, reg_B
-	         slti reg_R, const_int_0, reg_C */
-	  tmp_reg = gen_reg_rtx (SImode);
-	  emit_insn (gen_xorsi3 (tmp_reg, operands[2], operands[3]));
-	  emit_insn (gen_slt_compare (operands[0], const0_rtx, tmp_reg));
-
-	  DONE;
-	}
-
-    case GT:
-    case GTU:
-      /* reg_R = (reg_A > reg_B)       --> slt reg_R, reg_B, reg_A */
-      /* reg_R = (reg_A > const_int_B) --> slt reg_R, const_int_B, reg_A */
-      if (code == GT)
-	{
-	  /* GT, use slts instruction */
-	  emit_insn (gen_slts_compare (operands[0], operands[3], operands[2]));
-	}
-      else
-	{
-	  /* GTU, use slt instruction */
-	  emit_insn (gen_slt_compare  (operands[0], operands[3], operands[2]));
-	}
-
+    case EXPAND_DONE:
       DONE;
-
-    case GE:
-    case GEU:
-      if (GET_CODE (operands[3]) == CONST_INT)
-	{
-	  /* reg_R = (reg_A >= const_int_B)
-	     --> movi reg_C, const_int_B - 1
-	         slt  reg_R, reg_C, reg_A */
-	  tmp_reg = gen_reg_rtx (SImode);
-
-	  emit_insn (gen_movsi (tmp_reg,
-				gen_int_mode (INTVAL (operands[3]) - 1,
-					      SImode)));
-	  if (code == GE)
-	    {
-	      /* GE, use slts instruction */
-	      emit_insn (gen_slts_compare (operands[0], tmp_reg, operands[2]));
-	    }
-	  else
-	    {
-	      /* GEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (operands[0], tmp_reg, operands[2]));
-	    }
-
-	  DONE;
-	}
-      else
-	{
-	  /* reg_R = (reg_A >= reg_B)
-	     --> slt  reg_R, reg_A, reg_B
-	         xori reg_R, reg_R, const_int_1 */
-	  if (code == GE)
-	    {
-	      /* GE, use slts instruction */
-	      emit_insn (gen_slts_compare (operands[0],
-					   operands[2], operands[3]));
-	    }
-	  else
-	    {
-	      /* GEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (operands[0],
-					   operands[2], operands[3]));
-	    }
-
-	  /* perform 'not' behavior */
-	  emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
-
-	  DONE;
-	}
-
-    case LT:
-    case LTU:
-      /* reg_R = (reg_A < reg_B)       --> slt reg_R, reg_A, reg_B */
-      /* reg_R = (reg_A < const_int_B) --> slt reg_R, reg_A, const_int_B */
-      if (code == LT)
-	{
-	  /* LT, use slts instruction */
-	  emit_insn (gen_slts_compare (operands[0], operands[2], operands[3]));
-	}
-      else
-	{
-	  /* LTU, use slt instruction */
-	  emit_insn (gen_slt_compare  (operands[0], operands[2], operands[3]));
-	}
-
-      DONE;
-
-    case LE:
-    case LEU:
-      if (GET_CODE (operands[3]) == CONST_INT)
-	{
-	  /* reg_R = (reg_A <= const_int_B)
-	     --> movi reg_C, const_int_B + 1
-	         slt  reg_R, reg_A, reg_C */
-	  tmp_reg = gen_reg_rtx (SImode);
-
-	  emit_insn (gen_movsi (tmp_reg,
-				gen_int_mode (INTVAL (operands[3]) + 1,
-						      SImode)));
-	  if (code == LE)
-	    {
-	      /* LE, use slts instruction */
-	      emit_insn (gen_slts_compare (operands[0], operands[2], tmp_reg));
-	    }
-	  else
-	    {
-	      /* LEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (operands[0], operands[2], tmp_reg));
-	    }
-
-	  DONE;
-	}
-      else
-	{
-	  /* reg_R = (reg_A <= reg_B) --> slt  reg_R, reg_B, reg_A
-	                                  xori reg_R, reg_R, const_int_1 */
-	  if (code == LE)
-	    {
-	      /* LE, use slts instruction */
-	      emit_insn (gen_slts_compare (operands[0],
-					   operands[3], operands[2]));
-	    }
-	  else
-	    {
-	      /* LEU, use slt instruction */
-	      emit_insn (gen_slt_compare  (operands[0],
-					   operands[3], operands[2]));
-	    }
-
-	  /* perform 'not' behavior */
-	  emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
-
-	  DONE;
-	}
-
-
+      break;
+    case EXPAND_FAIL:
+      FAIL;
+      break;
+    case EXPAND_CREATE_TEMPLATE:
+      break;
     default:
       gcc_unreachable ();
     }
 })


-(define_insn "slts_compare"
-  [(set (match_operand:SI 0 "register_operand"         "=t,    t, r,    r")
-	(lt:SI (match_operand:SI 1 "nonmemory_operand" " d,    d, r,    r")
-	       (match_operand:SI 2 "nonmemory_operand" " r, Iu05, r, Is15")))]
+(define_expand "slts_compare"
+  [(set (match_operand:SI 0 "register_operand"       "")
+	(lt:SI (match_operand:SI 1 "general_operand" "")
+	       (match_operand:SI 2 "general_operand" "")))]
+  ""
+{
+  if (!REG_P (operands[1]))
+    operands[1] = force_reg (SImode, operands[1]);
+
+  if (!REG_P (operands[2]) && !satisfies_constraint_Is15 (operands[2]))
+    operands[2] = force_reg (SImode, operands[2]);
+})
+
+(define_insn "slts_compare_impl"
+  [(set (match_operand:SI 0 "register_operand"             "=t,   t, r,    r")
+	(lt:SI (match_operand:SI 1 "register_operand"      " d,   d, r,    r")
+	       (match_operand:SI 2 "nds32_rimm15s_operand" " r,Iu05, r, Is15")))]
   ""
   "@
    slts45\t%1, %2
    sltsi45\t%1, %2
    slts\t%0, %1, %2
    sltsi\t%0, %1, %2"
-  [(set_attr "type"   "compare,compare,compare,compare")
-   (set_attr "length" "      2,      2,      4,      4")])
+  [(set_attr "type"   "alu,    alu,    alu,    alu")
+   (set_attr "length" "  2,      2,      4,      4")])
+
+(define_insn "slt_eq0"
+  [(set (match_operand:SI 0 "register_operand"        "=t, r")
+	(eq:SI (match_operand:SI 1 "register_operand" " d, r")
+	       (const_int 0)))]
+  ""
+  "@
+   slti45\t%1, 1
+   slti\t%0, %1, 1"
+  [(set_attr "type"   "alu, alu")
+   (set_attr "length" "  2,   4")])
+
+(define_expand "slt_compare"
+  [(set (match_operand:SI 0 "register_operand"        "")
+	(ltu:SI (match_operand:SI 1 "general_operand" "")
+		(match_operand:SI 2 "general_operand" "")))]
+  ""
+{
+  if (!REG_P (operands[1]))
+    operands[1] = force_reg (SImode, operands[1]);

-(define_insn "slt_compare"
-  [(set (match_operand:SI 0 "register_operand"          "=t,    t, r,    r")
-	(ltu:SI (match_operand:SI 1 "nonmemory_operand" " d,    d, r,    r")
-		(match_operand:SI 2 "nonmemory_operand" " r, Iu05, r, Is15")))]
+  if (!REG_P (operands[2]) && !satisfies_constraint_Is15 (operands[2]))
+    operands[2] = force_reg (SImode, operands[2]);
+})
+
+(define_insn "slt_compare_impl"
+  [(set (match_operand:SI 0 "register_operand"              "=t,   t, r,    r")
+	(ltu:SI (match_operand:SI 1 "register_operand"      " d,   d, r,    r")
+		(match_operand:SI 2 "nds32_rimm15s_operand" " r,Iu05, r, Is15")))]
   ""
   "@
    slt45\t%1, %2
    slti45\t%1, %2
    slt\t%0, %1, %2
    slti\t%0, %1, %2"
-  [(set_attr "type"   "compare,compare,compare,compare")
-   (set_attr "length" "      2,      2,      4,      4")])
-
+  [(set_attr "type"   "alu,    alu,    alu,    alu")
+   (set_attr "length" "  2,      2,      4,      4")])

 ;; ----------------------------------------------------------------------------

@@ -1874,12 +1469,14 @@ create_template:
     }
 }
   [(set_attr "type" "branch")
-   (set_attr "enabled" "1")
+   (set_attr "enabled" "yes")
    (set (attr "length")
-	(if_then_else (and (ge (minus (match_dup 0) (pc)) (const_int -250))
-			   (le (minus (match_dup 0) (pc)) (const_int  250)))
-		      (if_then_else (match_test "TARGET_16_BIT")
-				    (const_int 2)
+	(if_then_else (match_test "!CROSSING_JUMP_P (insn)")
+		      (if_then_else (and (ge (minus (match_dup 0) (pc)) (const_int -250))
+					 (le (minus (match_dup 0) (pc)) (const_int  250)))
+				    (if_then_else (match_test "TARGET_16_BIT")
+						  (const_int 2)
+						  (const_int 4))
 				    (const_int 4))
 		      (const_int 4)))])

@@ -1887,14 +1484,27 @@ create_template:
   [(set (pc) (match_operand:SI 0 "register_operand" "r, r"))]
   ""
   "@
-  jr5\t%0
-  jr\t%0"
+   jr5\t%0
+   jr\t%0"
   [(set_attr "type"   "branch,branch")
    (set_attr "length" "     2,     4")])

+(define_insn "*cond_indirect_jump"
+  [(cond_exec (ne (match_operand:SI 0 "register_operand"       "r")
+		  (const_int 0))
+	      (set (pc) (match_operand:SI 1 "register_operand" "0")))]
+  ""
+  "jrnez\t%0"
+  [(set_attr "type"   "branch")
+   (set_attr "length"      "4")])
+
+;; ----------------------------------------------------------------------------
+
+;; Normal call patterns.
+
 ;; Subroutine call instruction returning no value.
 ;;   operands[0]: It should be a mem RTX whose address is
-;;                the address of the function.
+;;                the address of the function.
 ;;   operands[1]: It is the number of bytes of arguments pushed as a const_int.
 ;;   operands[2]: It is the number of registers used as operands.

@@ -1904,39 +1514,114 @@ create_template:
 	      (clobber (reg:SI LP_REGNUM))
 	      (clobber (reg:SI TA_REGNUM))])]
   ""
-  ""
+  {
+    rtx insn;
+    rtx sym = XEXP (operands[0], 0);
+
+    if (TARGET_ICT_MODEL_LARGE
+	&& nds32_indirect_call_referenced_p (sym))
+      {
+	rtx reg = gen_reg_rtx (Pmode);
+	emit_move_insn (reg, sym);
+	operands[0] = gen_const_mem (Pmode, reg);
+      }
+
+    if (flag_pic)
+      {
+	insn = emit_call_insn (gen_call_internal
+			       (XEXP (operands[0], 0), GEN_INT (0)));
+	use_reg (&CALL_INSN_FUNCTION_USAGE (insn), pic_offset_table_rtx);
+	DONE;
+      }
+  }
 )

-(define_insn "*call_register"
-  [(parallel [(call (mem (match_operand:SI 0 "register_operand" "r, r"))
-		    (match_operand 1))
-	      (clobber (reg:SI LP_REGNUM))
-	      (clobber (reg:SI TA_REGNUM))])]
-  ""
-  "@
-  jral5\t%0
-  jral\t%0"
-  [(set_attr "type"   "branch,branch")
-   (set_attr "length" "     2,     4")])
-
-(define_insn "*call_immediate"
-  [(parallel [(call (mem (match_operand:SI 0 "immediate_operand" "i"))
+(define_insn "call_internal"
+  [(parallel [(call (mem (match_operand:SI 0 "nds32_call_address_operand" "r, i"))
 		    (match_operand 1))
 	      (clobber (reg:SI LP_REGNUM))
 	      (clobber (reg:SI TA_REGNUM))])]
   ""
 {
-  if (TARGET_CMODEL_LARGE)
-    return "bal\t%0";
-  else
-    return "jal\t%0";
+  rtx_insn *next_insn = next_active_insn (insn);
+  bool align_p = (!(next_insn && get_attr_length (next_insn) == 2))
+		 && NDS32_ALIGN_P ();
+  switch (which_alternative)
+    {
+    case 0:
+      if (TARGET_16_BIT)
+	{
+	  if (align_p)
+	    return "jral5\t%0\;.align 2";
+	  else
+	    return "jral5\t%0";
+	}
+      else
+	{
+	  if (align_p)
+	    return "jral\t%0\;.align 2";
+	  else
+	    return "jral\t%0";
+	}
+    case 1:
+      return nds32_output_call (insn, operands, operands[0],
+				"bal\t%0", "jal\t%0", align_p);
+    default:
+      gcc_unreachable ();
+    }
 }
-  [(set_attr "type"   "branch")
-   (set (attr "length")
-	(if_then_else (match_test "TARGET_CMODEL_LARGE")
-		      (const_int 12)
-		      (const_int 4)))])
+  [(set_attr "enabled" "yes")
+   (set_attr "type" "branch")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_int 2)
+		     (const_int 4))
+       ;; Alternative 1
+       (if_then_else (match_test "flag_pic")
+		     (const_int 16)
+		     (if_then_else (match_test "nds32_long_call_p (operands[0])")
+				   (const_int 12)
+				   (const_int 4)))
+     ])]
+)

+(define_insn "*cond_call_register"
+  [(cond_exec (ne (match_operand:SI 0 "register_operand"                   "r")
+		  (const_int 0))
+	      (parallel [(call (mem (match_operand:SI 1 "register_operand" "0"))
+			       (match_operand 2))
+			 (clobber (reg:SI LP_REGNUM))
+			 (clobber (reg:SI TA_REGNUM))]))]
+  "TARGET_ISA_V3"
+  "jralnez\t%0"
+  [(set_attr "type"   "branch")
+   (set_attr "length"      "4")])
+
+(define_insn "*cond_call_immediate"
+  [(cond_exec (match_operator 0 "nds32_conditional_call_comparison_operator"
+		[(match_operand:SI 1 "register_operand"                     "r")
+		 (const_int 0)])
+	      (parallel [(call (mem (match_operand:SI 2 "nds32_symbolic_operand" "i"))
+			       (match_operand 3))
+			 (clobber (reg:SI LP_REGNUM))
+			 (clobber (reg:SI TA_REGNUM))]))]
+  "!flag_pic && !TARGET_CMODEL_LARGE
+   && nds32_indirect_call_referenced_p (operands[2])"
+{
+  switch (GET_CODE (operands[0]))
+    {
+    case LT:
+      return "bltzal\t%1, %2";
+    case GE:
+      return "bgezal\t%1, %2";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type"    "branch")
+   (set_attr "length"       "4")])

 ;; Subroutine call instruction returning a value.
 ;;   operands[0]: It is the hard regiser in which the value is returned.
@@ -1951,49 +1636,152 @@ create_template:
 	      (clobber (reg:SI LP_REGNUM))
 	      (clobber (reg:SI TA_REGNUM))])]
   ""
-  ""
+  {
+    rtx insn;
+    rtx sym = XEXP (operands[1], 0);
+
+    if (TARGET_ICT_MODEL_LARGE
+	&& nds32_indirect_call_referenced_p (sym))
+      {
+	rtx reg = gen_reg_rtx (Pmode);
+	emit_move_insn (reg, sym);
+	operands[1] = gen_const_mem (Pmode, reg);
+      }
+
+    if (flag_pic)
+      {
+	insn =
+	  emit_call_insn (gen_call_value_internal
+			  (operands[0], XEXP (operands[1], 0), GEN_INT (0)));
+	use_reg (&CALL_INSN_FUNCTION_USAGE (insn), pic_offset_table_rtx);
+	DONE;
+      }
+  }
 )

-(define_insn "*call_value_register"
+(define_insn "call_value_internal"
   [(parallel [(set (match_operand 0)
-		   (call (mem (match_operand:SI 1 "register_operand" "r, r"))
+		   (call (mem (match_operand:SI 1 "nds32_call_address_operand" "r, i"))
 		         (match_operand 2)))
 	      (clobber (reg:SI LP_REGNUM))
 	      (clobber (reg:SI TA_REGNUM))])]
   ""
-  "@
-  jral5\t%1
-  jral\t%1"
-  [(set_attr "type"   "branch,branch")
-   (set_attr "length" "     2,     4")])
-
-(define_insn "*call_value_immediate"
-  [(parallel [(set (match_operand 0)
-		   (call (mem (match_operand:SI 1 "immediate_operand" "i"))
-			 (match_operand 2)))
-	      (clobber (reg:SI LP_REGNUM))
-	      (clobber (reg:SI TA_REGNUM))])]
-  ""
 {
-  if (TARGET_CMODEL_LARGE)
-    return "bal\t%1";
-  else
-    return "jal\t%1";
+  rtx_insn *next_insn = next_active_insn (insn);
+  bool align_p = (!(next_insn && get_attr_length (next_insn) == 2))
+		 && NDS32_ALIGN_P ();
+  switch (which_alternative)
+    {
+    case 0:
+      if (TARGET_16_BIT)
+	{
+	  if (align_p)
+	    return "jral5\t%1\;.align 2";
+	  else
+	    return "jral5\t%1";
+	}
+      else
+	{
+	  if (align_p)
+	    return "jral\t%1\;.align 2";
+	  else
+	    return "jral\t%1";
+	}
+    case 1:
+      return nds32_output_call (insn, operands, operands[1],
+				"bal\t%1", "jal\t%1", align_p);
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "enabled" "yes")
+   (set_attr "type" "branch")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_int 2)
+		     (const_int 4))
+       ;; Alternative 1
+       (if_then_else (match_test "flag_pic")
+		     (const_int 16)
+		     (if_then_else (match_test "nds32_long_call_p (operands[1])")
+				   (const_int 12)
+				   (const_int 4)))
+     ])]
+)
+
+(define_insn "*cond_call_value_register"
+  [(cond_exec (ne (match_operand:SI 0 "register_operand"                        "r")
+		  (const_int 0))
+	      (parallel [(set (match_operand 1)
+			      (call (mem (match_operand:SI 2 "register_operand" "0"))
+				    (match_operand 3)))
+			 (clobber (reg:SI LP_REGNUM))
+			 (clobber (reg:SI TA_REGNUM))]))]
+  "TARGET_ISA_V3"
+  "jralnez\t%0"
+  [(set_attr "type"    "branch")
+   (set_attr "length"       "4")])
+
+(define_insn "*cond_call_value_immediate"
+  [(cond_exec (match_operator 0 "nds32_conditional_call_comparison_operator"
+		[(match_operand:SI 1 "register_operand"                          "r")
+		 (const_int 0)])
+	      (parallel [(set (match_operand 2)
+			      (call (mem (match_operand:SI 3 "nds32_symbolic_operand" "i"))
+				    (match_operand 4)))
+			 (clobber (reg:SI LP_REGNUM))
+			 (clobber (reg:SI TA_REGNUM))]))]
+  "!flag_pic && !TARGET_CMODEL_LARGE
+   && nds32_indirect_call_referenced_p (operands[3])"
+{
+  switch (GET_CODE (operands[0]))
+    {
+    case LT:
+      return "bltzal\t%1, %3";
+    case GE:
+      return "bgezal\t%1, %3";
+    default:
+      gcc_unreachable ();
+    }
 }
   [(set_attr "type"   "branch")
-   (set (attr "length")
-	(if_then_else (match_test "TARGET_CMODEL_LARGE")
-		      (const_int 12)
-		      (const_int 4)))])
+   (set_attr "length"      "4")])
+
+;; Call subroutine returning any type.
+
+(define_expand "untyped_call"
+  [(parallel [(call (match_operand 0 "" "")
+		    (const_int 0))
+	      (match_operand 1 "" "")
+	      (match_operand 2 "" "")])]
+  ""
+{
+  int i;
+
+  emit_call_insn (gen_call (operands[0], const0_rtx));
+
+  for (i = 0; i < XVECLEN (operands[2], 0); i++)
+    {
+      rtx set = XVECEXP (operands[2], 0, i);
+      emit_move_insn (SET_DEST (set), SET_SRC (set));
+    }

+  /* The optimizer does not know that the call sets the function value
+     registers we stored in the result block.  We avoid problems by
+     claiming that all hard registers are used and clobbered at this
+     point.  */
+  emit_insn (gen_blockage ());
+  DONE;
+})

 ;; ----------------------------------------------------------------------------

 ;; The sibcall patterns.

 ;; sibcall
-;; sibcall_register
-;; sibcall_immediate
+;; sibcall_internal

 (define_expand "sibcall"
   [(parallel [(call (match_operand 0 "memory_operand" "")
@@ -2001,41 +1789,60 @@ create_template:
 	      (clobber (reg:SI TA_REGNUM))
 	      (return)])]
   ""
-  ""
-)
-
-(define_insn "*sibcall_register"
-  [(parallel [(call (mem (match_operand:SI 0 "register_operand" "r, r"))
-		    (match_operand 1))
-	      (clobber (reg:SI TA_REGNUM))
-	      (return)])]
-  ""
-  "@
-   jr5\t%0
-   jr\t%0"
-  [(set_attr "type"   "branch,branch")
-   (set_attr "length" "     2,     4")])
+{
+    rtx sym = XEXP (operands[0], 0);
+
+    if (TARGET_ICT_MODEL_LARGE
+	&& nds32_indirect_call_referenced_p (sym))
+      {
+	rtx reg = gen_reg_rtx (Pmode);
+	emit_move_insn (reg, sym);
+	operands[0] = gen_const_mem (Pmode, reg);
+      }
+})

-(define_insn "*sibcall_immediate"
-  [(parallel [(call (mem (match_operand:SI 0 "immediate_operand" "i"))
+(define_insn "sibcall_internal"
+  [(parallel [(call (mem (match_operand:SI 0 "nds32_call_address_operand" "r, i"))
 		    (match_operand 1))
 	      (clobber (reg:SI TA_REGNUM))
 	      (return)])]
   ""
 {
-  if (TARGET_CMODEL_LARGE)
-    return "b\t%0";
-  else
-    return "j\t%0";
+  switch (which_alternative)
+    {
+    case 0:
+      if (TARGET_16_BIT)
+	return "jr5\t%0";
+      else
+	return "jr\t%0";
+    case 1:
+      if (nds32_long_call_p (operands[0]))
+	return "b\t%0";
+      else
+	return "j\t%0";
+    default:
+      gcc_unreachable ();
+    }
 }
-  [(set_attr "type"   "branch")
-   (set (attr "length")
-	(if_then_else (match_test "TARGET_CMODEL_LARGE")
-		      (const_int 12)
-		      (const_int 4)))])
+  [(set_attr "enabled" "yes")
+   (set_attr "type" "branch")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_int 2)
+		     (const_int 4))
+       ;; Alternative 1
+       (if_then_else (match_test "flag_pic")
+		     (const_int 16)
+		     (if_then_else (match_test "nds32_long_call_p (operands[0])")
+				   (const_int 12)
+				   (const_int 4)))
+     ])]
+)

 ;; sibcall_value
-;; sibcall_value_register
+;; sibcall_value_internal
 ;; sibcall_value_immediate

 (define_expand "sibcall_value"
@@ -2045,73 +1852,106 @@ create_template:
 	      (clobber (reg:SI TA_REGNUM))
 	      (return)])]
   ""
-  ""
-)
-
-(define_insn "*sibcall_value_register"
-  [(parallel [(set (match_operand 0)
-		   (call (mem (match_operand:SI 1 "register_operand" "r, r"))
-			 (match_operand 2)))
-	      (clobber (reg:SI TA_REGNUM))
-	      (return)])]
-  ""
-  "@
-   jr5\t%1
-   jr\t%1"
-  [(set_attr "type"   "branch,branch")
-   (set_attr "length" "     2,     4")])
+{
+    rtx sym = XEXP (operands[1], 0);
+
+    if (TARGET_ICT_MODEL_LARGE
+	&& nds32_indirect_call_referenced_p (sym))
+      {
+	rtx reg = gen_reg_rtx (Pmode);
+	emit_move_insn (reg, sym);
+	operands[1] = gen_const_mem (Pmode, reg);
+      }
+})

-(define_insn "*sibcall_value_immediate"
+(define_insn "sibcall_value_internal"
   [(parallel [(set (match_operand 0)
-		   (call (mem (match_operand:SI 1 "immediate_operand" "i"))
+		   (call (mem (match_operand:SI 1 "nds32_call_address_operand" "r, i"))
 			 (match_operand 2)))
 	      (clobber (reg:SI TA_REGNUM))
 	      (return)])]
   ""
 {
-  if (TARGET_CMODEL_LARGE)
-    return "b\t%1";
-  else
-    return "j\t%1";
+  switch (which_alternative)
+    {
+    case 0:
+      if (TARGET_16_BIT)
+	return "jr5\t%1";
+      else
+	return "jr\t%1";
+    case 1:
+      if (nds32_long_call_p (operands[1]))
+	return "b\t%1";
+      else
+	return "j\t%1";
+    default:
+      gcc_unreachable ();
+    }
 }
-  [(set_attr "type"   "branch")
-   (set (attr "length")
-	(if_then_else (match_test "TARGET_CMODEL_LARGE")
-		      (const_int 12)
-		      (const_int 4)))])
-
+  [(set_attr "enabled" "yes")
+   (set_attr "type" "branch")
+   (set_attr_alternative "length"
+     [
+       ;; Alternative 0
+       (if_then_else (match_test "TARGET_16_BIT")
+		     (const_int 2)
+		     (const_int 4))
+       ;; Alternative 1
+       (if_then_else (match_test "flag_pic")
+		     (const_int 16)
+		     (if_then_else (match_test "nds32_long_call_p (operands[1])")
+				   (const_int 12)
+				   (const_int 4)))
+     ])]
+)

 ;; ----------------------------------------------------------------------------

-;; prologue and epilogue.
+;; The prologue and epilogue.

 (define_expand "prologue" [(const_int 0)]
   ""
 {
   /* Note that only under V3/V3M ISA, we could use v3push prologue.
-     In addition, we do not want to use v3push for isr function
-     and variadic function.  */
-  if (TARGET_V3PUSH
-      && !nds32_isr_function_p (current_function_decl)
-      && (cfun->machine->va_args_size == 0))
+     In addition, we need to check if v3push is indeed available.  */
+  if (NDS32_V3PUSH_AVAILABLE_P)
     nds32_expand_prologue_v3push ();
   else
     nds32_expand_prologue ();
+
+  /* If cfun->machine->fp_as_gp_p is true, we can generate special
+     directive to guide linker doing fp-as-gp optimization.
+     However, for a naked function, which means
+     it should not have prologue/epilogue,
+     using fp-as-gp still requires saving $fp by push/pop behavior and
+     there is no benefit to use fp-as-gp on such small function.
+     So we need to make sure this function is NOT naked as well.  */
+  if (cfun->machine->fp_as_gp_p && !cfun->machine->naked_p)
+    emit_insn (gen_omit_fp_begin (gen_rtx_REG (SImode, FP_REGNUM)));
+
   DONE;
 })

 (define_expand "epilogue" [(const_int 0)]
   ""
 {
+  /* If cfun->machine->fp_as_gp_p is true, we can generate special
+     directive to guide linker doing fp-as-gp optimization.
+     However, for a naked function, which means
+     it should not have prologue/epilogue,
+     using fp-as-gp still requires saving $fp by push/pop behavior and
+     there is no benefit to use fp-as-gp on such small function.
+     So we need to make sure this function is NOT naked as well.  */
+  if (cfun->machine->fp_as_gp_p && !cfun->machine->naked_p)
+    emit_insn (gen_omit_fp_end (gen_rtx_REG (SImode, FP_REGNUM)));
+
   /* Note that only under V3/V3M ISA, we could use v3pop epilogue.
-     In addition, we do not want to use v3pop for isr function
-     and variadic function.  */
-  if (TARGET_V3PUSH
-      && !nds32_isr_function_p (current_function_decl)
-      && (cfun->machine->va_args_size == 0))
+     In addition, we need to check if v3push is indeed available.  */
+  if (NDS32_V3PUSH_AVAILABLE_P)
     nds32_expand_epilogue_v3pop (false);
   else
     nds32_expand_epilogue (false);
+
   DONE;
 })

@@ -2121,15 +1961,11 @@ create_template:
   /* Pass true to indicate that this is sibcall epilogue and
      exit from a function without the final branch back to the
      calling function.  */
-  if (TARGET_V3PUSH && !nds32_isr_function_p (current_function_decl))
-    nds32_expand_epilogue_v3pop (true);
-  else
-    nds32_expand_epilogue (true);
+  nds32_expand_epilogue (true);

   DONE;
 })

-
 ;; nop instruction.

 (define_insn "nop"
@@ -2142,7 +1978,7 @@ create_template:
     return "nop";
 }
   [(set_attr "type" "misc")
-   (set_attr "enabled" "1")
+   (set_attr "enabled" "yes")
    (set (attr "length")
 	(if_then_else (match_test "TARGET_16_BIT")
 		      (const_int 2)
@@ -2166,12 +2002,11 @@ create_template:
 {
   return nds32_output_stack_push (operands[0]);
 }
-  [(set_attr "type" "misc")
-   (set_attr "enabled" "1")
+  [(set_attr "type" "store_multiple")
+   (set_attr "combo" "12")
+   (set_attr "enabled" "yes")
    (set (attr "length")
-	(if_then_else (match_test "TARGET_V3PUSH
-				   && !nds32_isr_function_p (cfun->decl)
-				   && (cfun->machine->va_args_size == 0)")
+	(if_then_else (match_test "NDS32_V3PUSH_AVAILABLE_P")
 		      (const_int 2)
 		      (const_int 4)))])

@@ -2188,12 +2023,11 @@ create_template:
 {
   return nds32_output_stack_pop (operands[0]);
 }
-  [(set_attr "type" "misc")
-   (set_attr "enabled" "1")
+  [(set_attr "type" "load_multiple")
+   (set_attr "combo" "12")
+   (set_attr "enabled" "yes")
    (set (attr "length")
-	(if_then_else (match_test "TARGET_V3PUSH
-				   && !nds32_isr_function_p (cfun->decl)
-				   && (cfun->machine->va_args_size == 0)")
+	(if_then_else (match_test "NDS32_V3PUSH_AVAILABLE_P")
 		      (const_int 2)
 		      (const_int 4)))])

@@ -2205,34 +2039,64 @@ create_template:
 ;; Use this pattern to expand a return instruction
 ;; with simple_return rtx if no epilogue is required.
 (define_expand "return"
-  [(simple_return)]
+  [(parallel [(return)
+              (clobber (reg:SI FP_REGNUM))])]
   "nds32_can_use_return_insn ()"
-  ""
-)
+{
+  /* Emit as the simple return.  */
+  if (!cfun->machine->fp_as_gp_p
+      && cfun->machine->naked_p
+      && (cfun->machine->va_args_size == 0))
+    {
+      emit_jump_insn (gen_return_internal ());
+      DONE;
+    }
+})

 ;; This pattern is expanded only by the shrink-wrapping optimization
 ;; on paths where the function prologue has not been executed.
+;; However, such optimization may reorder the prologue/epilogue blocks
+;; together with basic blocks within function body.
+;; So we must disable this pattern if we have already decided
+;; to perform fp_as_gp optimization, which requires prologue to be
+;; first block and epilogue to be last block.
 (define_expand "simple_return"
   [(simple_return)]
-  ""
+  "!cfun->machine->fp_as_gp_p"
   ""
 )

+(define_insn "*nds32_return"
+  [(parallel [(return)
+   (clobber (reg:SI FP_REGNUM))])]
+  ""
+{
+  return nds32_output_return ();
+}
+  [(set_attr "type" "branch")
+   (set_attr "enabled" "yes")
+   (set_attr "length" "4")])
+
 (define_insn "return_internal"
   [(simple_return)]
   ""
 {
+  if (nds32_isr_function_critical_p (current_function_decl))
+    return "iret";
+
   if (TARGET_16_BIT)
     return "ret5";
   else
     return "ret";
 }
   [(set_attr "type" "branch")
-   (set_attr "enabled" "1")
+   (set_attr "enabled" "yes")
    (set (attr "length")
-	(if_then_else (match_test "TARGET_16_BIT")
-		      (const_int 2)
-		      (const_int 4)))])
+	(if_then_else (match_test "nds32_isr_function_critical_p (current_function_decl)")
+		      (const_int 4)
+		      (if_then_else (match_test "TARGET_16_BIT")
+				    (const_int 2)
+				    (const_int 4))))])


 ;; ----------------------------------------------------------------------------
@@ -2267,6 +2131,7 @@ create_template:
 {
   rtx add_tmp;
   rtx reg, test;
+  rtx tmp_reg;

   /* Step A: "k <-- (plus (operands[0]) (-operands[1]))".  */
   if (operands[1] != const0_rtx)
@@ -2275,8 +2140,8 @@ create_template:
       add_tmp = gen_int_mode (-INTVAL (operands[1]), SImode);

       /* If the integer value is not in the range of imm15s,
-         we need to force register first because our addsi3 pattern
-         only accept nds32_rimm15s_operand predicate.  */
+	 we need to force register first because our addsi3 pattern
+	 only accept nds32_rimm15s_operand predicate.  */
       add_tmp = force_reg (SImode, add_tmp);

       emit_insn (gen_addsi3 (reg, operands[0], add_tmp));
@@ -2288,11 +2153,14 @@ create_template:
   emit_jump_insn (gen_cbranchsi4 (test, operands[0], operands[2],
 				  operands[4]));

-  operands[5] = gen_reg_rtx (SImode);
-  /* Step C, D, E, and F, using another temporary register operands[5].  */
+  tmp_reg = gen_reg_rtx (SImode);
+  /* Step C, D, E, and F, using another temporary register tmp_reg.  */
+  if (flag_pic)
+    emit_use (pic_offset_table_rtx);
+
   emit_jump_insn (gen_casesi_internal (operands[0],
 				       operands[3],
-				       operands[5]));
+				       tmp_reg));
   DONE;
 })

@@ -2328,17 +2196,34 @@ create_template:
   else
     return nds32_output_casesi (operands);
 }
-  [(set_attr "length" "20")
-   (set_attr "type" "alu")])
+  [(set_attr "type" "branch")
+   (set (attr "length")
+	(if_then_else (match_test "flag_pic")
+		      (const_int 28)
+		      (const_int 20)))])

 ;; ----------------------------------------------------------------------------

 ;; Performance Extension

+; If the -fwrapv option is given, GCC expects signed overflow to wrap
+; around, so ABS(INT_MIN) must still be INT_MIN
+; (e.g. ABS(0x80000000)=0x80000000).
+; However, the hardware ABS instruction of the nds32 target
+; always performs saturation: abs 0x80000000 -> 0x7fffffff.
+; So we can only enable the abssi2 pattern if flag_wrapv is NOT set.
+(define_insn "abssi2"
+  [(set (match_operand:SI 0 "register_operand"         "=r")
+	(abs:SI (match_operand:SI 1 "register_operand" " r")))]
+  "TARGET_EXT_PERF && TARGET_HW_ABS && !flag_wrapv"
+  "abs\t%0, %1"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")])
+
 (define_insn "clzsi2"
   [(set (match_operand:SI 0 "register_operand"         "=r")
 	(clz:SI (match_operand:SI 1 "register_operand" " r")))]
-  "TARGET_PERF_EXT"
+  "TARGET_EXT_PERF"
   "clz\t%0, %1"
   [(set_attr "type" "alu")
    (set_attr "length" "4")])
@@ -2347,34 +2232,212 @@ create_template:
   [(set (match_operand:SI 0 "register_operand"          "=r")
 	(smax:SI (match_operand:SI 1 "register_operand" " r")
 		 (match_operand:SI 2 "register_operand" " r")))]
-  "TARGET_PERF_EXT"
+  "TARGET_EXT_PERF"
   "max\t%0, %1, %2"
   [(set_attr "type" "alu")
    (set_attr "length" "4")])

+(define_expand "uminqi3"
+  [(set (match_operand:QI 0 "register_operand" "")
+	(umin:QI (match_operand:QI 1 "register_operand" "")
+		 (match_operand:QI 2 "register_operand" "")))]
+  "TARGET_EXT_PERF"
+{
+  rtx tmpop[3];
+  tmpop[0] = gen_reg_rtx (SImode);
+  tmpop[1] = gen_reg_rtx (SImode);
+  tmpop[2] = gen_reg_rtx (SImode);
+
+  emit_insn (gen_zero_extendqisi2 (tmpop[1], operands[1]));
+  emit_insn (gen_zero_extendqisi2 (tmpop[2], operands[2]));
+  emit_insn (gen_sminsi3 (tmpop[0], tmpop[1], tmpop[2]));
+  convert_move (operands[0], tmpop[0], false);
+  DONE;
+})
+
+(define_expand "sminqi3"
+  [(set (match_operand:QI 0 "register_operand" "")
+	(smin:QI (match_operand:QI 1 "register_operand" "")
+		 (match_operand:QI 2 "register_operand" "")))]
+  "TARGET_EXT_PERF"
+{
+  rtx tmpop[3];
+  tmpop[0] = gen_reg_rtx (SImode);
+  tmpop[1] = gen_reg_rtx (SImode);
+  tmpop[2] = gen_reg_rtx (SImode);
+
+  emit_insn (gen_extendqisi2 (tmpop[1], operands[1]));
+  emit_insn (gen_extendqisi2 (tmpop[2], operands[2]));
+  emit_insn (gen_sminsi3 (tmpop[0], tmpop[1], tmpop[2]));
+  convert_move (operands[0], tmpop[0], false);
+  DONE;
+})
+
+(define_expand "uminhi3"
+  [(set (match_operand:HI 0 "register_operand" "")
+	(umin:HI (match_operand:HI 1 "register_operand" "")
+		 (match_operand:HI 2 "register_operand" "")))]
+  "TARGET_EXT_PERF"
+{
+  rtx tmpop[3];
+  tmpop[0] = gen_reg_rtx (SImode);
+  tmpop[1] = gen_reg_rtx (SImode);
+  tmpop[2] = gen_reg_rtx (SImode);
+
+  emit_insn (gen_zero_extendhisi2 (tmpop[1], operands[1]));
+  emit_insn (gen_zero_extendhisi2 (tmpop[2], operands[2]));
+  emit_insn (gen_sminsi3 (tmpop[0], tmpop[1], tmpop[2]));
+  convert_move (operands[0], tmpop[0], false);
+  DONE;
+})
+
+(define_expand "sminhi3"
+  [(set (match_operand:HI 0 "register_operand" "")
+	(smin:HI (match_operand:HI 1 "register_operand" "")
+		 (match_operand:HI 2 "register_operand" "")))]
+  "TARGET_EXT_PERF"
+{
+  rtx tmpop[3];
+  tmpop[0] = gen_reg_rtx (SImode);
+  tmpop[1] = gen_reg_rtx (SImode);
+  tmpop[2] = gen_reg_rtx (SImode);
+
+  emit_insn (gen_extendhisi2 (tmpop[1], operands[1]));
+  emit_insn (gen_extendhisi2 (tmpop[2], operands[2]));
+  emit_insn (gen_sminsi3 (tmpop[0], tmpop[1], tmpop[2]));
+  convert_move (operands[0], tmpop[0], false);
+  DONE;
+})
+
 (define_insn "sminsi3"
   [(set (match_operand:SI 0 "register_operand"          "=r")
 	(smin:SI (match_operand:SI 1 "register_operand" " r")
 		 (match_operand:SI 2 "register_operand" " r")))]
-  "TARGET_PERF_EXT"
+  "TARGET_EXT_PERF"
   "min\t%0, %1, %2"
   [(set_attr "type" "alu")
    (set_attr "length" "4")])

-(define_insn "*btst"
-  [(set (match_operand:SI 0 "register_operand"                   "=   r")
-	(zero_extract:SI (match_operand:SI 1 "register_operand"  "    r")
+(define_insn "btst"
+  [(set (match_operand:SI 0 "register_operand"                     "=   r")
+	(zero_extract:SI (match_operand:SI 1 "register_operand"    "    r")
 			 (const_int 1)
-			 (match_operand:SI 2 "immediate_operand" " Iu05")))]
-  "TARGET_PERF_EXT"
+			 (match_operand:SI 2 "nds32_imm5u_operand" " Iu05")))]
+  "TARGET_EXT_PERF"
   "btst\t%0, %1, %2"
   [(set_attr "type" "alu")
    (set_attr "length" "4")])

+(define_insn "ave"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(truncate:SI
+	  (ashiftrt:DI
+	    (plus:DI
+	      (plus:DI
+		(sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
+		(sign_extend:DI (match_operand:SI 2 "register_operand" "r")))
+	      (const_int 1))
+	  (const_int 1))))]
+  "TARGET_EXT_PERF"
+  "ave\t%0, %1, %2"
+  [(set_attr "type" "alu")
+   (set_attr "length" "4")])
+
 ;; ----------------------------------------------------------------------------

 ;; Pseudo NOPs

+;; Structural hazards NOP
+(define_insn "nop_res_dep"
+  [(unspec [(match_operand 0 "const_int_operand" "i")] UNSPEC_VOLATILE_RES_DEP)]
+  ""
+  "! structural dependency (%0 cycles)"
+  [(set_attr "length" "0")]
+)
+
+;; Data hazards NOP
+(define_insn "nop_data_dep"
+  [(unspec [(match_operand 0 "const_int_operand" "i")] UNSPEC_VOLATILE_DATA_DEP)]
+  ""
+  "! data dependency (%0 cycles)"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "relax_group"
+  [(unspec_volatile [(match_operand:SI 0 "immediate_operand" "i")] UNSPEC_VOLATILE_RELAX_GROUP)]
+  ""
+  ".relax_hint %0"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "innermost_loop_begin"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_INNERMOST_LOOP_BEGIN)]
+  ""
+  ".innermost_loop_begin"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "innermost_loop_end"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_INNERMOST_LOOP_END)]
+  ""
+  ".innermost_loop_end"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "no_ifc_begin"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_NO_IFC_BEGIN)]
+  ""
+  ".no_ifc_begin"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "no_ifc_end"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_NO_IFC_END)]
+  ""
+  ".no_ifc_end"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "no_ex9_begin"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_NO_EX9_BEGIN)]
+  ""
+  ".no_ex9_begin"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "no_ex9_end"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_NO_EX9_END)]
+  ""
+  ".no_ex9_end"
+  [(set_attr "length" "0")]
+)
+
+(define_insn "hwloop_last_insn"
+  [(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_HWLOOP_LAST_INSN)]
+  ""
+  ""
+  [(set_attr "length" "0")]
+)
+
+;; Output .omit_fp_begin for fp-as-gp optimization.
+;; Also we have to set $fp register.
+(define_insn "omit_fp_begin"
+  [(set (match_operand:SI 0 "register_operand" "=x")
+	(unspec_volatile:SI [(const_int 0)] UNSPEC_VOLATILE_OMIT_FP_BEGIN))]
+  ""
+  "! -----\;.omit_fp_begin\;la\t$fp,_FP_BASE_\;! -----"
+  [(set_attr "length" "8")]
+)
+
+;; Output .omit_fp_end for fp-as-gp optimization.
+;; Claim that we have to use $fp register.
+(define_insn "omit_fp_end"
+  [(unspec_volatile:SI [(match_operand:SI 0 "register_operand" "x")] UNSPEC_VOLATILE_OMIT_FP_END)]
+  ""
+  "! -----\;.omit_fp_end\;! -----"
+  [(set_attr "length" "0")]
+)
+
 (define_insn "pop25return"
   [(return)
    (unspec_volatile:SI [(reg:SI LP_REGNUM)] UNSPEC_VOLATILE_POP25_RETURN)]
@@ -2383,4 +2446,262 @@ create_template:
   [(set_attr "length" "0")]
 )

+;; Add pc
+(define_insn "add_pc"
+  [(set (match_operand:SI 0 "register_operand"          "=r")
+	(plus:SI (match_operand:SI 1 "register_operand"  "0")
+		 (pc)))]
+  "TARGET_LINUX_ABI || flag_pic"
+  "add5.pc\t%0"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+(define_expand "bswapsi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(bswap:SI (match_operand:SI 1 "register_operand" "r")))]
+  ""
+{
+  emit_insn (gen_unspec_wsbh (operands[0], operands[1]));
+  emit_insn (gen_rotrsi3 (operands[0], operands[0], GEN_INT (16)));
+  DONE;
+})
+
+(define_insn "bswaphi2"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(bswap:HI (match_operand:HI 1 "register_operand" "r")))]
+  ""
+  "wsbh\t%0, %1"
+  [(set_attr "type"    "alu")
+   (set_attr "length"    "4")]
+)
+
+;;  Hardware loop
+
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+			  (ne (match_operand:SI 0 "" "")
+			      (const_int 1))
+			  (label_ref (match_operand 1 "" ""))
+			  (pc)))
+	      (set (match_dup 0)
+		   (plus:SI (match_dup 0)
+			    (const_int -1)))
+	      (unspec [(const_int 0)] UNSPEC_LOOP_END)
+	      (clobber (match_dup 2))])] ; match_scratch
+  "NDS32_HW_LOOP_P ()"
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+  operands[2] = gen_rtx_SCRATCH (SImode);
+})
+
+(define_insn "loop_end"
+  [(set (pc)
+	(if_then_else (ne (match_operand:SI 3 "nonimmediate_operand" "0, 0, *r, 0")
+			  (const_int 1))
+		      (label_ref (match_operand 1 "" ""))
+		      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=r, m, m, *f")
+	(plus:SI (match_dup 3)
+		 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LOOP_END)
+   (clobber (match_scratch:SI 2 "=X, &r, &r, &r"))]
+  "NDS32_HW_LOOP_P ()"
+  "#"
+  [(set_attr "length" "12, 12, 12, 12")])
+
+(define_split
+  [(set (pc)
+	(if_then_else (ne (match_operand:SI 3 "nonimmediate_operand" "")
+			  (const_int 1))
+		      (label_ref (match_operand 1 "" ""))
+		      (pc)))
+   (set (match_operand:SI 0 "fpu_reg_or_memory_operand" "")
+	(plus:SI (match_dup 3)
+		 (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LOOP_END)
+   (clobber (match_scratch:SI 2 ""))]
+  "NDS32_HW_LOOP_P ()"
+  [(set (match_dup 2) (plus:SI (match_dup 3) (const_int -1)))
+   (set (match_dup 0) (match_dup 2))
+   (set (pc)
+	(if_then_else (ne (match_dup 2) (const_int 0))
+		      (label_ref (match_dup 1))
+		      (pc)))]
+{
+  if (fpu_reg_or_memory_operand (operands[3], SImode))
+    {
+      emit_move_insn (operands[2], operands[3]);
+      operands[3] = operands[2];
+    }
+})
+
+(define_insn "mtlbi_hint"
+  [(set (reg:SI LB_REGNUM)
+	(match_operand:SI 0 "nds32_label_operand" "i"))
+   (unspec [(match_operand 1 "const_int_operand" "i")] UNSPEC_LOOP_END)]
+  "NDS32_HW_LOOP_P ()"
+  "mtlbi\t%0"
+  [(set_attr "length"	"4")])
+
+(define_insn "mtlbi"
+  [(set (reg:SI LB_REGNUM)
+	(match_operand:SI 0 "nds32_label_operand" "i"))]
+  "NDS32_HW_LOOP_P ()"
+  "mtlbi\t%0"
+  [(set_attr "length"	"4")])
+
+(define_insn "mtlei"
+  [(set (reg:SI LE_REGNUM)
+	(match_operand:SI 0 "nds32_label_operand" "i"))]
+  "NDS32_HW_LOOP_P ()"
+  "mtlei\t%0"
+  [(set_attr "length"	"4")])
+
+(define_insn "init_lc"
+  [(set (reg:SI LC_REGNUM)
+	(match_operand:SI 0 "register_operand" "r"))
+   (unspec [(match_operand 1 "const_int_operand" "i")] UNSPEC_LOOP_END)]
+  "NDS32_HW_LOOP_P ()"
+  "mtusr\t%0, LC"
+  [(set_attr "length"	"4")])
+
+; After replacing hwloop, use this pattern to get the right CFG.
+(define_insn "hwloop_cfg"
+  [(set (pc)
+	(if_then_else (ne (reg:SI LC_REGNUM)
+			  (const_int 1))
+		      (match_operand:SI 1 "nds32_label_operand" "i")
+		      (pc)))
+   (set (reg:SI LC_REGNUM)
+	(plus:SI (reg:SI LC_REGNUM)
+		 (const_int -1)))
+   (use (reg:SI LB_REGNUM))
+   (use (reg:SI LE_REGNUM))
+   (use (reg:SI LC_REGNUM))
+   (unspec [(match_operand 0 "const_int_operand" "i")] UNSPEC_LOOP_END)]
+  "TARGET_HWLOOP"
+  ""
+  [(set_attr "length" "0")])
+;; ----------------------------------------------------------------------------
+
+;; Patterns for exception handling
+
+(define_expand "eh_return"
+  [(use (match_operand 0 "general_operand"))]
+  ""
+{
+  emit_insn (gen_nds32_eh_return (operands[0]));
+  DONE;
+})
+
+(define_insn_and_split "nds32_eh_return"
+  [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")] UNSPEC_VOLATILE_EH_RETURN)]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+{
+  rtx place;
+  rtx addr;
+
+  /* The operands[0] is the handler address.  We need to assign it
+     to return address rtx so that we can jump to exception handler
+     when returning from current function.  */
+
+  if (cfun->machine->lp_size == 0)
+    {
+      /* If $lp is not saved in the stack frame, we can take $lp directly.  */
+      place = gen_rtx_REG (SImode, LP_REGNUM);
+    }
+  else
+    {
+      /* Otherwise, we need to locate the stack slot of return address.
+	 The return address is generally saved in [$fp-4] location.
+	 However, DSE (dead store elimination) does not detect an alias
+	 between [$fp-x] and [$sp+y].  This can result in a store to save
+	 $lp introduced by builtin_eh_return() being incorrectly deleted
+	 if it is based on $fp.  The solution we take here is to compute
+	 the offset relative to stack pointer and then use $sp to access
+	 location so that the alias can be detected.
+	 FIXME: What if the immediate value "offset" is too large to be
+	        fit in a single addi instruction?  */
+      HOST_WIDE_INT offset;
+
+      offset = (cfun->machine->fp_size
+		+ cfun->machine->gp_size
+		+ cfun->machine->lp_size
+		+ cfun->machine->callee_saved_gpr_regs_size
+		+ cfun->machine->callee_saved_area_gpr_padding_bytes
+		+ cfun->machine->callee_saved_fpr_regs_size
+		+ cfun->machine->eh_return_data_regs_size
+		+ cfun->machine->local_size
+		+ cfun->machine->out_args_size);
+
+      addr = plus_constant (Pmode, stack_pointer_rtx, offset - 4);
+      place = gen_frame_mem (SImode, addr);
+    }
+
+  emit_move_insn (place, operands[0]);
+  DONE;
+})
+
+;; ----------------------------------------------------------------------------
+
+;; Patterns for TLS.
+;; The following two tls patterns are not expanded directly because the
+;; intermediate value may be spilled into the stack, which would make it
+;; hard to analyze the define-use chain in the relax_opt pass.
+
+
+;; There is an unspec operand to record the RELAX_GROUP number because each
+;; emitted instruction needs a relax_hint above it.
+(define_insn "tls_desc"
+  [(set (reg:SI 0)
+	(call (unspec_volatile:SI [(match_operand:SI 0 "nds32_symbolic_operand" "i")] UNSPEC_TLS_DESC)
+	      (const_int 1)))
+   (use (unspec [(match_operand:SI 1 "immediate_operand" "i")] UNSPEC_VOLATILE_RELAX_GROUP))
+   (use (reg:SI GP_REGNUM))
+   (clobber (reg:SI LP_REGNUM))
+   (clobber (reg:SI TA_REGNUM))]
+  ""
+  {
+    return nds32_output_tls_desc (operands);
+  }
+  [(set_attr "length" "20")
+   (set_attr "type" "branch")]
+)
+
+;; There is an unspec operand to record the RELAX_GROUP number because each
+;; emitted instruction needs a relax_hint above it.
+(define_insn "tls_ie"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:SI 1 "nds32_symbolic_operand" "i")] UNSPEC_TLS_IE))
+   (use (unspec [(match_operand:SI 2 "immediate_operand" "i")] UNSPEC_VOLATILE_RELAX_GROUP))
+   (use (reg:SI GP_REGNUM))]
+  ""
+  {
+    return nds32_output_tls_ie (operands);
+  }
+  [(set (attr "length") (if_then_else (match_test "flag_pic")
+				      (const_int 12)
+				      (const_int 8)))
+   (set_attr "type" "misc")]
+)
+
+;; This pattern is for relaxation groups that must keep addsi3 in its 32-bit form.
+(define_insn "addsi3_32bit"
+  [(set (match_operand:SI 0 "register_operand"             "=r")
+	(unspec:SI [(match_operand:SI 1 "register_operand" "%r")
+		    (match_operand:SI 2 "register_operand" " r")] UNSPEC_ADD32))]
+  ""
+  "add\t%0, %1, %2"
+  [(set_attr "type"    "alu")
+   (set_attr "length"  "4")
+   (set_attr "feature" "v1")])
+
 ;; ----------------------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32.opt b/gcc/config/nds32/nds32.opt
index 938136f..a70ced9 100644
--- a/gcc/config/nds32/nds32.opt
+++ b/gcc/config/nds32/nds32.opt
@@ -21,14 +21,67 @@
 HeaderInclude
 config/nds32/nds32-opts.h

-mbig-endian
-Target Report RejectNegative Negative(mlittle-endian) Mask(BIG_ENDIAN)
+; ---------------------------------------------------------------
+; The following options are designed for aliasing and compatibility options.
+
+EB
+Target RejectNegative Alias(mbig-endian)
 Generate code in big-endian mode.

-mlittle-endian
-Target Report RejectNegative Negative(mbig-endian) InverseMask(BIG_ENDIAN)
+EL
+Target RejectNegative Alias(mlittle-endian)
 Generate code in little-endian mode.

+mfp-as-gp
+Target RejectNegative Alias(mforce-fp-as-gp)
+Force performing fp-as-gp optimization.
+
+mno-fp-as-gp
+Target RejectNegative Alias(mforbid-fp-as-gp)
+Forbid performing fp-as-gp optimization.
+
+m16bit
+Target Undocumented Alias(m16-bit)
+Generate 16-bit instructions.
+
+mcrt-arg=yes
+Target Undocumented Alias(mcrt-arg)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mreduce-regs
+Target Undocumented Alias(mreduced-regs)
+Use reduced-set registers for register allocation.
+
+mcache-line-size=
+Target RejectNegative Joined UInteger Undocumented Alias(mcache-block-size=)
+Alias of -mcache-block-size=
+
+; ---------------------------------------------------------------
+
+mabi=
+Target RejectNegative Joined Enum(abi_type) Var(nds32_abi) Init(TARGET_DEFAULT_ABI)
+Specify which ABI type to generate code for: 2, 2fp+.
+
+Enum
+Name(abi_type) Type(enum abi_type)
+Known ABIs (for use with the -mabi= option):
+
+EnumValue
+Enum(abi_type) String(2) Value(NDS32_ABI_V2)
+
+EnumValue
+Enum(abi_type) String(2fp+) Value(NDS32_ABI_V2_FP_PLUS)
+
+mfloat-abi=soft
+Target RejectNegative Alias(mabi=, 2)
+Use the soft floating-point ABI; this is an alias for -mabi=2.
+
+mfloat-abi=hard
+Target RejectNegative Alias(mabi=, 2fp+)
+Use the hard floating-point ABI; this is an alias for -mabi=2fp+.
+
+; ---------------------------------------------------------------
+
 mreduced-regs
 Target Report RejectNegative Negative(mfull-regs) Mask(REDUCED_REGS)
 Use reduced-set registers for register allocation.
@@ -37,14 +90,148 @@ mfull-regs
 Target Report RejectNegative Negative(mreduced-regs) InverseMask(REDUCED_REGS)
 Use full-set registers for register allocation.

+; ---------------------------------------------------------------
+
+Os1
+Target
+Optimize for size level 1. This option will disable IFC and EX9 to prevent performance drop.
+
+Os2
+Target
+Optimize for size level 2. This option will disable IFC and EX9 for innermost loop to prevent performance drop.
+
+Os3
+Target
+Optimize for size level 3, which means performance is not a concern.
+
+malways-align
+Target Mask(ALWAYS_ALIGN)
+Always align function entry, jump target and return address.
+
+malign-functions
+Target Mask(ALIGN_FUNCTION)
+Align function entry to 4 bytes.
+
+mbig-endian
+Target Undocumented RejectNegative Negative(mlittle-endian) Mask(BIG_ENDIAN)
+Generate code in big-endian mode.
+
+mlittle-endian
+Target Undocumented RejectNegative Negative(mbig-endian) InverseMask(BIG_ENDIAN)
+Generate code in little-endian mode.
+
+mforce-fp-as-gp
+Target Undocumented Mask(FORCE_FP_AS_GP)
+Prevent $fp being allocated during register allocation so that compiler is able to force performing fp-as-gp optimization.
+
+mforbid-fp-as-gp
+Target Undocumented Mask(FORBID_FP_AS_GP)
+Forbid using $fp to access static and global variables.  This option strictly forbids fp-as-gp optimization regardless of '-mforce-fp-as-gp'.
+
+minline-strcpy
+Target Undocumented Mask(INLINE_STRCPY)
+Inlining strcpy function.
+
+mload-store-opt
+Target Mask(LOAD_STORE_OPT)
+Enable load store optimization.
+
+mregrename
+Target Mask(REGRENAME_OPT)
+Enable target dependent register rename optimization.
+
+mgcse
+Target Mask(GCSE_OPT)
+Enable target dependent global CSE optimization.
+
+mconst-remater
+Target Var(flag_nds32_const_remater_opt)
+Enable target dependent constant rematerialization optimization.
+
+msoft-fp-arith-comm
+Target Mask(SOFT_FP_ARITH_COMM)
+Enable operand commutative for soft floating point arithmetic optimization.
+
+msign-conversion
+Target Var(flag_nds32_sign_conversion)
+Enable the sign conversion in Gimple level.
+
+mscalbn-transform
+Target Var(flag_nds32_scalbn_transform)
+Enable the scalbn transform in Gimple level.
+
+mlmwsmw-opt
+Target Var(flag_nds32_lmwsmw_opt)
+Enable the load/store multiple optimization.
+
+mict-model=
+Target Undocumented RejectNegative Joined Enum(nds32_ict_model_type) Var(nds32_ict_model) Init(ICT_MODEL_SMALL)
+Specify the address generation strategy for ICT call's code model.
+
+Enum
+Name(nds32_ict_model_type) Type(enum nds32_ict_model_type)
+Known cmodel types (for use with the -mict-model= option):
+
+EnumValue
+Enum(nds32_ict_model_type) String(small) Value(ICT_MODEL_SMALL)
+
+EnumValue
+Enum(nds32_ict_model_type) String(large) Value(ICT_MODEL_LARGE)
+
+mlmwsmw-cost=
+Target RejectNegative Joined Enum(lmwsmw_cost_type) Var(flag_lmwsmw_cost) Init(LMWSMW_OPT_AUTO)
+Specify the load/store insn generate to lmw/smw.
+
+Enum
+Name(lmwsmw_cost_type) Type(enum lmwsmw_cost_type)
+Known lmwsmw cost type (for use with the -mlmwsmw-cost= option):
+
+EnumValue
+Enum(lmwsmw_cost_type) String(size) Value(LMWSMW_OPT_SIZE)
+
+EnumValue
+Enum(lmwsmw_cost_type) String(speed) Value(LMWSMW_OPT_SPEED)
+
+EnumValue
+Enum(lmwsmw_cost_type) String(all) Value(LMWSMW_OPT_ALL)
+
+EnumValue
+Enum(lmwsmw_cost_type) String(auto) Value(LMWSMW_OPT_AUTO)
+
+mabi-compatible
+Target Var(flag_nds32_abi_compatible)
+Enable the ABI compatible detection.
+
+mcprop-acc
+Target Var(flag_nds32_cprop_acc)
+Enable the copy propagation for accumulate style instructions.
+
+; ---------------------------------------------------------------
+
 mcmov
 Target Report Mask(CMOV)
 Generate conditional move instructions.

-mperf-ext
-Target Report Mask(PERF_EXT)
+mhw-abs
+Target Report Mask(HW_ABS)
+Generate hardware abs instructions.
+
+mext-perf
+Target Report Mask(EXT_PERF)
 Generate performance extension instructions.

+mext-perf2
+Target Report Mask(EXT_PERF2)
+Generate performance extension version 2 instructions.
+
+mext-string
+Target Report Mask(EXT_STRING)
+Generate string extension instructions.
+
+mext-dsp
+Target Report Mask(EXT_DSP)
+Generate DSP extension instructions.
+
 mv3push
 Target Report Mask(V3PUSH)
 Generate v3 push25/pop25 instructions.
@@ -53,10 +240,22 @@ m16-bit
 Target Report Mask(16_BIT)
 Generate 16-bit instructions.

+mrelax-hint
+Target Report Mask(RELAX_HINT)
+Insert relax hint for linker to do relaxation.
+
+mvh
+Target Report Mask(VH) Condition(!TARGET_LINUX_ABI)
+Enable Virtual Hosting support.
+
 misr-vector-size=
-Target RejectNegative Joined UInteger Var(nds32_isr_vector_size) Init(NDS32_DEFAULT_ISR_VECTOR_SIZE)
+Target RejectNegative Joined UInteger Var(nds32_isr_vector_size) Init(NDS32_DEFAULT_ISR_VECTOR_SIZE) Condition(!TARGET_LINUX_ABI)
 Specify the size of each interrupt vector, which must be 4 or 16.

+misr-secure=
+Target RejectNegative Joined UInteger Var(nds32_isr_secure_level) Init(0)
+Specify the security level of c-isr for the whole file.
+
 mcache-block-size=
 Target RejectNegative Joined UInteger Var(nds32_cache_block_size) Init(NDS32_DEFAULT_CACHE_BLOCK_SIZE)
 Specify the size of each cache block, which must be a power of 2 between 4 and 512.
@@ -73,32 +272,418 @@ EnumValue
 Enum(nds32_arch_type) String(v2) Value(ARCH_V2)

 EnumValue
+Enum(nds32_arch_type) String(v2j) Value(ARCH_V2J)
+
+EnumValue
 Enum(nds32_arch_type) String(v3) Value(ARCH_V3)

 EnumValue
+Enum(nds32_arch_type) String(v3j) Value(ARCH_V3J)
+
+EnumValue
 Enum(nds32_arch_type) String(v3m) Value(ARCH_V3M)

-mcmodel=
-Target RejectNegative Joined Enum(nds32_cmodel_type) Var(nds32_cmodel_option) Init(CMODEL_MEDIUM)
-Specify the address generation strategy for code model.
+EnumValue
+Enum(nds32_arch_type) String(v3m+) Value(ARCH_V3M_PLUS)
+
+EnumValue
+Enum(nds32_arch_type) String(v3f) Value(ARCH_V3F)
+
+EnumValue
+Enum(nds32_arch_type) String(v3s) Value(ARCH_V3S)
+
+mcpu=
+Target RejectNegative Joined Enum(nds32_cpu_type) Var(nds32_cpu_option) Init(CPU_N9)
+Specify the cpu for pipeline model.

 Enum
-Name(nds32_cmodel_type) Type(enum nds32_cmodel_type)
-Known cmodel types (for use with the -mcmodel= option):
+Name(nds32_cpu_type) Type(enum nds32_cpu_type)
+Known cpu types (for use with the -mcpu= option):
+
+EnumValue
+Enum(nds32_cpu_type) String(n6) Value(CPU_N6)
+
+EnumValue
+Enum(nds32_cpu_type) String(n650) Value(CPU_N6)
+
+EnumValue
+Enum(nds32_cpu_type) String(n7) Value(CPU_N7)
+
+EnumValue
+Enum(nds32_cpu_type) String(n705) Value(CPU_N7)
+
+EnumValue
+Enum(nds32_cpu_type) String(n8) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(n801) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(sn8) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(sn801) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(s8) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(s801) Value(CPU_N8)
+
+EnumValue
+Enum(nds32_cpu_type) String(e8) Value(CPU_E8)
+
+EnumValue
+Enum(nds32_cpu_type) String(e801) Value(CPU_E8)
+
+EnumValue
+Enum(nds32_cpu_type) String(n820) Value(CPU_E8)
+
+EnumValue
+Enum(nds32_cpu_type) String(s830) Value(CPU_E8)
+
+EnumValue
+Enum(nds32_cpu_type) String(e830) Value(CPU_E8)
+
+EnumValue
+Enum(nds32_cpu_type) String(n9) Value(CPU_N9)
+
+EnumValue
+Enum(nds32_cpu_type) String(n903) Value(CPU_N9)
+
+EnumValue
+Enum(nds32_cpu_type) String(n903a) Value(CPU_N9)
+
+EnumValue
+Enum(nds32_cpu_type) String(n968) Value(CPU_N9)
+
+EnumValue
+Enum(nds32_cpu_type) String(n968a) Value(CPU_N9)
+
+EnumValue
+Enum(nds32_cpu_type) String(n10) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1033) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1033a) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1033-fpu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1033-spu) Value(CPU_N10)

 EnumValue
-Enum(nds32_cmodel_type) String(small) Value(CMODEL_SMALL)
+Enum(nds32_cpu_type) String(n1068) Value(CPU_N10)

 EnumValue
-Enum(nds32_cmodel_type) String(medium) Value(CMODEL_MEDIUM)
+Enum(nds32_cpu_type) String(n1068a) Value(CPU_N10)

 EnumValue
-Enum(nds32_cmodel_type) String(large) Value(CMODEL_LARGE)
+Enum(nds32_cpu_type) String(n1068-fpu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1068a-fpu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1068-spu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1068a-spu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(d10) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(d1088) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(d1088-fpu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) String(d1088-spu) Value(CPU_N10)
+
+EnumValue
+Enum(nds32_cpu_type) Undocumented String(graywolf) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(n15) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(d15) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(n15s) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(d15s) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(n15f) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(d15f) Value(CPU_GRAYWOLF)
+
+EnumValue
+Enum(nds32_cpu_type) String(n12) Value(CPU_N12)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1213) Value(CPU_N12)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1233) Value(CPU_N12)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1233-fpu) Value(CPU_N12)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1233-spu) Value(CPU_N12)
+
+EnumValue
+Enum(nds32_cpu_type) String(n13) Value(CPU_N13)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1337) Value(CPU_N13)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1337-fpu) Value(CPU_N13)
+
+EnumValue
+Enum(nds32_cpu_type) String(n1337-spu) Value(CPU_N13)
+
+EnumValue
+Enum(nds32_cpu_type) Undocumented String(panther) Value(CPU_PANTHER)
+
+EnumValue
+Enum(nds32_cpu_type) Undocumented String(simple) Value(CPU_SIMPLE)
+
+mcpu=n15
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mcpu=n15f
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mcpu=n15s
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mcpu=d15
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mcpu=d15s
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mcpu=d15f
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+Alias for multi-lib work.
+
+mgraywolf
+Target RejectNegative Undocumented Alias(mcpu=, graywolf)
+This alias is only for gcc parallel test.
+
+mv3m+
+Target RejectNegative Undocumented Alias(march=, v3m+)
+This alias is only for gcc parallel test.
+
+mmemory-model=
+Target RejectNegative Joined Enum(nds32_memory_model_type) Var(nds32_memory_model_option) Init(MEMORY_MODEL_FAST)
+Specify the memory model, fast or slow memory.
+
+Enum
+Name(nds32_memory_model_type) Type(enum nds32_memory_model_type)
+
+EnumValue
+Enum(nds32_memory_model_type) String(slow) Value(MEMORY_MODEL_SLOW)
+
+EnumValue
+Enum(nds32_memory_model_type) String(fast) Value(MEMORY_MODEL_FAST)
+
+mconfig-fpu=
+Target RejectNegative Joined Enum(float_reg_number) Var(nds32_fp_regnum) Init(TARGET_CONFIG_FPU_DEFAULT)
+Specify an FPU configuration value from 0 to 7; 0-3 are as the FPU spec says, and 4-7 correspond to 0-3.
+
+Enum
+Name(float_reg_number) Type(enum float_reg_number)
+Known floating-point number of registers (for use with the -mconfig-fpu= option):
+
+EnumValue
+Enum(float_reg_number) String(0) Value(NDS32_CONFIG_FPU_0)
+
+EnumValue
+Enum(float_reg_number) String(1) Value(NDS32_CONFIG_FPU_1)
+
+EnumValue
+Enum(float_reg_number) String(2) Value(NDS32_CONFIG_FPU_2)
+
+EnumValue
+Enum(float_reg_number) String(3) Value(NDS32_CONFIG_FPU_3)
+
+EnumValue
+Enum(float_reg_number) String(4) Value(NDS32_CONFIG_FPU_4)
+
+EnumValue
+Enum(float_reg_number) String(5) Value(NDS32_CONFIG_FPU_5)
+
+EnumValue
+Enum(float_reg_number) String(6) Value(NDS32_CONFIG_FPU_6)
+
+EnumValue
+Enum(float_reg_number) String(7) Value(NDS32_CONFIG_FPU_7)
+
+mconfig-mul=
+Target RejectNegative Joined Enum(nds32_mul_type) Var(nds32_mul_config) Init(MUL_TYPE_FAST_1)
+Specify configuration of instruction mul: fast (same as fast1), fast1, fast2 or slow. The default is fast1.
+
+Enum
+Name(nds32_mul_type) Type(enum nds32_mul_type)
+
+EnumValue
+Enum(nds32_mul_type) String(fast) Value(MUL_TYPE_FAST_1)
+
+EnumValue
+Enum(nds32_mul_type) String(fast1) Value(MUL_TYPE_FAST_1)
+
+EnumValue
+Enum(nds32_mul_type) String(fast2) Value(MUL_TYPE_FAST_2)
+
+EnumValue
+Enum(nds32_mul_type) String(slow) Value(MUL_TYPE_SLOW)
+
+mconfig-register-ports=
+Target RejectNegative Joined Enum(nds32_register_ports) Var(nds32_register_ports_config) Init(REG_PORT_3R2W)
+Specify how many read/write ports for n9/n10 cores.  The value should be 3r2w or 2r1w.
+
+Enum
+Name(nds32_register_ports) Type(enum nds32_register_ports)
+
+EnumValue
+Enum(nds32_register_ports) String(3r2w) Value(REG_PORT_3R2W)
+
+EnumValue
+Enum(nds32_register_ports) String(2r1w) Value(REG_PORT_2R1W)
+
+mreorg-out-of-order
+Target Report Var(flag_reorg_out_of_order) Init(0)
+Allow out-of-order reorganization for multiple issue micro-architectures.
+
+mifc
+Target Report Mask(IFC)
+Use special directives to guide linker doing ifc optimization.
+
+mex9
+Target Report Mask(EX9)
+Use special directives to guide linker doing ex9 optimization.
+
+mprint-stall-cycles
+Target Report Mask(PRINT_STALLS)
+Print stall cycles due to structural or data dependencies. It should be used with the option '-S'.
+Note that stall cycles are determined by the compiler's pipeline model and it may not be precise.

 mctor-dtor
 Target Report
 Enable constructor/destructor feature.

+mcrt-arg
+Target Report
+Enable argc/argv passed by simulator.
+
 mrelax
 Target Report
 Guide linker to relax instructions.
+
+minnermost-loop
+Target Report Mask(INNERMOST_LOOP)
+Insert the innermost loop directive.
+
+mext-fpu-fma
+Target Report Mask(EXT_FPU_FMA)
+Generate floating-point multiply-accumulation instructions.
+
+mext-fpu-sp
+Target Report Mask(FPU_SINGLE)
+Generate single-precision floating-point instructions.
+
+mext-fpu-dp
+Target Report Mask(FPU_DOUBLE)
+Generate double-precision floating-point instructions.
+
+mext-zol
+Target Report Mask(HWLOOP)
+Insert the hardware loop directive.
+
+mforce-no-ext-zol
+Target Undocumented Report Mask(FORCE_NO_HWLOOP)
+Force disabling of hardware loops, even if -mext-zol is used.
+
+mforce-no-ext-dsp
+Target Undocumented Report Mask(FORCE_NO_EXT_DSP)
+Force disabling of DSP extension instructions, even if -mext-dsp is used.
+
+mforce-memcpy-zol
+Target Report Var(flag_force_memcpy_zol) Init(0)
+Force enable hardware loop in memcpy function.
+
+msched-prolog-epilog
+Target Var(flag_sched_prolog_epilog) Init(1)
+Permit scheduling of a function's prologue and epilogue sequence.
+
+mret-in-naked-func
+Target Var(flag_ret_in_naked_func) Init(1)
+Generate return instruction in naked function.
+
+malways-save-lp
+Target Var(flag_always_save_lp) Init(0)
+Always save $lp in the stack.
+
+munaligned-access
+Target Report Var(flag_unaligned_access) Init(0)
+Enable unaligned word and halfword accesses to packed data.
+
+; ---------------------------------------------------------------
+; The following options are designed for compatibility issue.
+; Hopefully these obsolete options will be removed one day.
+
+mg
+Target Undocumented Warn(%qs is deprecated and has no effect)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mdx-regs
+Target Undocumented Warn(%qs is deprecated and has no effect)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mexpand-isr
+Target Undocumented Warn(%qs is deprecated and has no effect)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mcrt-cpp=yes
+Target Undocumented Warn(%qs is deprecated and has no effect, use -mctor-dtor instead)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mcrt-exit=yes
+Target Undocumented Warn(%qs is deprecated and has no effect, use -mctor-dtor instead)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+mlib=
+Target RejectNegative Joined Undocumented Warn(%qs is deprecated and has no effect)
+Obsolete option.  Users SHOULD NOT use this option in the command line.
+
+; ---------------------------------------------------------------
+; The following options are designed for Andes ACE support;
+; they are not part of the obsolete compatibility options above.
+
+mace
+Target RejectNegative
+Compile with Andes ACE.
+
+mace-s2s=
+Target Joined RejectNegative
+Argument for pass to Andes's ACE source-to-source translator.
+
+
+; ---------------------------------------------------------------
diff --git a/gcc/config/nds32/nds32_init.inc b/gcc/config/nds32/nds32_init.inc
new file mode 100644
index 0000000..1084ad0
--- /dev/null
+++ b/gcc/config/nds32/nds32_init.inc
@@ -0,0 +1,43 @@
+/*
+ * nds32_init.inc
+ *
+ * NDS32 architecture startup assembler header file.  The nds32_init macro
+ * loads $gp, conditionally sets up $ITB (EX9) and the FPU, then loads $sp.
+ */
+
+.macro nds32_init
+
+	! Initialize GP for data access (load the address of _SDA_BASE_)
+	la      $gp, _SDA_BASE_
+
+#if defined(__NDS32_EXT_EX9__)
+	! Check HW for EX9: test bit 24 of MSC_CFG, skip ITB setup if clear (uses r0-r2)
+	mfsr    $r0, $MSC_CFG
+	li      $r1, (1 << 24)
+	and     $r2, $r0, $r1
+	beqz    $r2, 1f
+
+	! Initialize the table base of EX9 instruction (write _ITB_BASE_ to ITB)
+	la      $r0, _ITB_BASE_
+	mtusr   $r0, $ITB
+1:
+#endif
+
+#if defined(__NDS32_EXT_FPU_DP__) || defined(__NDS32_EXT_FPU_SP__)
+	! Enable FPU: read-modify-write FUCOP_CTL with bit 0 set (uses r0)
+	mfsr    $r0, $FUCOP_CTL
+	ori     $r0, $r0, #0x1
+	mtsr    $r0, $FUCOP_CTL
+	dsb
+
+	! Enable denormalized flush-to-zero mode: set 0x1000 via fmfcsr/fmtcsr
+	fmfcsr  $r0
+	ori     $r0,$r0,#0x1000
+	fmtcsr  $r0
+	dsb
+#endif
+
+	! Initialize default stack pointer from the _stack symbol
+	la      $sp, _stack
+
+.endm
diff --git a/gcc/config/nds32/nds32_intrinsic.h b/gcc/config/nds32/nds32_intrinsic.h
index 3e868dc..fef727b 100644
--- a/gcc/config/nds32/nds32_intrinsic.h
+++ b/gcc/config/nds32/nds32_intrinsic.h
@@ -26,12 +26,1383 @@
 #ifndef _NDS32_INTRINSIC_H
 #define _NDS32_INTRINSIC_H

+typedef signed char int8x4_t __attribute ((vector_size(4)));
+typedef short int16x2_t __attribute ((vector_size(4)));
+typedef int int32x2_t __attribute__((vector_size(8)));
+typedef unsigned char uint8x4_t __attribute__ ((vector_size (4)));
+typedef unsigned short uint16x2_t __attribute__ ((vector_size (4)));
+typedef unsigned int uint32x2_t __attribute__((vector_size(8)));
+
+/* General intrinsic register names.  */
 enum nds32_intrinsic_registers
 {
-  __NDS32_REG_PSW__ = 1024,
+  __NDS32_REG_CPU_VER__ = 1024,
+  __NDS32_REG_ICM_CFG__,
+  __NDS32_REG_DCM_CFG__,
+  __NDS32_REG_MMU_CFG__,
+  __NDS32_REG_MSC_CFG__,
+  __NDS32_REG_MSC_CFG2__,
+  __NDS32_REG_CORE_ID__,
+  __NDS32_REG_FUCOP_EXIST__,
+
+  __NDS32_REG_PSW__,
   __NDS32_REG_IPSW__,
+  __NDS32_REG_P_IPSW__,
+  __NDS32_REG_IVB__,
+  __NDS32_REG_EVA__,
+  __NDS32_REG_P_EVA__,
   __NDS32_REG_ITYPE__,
-  __NDS32_REG_IPC__
+  __NDS32_REG_P_ITYPE__,
+
+  __NDS32_REG_MERR__,
+  __NDS32_REG_IPC__,
+  __NDS32_REG_P_IPC__,
+  __NDS32_REG_OIPC__,
+  __NDS32_REG_P_P0__,
+  __NDS32_REG_P_P1__,
+
+  __NDS32_REG_INT_MASK__,
+  __NDS32_REG_INT_MASK2__,
+  __NDS32_REG_INT_MASK3__,
+  __NDS32_REG_INT_PEND__,
+  __NDS32_REG_INT_PEND2__,
+  __NDS32_REG_INT_PEND3__,
+  __NDS32_REG_SP_USR__,
+  __NDS32_REG_SP_PRIV__,
+  __NDS32_REG_INT_PRI__,
+  __NDS32_REG_INT_PRI2__,
+  __NDS32_REG_INT_PRI3__,
+  __NDS32_REG_INT_PRI4__,
+  __NDS32_REG_INT_CTRL__,
+  __NDS32_REG_INT_TRIGGER__,
+  __NDS32_REG_INT_TRIGGER2__,
+  __NDS32_REG_INT_GPR_PUSH_DIS__,
+
+  __NDS32_REG_MMU_CTL__,
+  __NDS32_REG_L1_PPTB__,
+  __NDS32_REG_TLB_VPN__,
+  __NDS32_REG_TLB_DATA__,
+  __NDS32_REG_TLB_MISC__,
+  __NDS32_REG_VLPT_IDX__,
+  __NDS32_REG_ILMB__,
+  __NDS32_REG_DLMB__,
+
+  __NDS32_REG_CACHE_CTL__,
+  __NDS32_REG_HSMP_SADDR__,
+  __NDS32_REG_HSMP_EADDR__,
+  __NDS32_REG_SDZ_CTL__,
+  __NDS32_REG_N12MISC_CTL__,
+  __NDS32_REG_MISC_CTL__,
+  __NDS32_REG_ECC_MISC__,
+
+  __NDS32_REG_BPC0__,
+  __NDS32_REG_BPC1__,
+  __NDS32_REG_BPC2__,
+  __NDS32_REG_BPC3__,
+  __NDS32_REG_BPC4__,
+  __NDS32_REG_BPC5__,
+  __NDS32_REG_BPC6__,
+  __NDS32_REG_BPC7__,
+
+  __NDS32_REG_BPA0__,
+  __NDS32_REG_BPA1__,
+  __NDS32_REG_BPA2__,
+  __NDS32_REG_BPA3__,
+  __NDS32_REG_BPA4__,
+  __NDS32_REG_BPA5__,
+  __NDS32_REG_BPA6__,
+  __NDS32_REG_BPA7__,
+
+  __NDS32_REG_BPAM0__,
+  __NDS32_REG_BPAM1__,
+  __NDS32_REG_BPAM2__,
+  __NDS32_REG_BPAM3__,
+  __NDS32_REG_BPAM4__,
+  __NDS32_REG_BPAM5__,
+  __NDS32_REG_BPAM6__,
+  __NDS32_REG_BPAM7__,
+
+  __NDS32_REG_BPV0__,
+  __NDS32_REG_BPV1__,
+  __NDS32_REG_BPV2__,
+  __NDS32_REG_BPV3__,
+  __NDS32_REG_BPV4__,
+  __NDS32_REG_BPV5__,
+  __NDS32_REG_BPV6__,
+  __NDS32_REG_BPV7__,
+
+  __NDS32_REG_BPCID0__,
+  __NDS32_REG_BPCID1__,
+  __NDS32_REG_BPCID2__,
+  __NDS32_REG_BPCID3__,
+  __NDS32_REG_BPCID4__,
+  __NDS32_REG_BPCID5__,
+  __NDS32_REG_BPCID6__,
+  __NDS32_REG_BPCID7__,
+
+  __NDS32_REG_EDM_CFG__,
+  __NDS32_REG_EDMSW__,
+  __NDS32_REG_EDM_CTL__,
+  __NDS32_REG_EDM_DTR__,
+  __NDS32_REG_BPMTC__,
+  __NDS32_REG_DIMBR__,
+
+  __NDS32_REG_TECR0__,
+  __NDS32_REG_TECR1__,
+  __NDS32_REG_PFMC0__,
+  __NDS32_REG_PFMC1__,
+  __NDS32_REG_PFMC2__,
+  __NDS32_REG_PFM_CTL__,
+  __NDS32_REG_PFT_CTL__,
+  __NDS32_REG_HSP_CTL__,
+  __NDS32_REG_SP_BOUND__,
+  __NDS32_REG_SP_BOUND_PRIV__,
+  __NDS32_REG_SP_BASE__,
+  __NDS32_REG_SP_BASE_PRIV__,
+  __NDS32_REG_FUCOP_CTL__,
+  __NDS32_REG_PRUSR_ACC_CTL__,
+
+  __NDS32_REG_DMA_CFG__,
+  __NDS32_REG_DMA_GCSW__,
+  __NDS32_REG_DMA_CHNSEL__,
+  __NDS32_REG_DMA_ACT__,
+  __NDS32_REG_DMA_SETUP__,
+  __NDS32_REG_DMA_ISADDR__,
+  __NDS32_REG_DMA_ESADDR__,
+  __NDS32_REG_DMA_TCNT__,
+  __NDS32_REG_DMA_STATUS__,
+  __NDS32_REG_DMA_2DSET__,
+  __NDS32_REG_DMA_2DSCTL__,
+  __NDS32_REG_DMA_RCNT__,
+  __NDS32_REG_DMA_HSTATUS__,
+
+  __NDS32_REG_PC__,
+  __NDS32_REG_SP_USR1__,
+  __NDS32_REG_SP_USR2__,
+  __NDS32_REG_SP_USR3__,
+  __NDS32_REG_SP_PRIV1__,
+  __NDS32_REG_SP_PRIV2__,
+  __NDS32_REG_SP_PRIV3__,
+  __NDS32_REG_BG_REGION__,
+  __NDS32_REG_SFCR__,
+  __NDS32_REG_SIGN__,
+  __NDS32_REG_ISIGN__,
+  __NDS32_REG_P_ISIGN__,
+  __NDS32_REG_IFC_LP__,
+  __NDS32_REG_ITB__
 };

+/* The cctl subtypes for the intrinsics.  */
+enum nds32_cctl_valck
+{
+  __NDS32_CCTL_L1D_VA_FILLCK__,
+  __NDS32_CCTL_L1D_VA_ULCK__,
+  __NDS32_CCTL_L1I_VA_FILLCK__,
+  __NDS32_CCTL_L1I_VA_ULCK__
+};
+
+enum nds32_cctl_idxwbinv
+{
+  __NDS32_CCTL_L1D_IX_WBINVAL__,
+  __NDS32_CCTL_L1D_IX_INVAL__,
+  __NDS32_CCTL_L1D_IX_WB__,
+  __NDS32_CCTL_L1I_IX_INVAL__
+};
+
+enum nds32_cctl_vawbinv
+{
+  __NDS32_CCTL_L1D_VA_INVAL__,
+  __NDS32_CCTL_L1D_VA_WB__,
+  __NDS32_CCTL_L1D_VA_WBINVAL__,
+  __NDS32_CCTL_L1I_VA_INVAL__
+};
+
+enum nds32_cctl_idxread
+{
+  __NDS32_CCTL_L1D_IX_RTAG__,
+  __NDS32_CCTL_L1D_IX_RWD__,
+  __NDS32_CCTL_L1I_IX_RTAG__,
+  __NDS32_CCTL_L1I_IX_RWD__
+};
+
+enum nds32_cctl_idxwrite
+{
+  __NDS32_CCTL_L1D_IX_WTAG__,
+  __NDS32_CCTL_L1D_IX_WWD__,
+  __NDS32_CCTL_L1I_IX_WTAG__,
+  __NDS32_CCTL_L1I_IX_WWD__
+};
+
+enum nds32_dpref
+{
+  __NDS32_DPREF_SRD__,
+  __NDS32_DPREF_MRD__,
+  __NDS32_DPREF_SWR__,
+  __NDS32_DPREF_MWR__,
+  __NDS32_DPREF_PTE__,
+  __NDS32_DPREF_CLWR__
+};
+
+/* ------------------------------------------------------------------------ */
+
+/* Define interrupt number for intrinsic function.  */
+#define NDS32_INT_H0 0
+#define NDS32_INT_H1 1
+#define NDS32_INT_H2 2
+#define NDS32_INT_H3 3
+#define NDS32_INT_H4 4
+#define NDS32_INT_H5 5
+#define NDS32_INT_H6 6
+#define NDS32_INT_H7 7
+#define NDS32_INT_H8 8
+#define NDS32_INT_H9 9
+#define NDS32_INT_H10 10
+#define NDS32_INT_H11 11
+#define NDS32_INT_H12 12
+#define NDS32_INT_H13 13
+#define NDS32_INT_H14 14
+#define NDS32_INT_H15 15
+#define NDS32_INT_H16 16
+#define NDS32_INT_H17 17
+#define NDS32_INT_H18 18
+#define NDS32_INT_H19 19
+#define NDS32_INT_H20 20
+#define NDS32_INT_H21 21
+#define NDS32_INT_H22 22
+#define NDS32_INT_H23 23
+#define NDS32_INT_H24 24
+#define NDS32_INT_H25 25
+#define NDS32_INT_H26 26
+#define NDS32_INT_H27 27
+#define NDS32_INT_H28 28
+#define NDS32_INT_H29 29
+#define NDS32_INT_H30 30
+#define NDS32_INT_H31 31
+#define NDS32_INT_H32 32
+#define NDS32_INT_H33 33
+#define NDS32_INT_H34 34
+#define NDS32_INT_H35 35
+#define NDS32_INT_H36 36
+#define NDS32_INT_H37 37
+#define NDS32_INT_H38 38
+#define NDS32_INT_H39 39
+#define NDS32_INT_H40 40
+#define NDS32_INT_H41 41
+#define NDS32_INT_H42 42
+#define NDS32_INT_H43 43
+#define NDS32_INT_H44 44
+#define NDS32_INT_H45 45
+#define NDS32_INT_H46 46
+#define NDS32_INT_H47 47
+#define NDS32_INT_H48 48
+#define NDS32_INT_H49 49
+#define NDS32_INT_H50 50
+#define NDS32_INT_H51 51
+#define NDS32_INT_H52 52
+#define NDS32_INT_H53 53
+#define NDS32_INT_H54 54
+#define NDS32_INT_H55 55
+#define NDS32_INT_H56 56
+#define NDS32_INT_H57 57
+#define NDS32_INT_H58 58
+#define NDS32_INT_H59 59
+#define NDS32_INT_H60 60
+#define NDS32_INT_H61 61
+#define NDS32_INT_H62 62
+#define NDS32_INT_H63 63
+#define NDS32_INT_SWI 64
+#define NDS32_INT_ALZ 65
+#define NDS32_INT_IDIVZE 66
+#define NDS32_INT_DSSIM 67
+
+/* ------------------------------------------------------------------------ */
+
+/* Define intrinsic register name macro for compatibility.  */
+#define NDS32_SR_CPU_VER               __NDS32_REG_CPU_VER__
+#define NDS32_SR_ICM_CFG               __NDS32_REG_ICM_CFG__
+#define NDS32_SR_DCM_CFG               __NDS32_REG_DCM_CFG__
+#define NDS32_SR_MMU_CFG               __NDS32_REG_MMU_CFG__
+#define NDS32_SR_MSC_CFG               __NDS32_REG_MSC_CFG__
+#define NDS32_SR_MSC_CFG2              __NDS32_REG_MSC_CFG2__
+#define NDS32_SR_CORE_ID               __NDS32_REG_CORE_ID__
+#define NDS32_SR_FUCOP_EXIST           __NDS32_REG_FUCOP_EXIST__
+#define NDS32_SR_PSW                   __NDS32_REG_PSW__
+#define NDS32_SR_IPSW                  __NDS32_REG_IPSW__
+#define NDS32_SR_P_IPSW                __NDS32_REG_P_IPSW__
+#define NDS32_SR_IVB                   __NDS32_REG_IVB__
+#define NDS32_SR_EVA                   __NDS32_REG_EVA__
+#define NDS32_SR_P_EVA                 __NDS32_REG_P_EVA__
+#define NDS32_SR_ITYPE                 __NDS32_REG_ITYPE__
+#define NDS32_SR_P_ITYPE               __NDS32_REG_P_ITYPE__
+#define NDS32_SR_MERR                  __NDS32_REG_MERR__
+#define NDS32_SR_IPC                   __NDS32_REG_IPC__
+#define NDS32_SR_P_IPC                 __NDS32_REG_P_IPC__
+#define NDS32_SR_OIPC                  __NDS32_REG_OIPC__
+#define NDS32_SR_P_P0                  __NDS32_REG_P_P0__
+#define NDS32_SR_P_P1                  __NDS32_REG_P_P1__
+#define NDS32_SR_INT_MASK              __NDS32_REG_INT_MASK__
+#define NDS32_SR_INT_MASK2             __NDS32_REG_INT_MASK2__
+#define NDS32_SR_INT_MASK3             __NDS32_REG_INT_MASK3__
+#define NDS32_SR_INT_PEND              __NDS32_REG_INT_PEND__
+#define NDS32_SR_INT_PEND2             __NDS32_REG_INT_PEND2__
+#define NDS32_SR_INT_PEND3             __NDS32_REG_INT_PEND3__
+#define NDS32_SR_SP_USR                __NDS32_REG_SP_USR__
+#define NDS32_SR_SP_PRIV               __NDS32_REG_SP_PRIV__
+#define NDS32_SR_INT_PRI               __NDS32_REG_INT_PRI__
+#define NDS32_SR_INT_PRI2              __NDS32_REG_INT_PRI2__
+#define NDS32_SR_INT_PRI3              __NDS32_REG_INT_PRI3__
+#define NDS32_SR_INT_PRI4              __NDS32_REG_INT_PRI4__
+#define NDS32_SR_INT_CTRL              __NDS32_REG_INT_CTRL__
+#define NDS32_SR_INT_TRIGGER           __NDS32_REG_INT_TRIGGER__
+#define NDS32_SR_INT_TRIGGER2          __NDS32_REG_INT_TRIGGER2__
+#define NDS32_SR_INT_GPR_PUSH_DIS      __NDS32_REG_INT_GPR_PUSH_DIS__
+#define NDS32_SR_MMU_CTL               __NDS32_REG_MMU_CTL__
+#define NDS32_SR_L1_PPTB               __NDS32_REG_L1_PPTB__
+#define NDS32_SR_TLB_VPN               __NDS32_REG_TLB_VPN__
+#define NDS32_SR_TLB_DATA              __NDS32_REG_TLB_DATA__
+#define NDS32_SR_TLB_MISC              __NDS32_REG_TLB_MISC__
+#define NDS32_SR_VLPT_IDX              __NDS32_REG_VLPT_IDX__
+#define NDS32_SR_ILMB                  __NDS32_REG_ILMB__
+#define NDS32_SR_DLMB                  __NDS32_REG_DLMB__
+#define NDS32_SR_CACHE_CTL             __NDS32_REG_CACHE_CTL__
+#define NDS32_SR_HSMP_SADDR            __NDS32_REG_HSMP_SADDR__
+#define NDS32_SR_HSMP_EADDR            __NDS32_REG_HSMP_EADDR__
+#define NDS32_SR_SDZ_CTL               __NDS32_REG_SDZ_CTL__
+#define NDS32_SR_N12MISC_CTL           __NDS32_REG_N12MISC_CTL__
+#define NDS32_SR_MISC_CTL              __NDS32_REG_MISC_CTL__
+#define NDS32_SR_ECC_MISC              __NDS32_REG_ECC_MISC__
+#define NDS32_SR_BPC0                  __NDS32_REG_BPC0__
+#define NDS32_SR_BPC1                  __NDS32_REG_BPC1__
+#define NDS32_SR_BPC2                  __NDS32_REG_BPC2__
+#define NDS32_SR_BPC3                  __NDS32_REG_BPC3__
+#define NDS32_SR_BPC4                  __NDS32_REG_BPC4__
+#define NDS32_SR_BPC5                  __NDS32_REG_BPC5__
+#define NDS32_SR_BPC6                  __NDS32_REG_BPC6__
+#define NDS32_SR_BPC7                  __NDS32_REG_BPC7__
+#define NDS32_SR_BPA0                  __NDS32_REG_BPA0__
+#define NDS32_SR_BPA1                  __NDS32_REG_BPA1__
+#define NDS32_SR_BPA2                  __NDS32_REG_BPA2__
+#define NDS32_SR_BPA3                  __NDS32_REG_BPA3__
+#define NDS32_SR_BPA4                  __NDS32_REG_BPA4__
+#define NDS32_SR_BPA5                  __NDS32_REG_BPA5__
+#define NDS32_SR_BPA6                  __NDS32_REG_BPA6__
+#define NDS32_SR_BPA7                  __NDS32_REG_BPA7__
+#define NDS32_SR_BPAM0                 __NDS32_REG_BPAM0__
+#define NDS32_SR_BPAM1                 __NDS32_REG_BPAM1__
+#define NDS32_SR_BPAM2                 __NDS32_REG_BPAM2__
+#define NDS32_SR_BPAM3                 __NDS32_REG_BPAM3__
+#define NDS32_SR_BPAM4                 __NDS32_REG_BPAM4__
+#define NDS32_SR_BPAM5                 __NDS32_REG_BPAM5__
+#define NDS32_SR_BPAM6                 __NDS32_REG_BPAM6__
+#define NDS32_SR_BPAM7                 __NDS32_REG_BPAM7__
+#define NDS32_SR_BPV0                  __NDS32_REG_BPV0__
+#define NDS32_SR_BPV1                  __NDS32_REG_BPV1__
+#define NDS32_SR_BPV2                  __NDS32_REG_BPV2__
+#define NDS32_SR_BPV3                  __NDS32_REG_BPV3__
+#define NDS32_SR_BPV4                  __NDS32_REG_BPV4__
+#define NDS32_SR_BPV5                  __NDS32_REG_BPV5__
+#define NDS32_SR_BPV6                  __NDS32_REG_BPV6__
+#define NDS32_SR_BPV7                  __NDS32_REG_BPV7__
+#define NDS32_SR_BPCID0                __NDS32_REG_BPCID0__
+#define NDS32_SR_BPCID1                __NDS32_REG_BPCID1__
+#define NDS32_SR_BPCID2                __NDS32_REG_BPCID2__
+#define NDS32_SR_BPCID3                __NDS32_REG_BPCID3__
+#define NDS32_SR_BPCID4                __NDS32_REG_BPCID4__
+#define NDS32_SR_BPCID5                __NDS32_REG_BPCID5__
+#define NDS32_SR_BPCID6                __NDS32_REG_BPCID6__
+#define NDS32_SR_BPCID7                __NDS32_REG_BPCID7__
+#define NDS32_SR_EDM_CFG               __NDS32_REG_EDM_CFG__
+#define NDS32_SR_EDMSW                 __NDS32_REG_EDMSW__
+#define NDS32_SR_EDM_CTL               __NDS32_REG_EDM_CTL__
+#define NDS32_SR_EDM_DTR               __NDS32_REG_EDM_DTR__
+#define NDS32_SR_BPMTC                 __NDS32_REG_BPMTC__
+#define NDS32_SR_DIMBR                 __NDS32_REG_DIMBR__
+#define NDS32_SR_TECR0                 __NDS32_REG_TECR0__
+#define NDS32_SR_TECR1                 __NDS32_REG_TECR1__
+#define NDS32_SR_PFMC0                 __NDS32_REG_PFMC0__
+#define NDS32_SR_PFMC1                 __NDS32_REG_PFMC1__
+#define NDS32_SR_PFMC2                 __NDS32_REG_PFMC2__
+#define NDS32_SR_PFM_CTL               __NDS32_REG_PFM_CTL__
+#define NDS32_SR_HSP_CTL               __NDS32_REG_HSP_CTL__
+#define NDS32_SR_SP_BOUND              __NDS32_REG_SP_BOUND__
+#define NDS32_SR_SP_BOUND_PRIV         __NDS32_REG_SP_BOUND_PRIV__
+#define NDS32_SR_SP_BASE               __NDS32_REG_SP_BASE__
+#define NDS32_SR_SP_BASE_PRIV          __NDS32_REG_SP_BASE_PRIV__
+#define NDS32_SR_FUCOP_CTL             __NDS32_REG_FUCOP_CTL__
+#define NDS32_SR_PRUSR_ACC_CTL         __NDS32_REG_PRUSR_ACC_CTL__
+#define NDS32_SR_DMA_CFG               __NDS32_REG_DMA_CFG__
+#define NDS32_SR_DMA_GCSW              __NDS32_REG_DMA_GCSW__
+#define NDS32_SR_DMA_CHNSEL            __NDS32_REG_DMA_CHNSEL__
+#define NDS32_SR_DMA_ACT               __NDS32_REG_DMA_ACT__
+#define NDS32_SR_DMA_SETUP             __NDS32_REG_DMA_SETUP__
+#define NDS32_SR_DMA_ISADDR            __NDS32_REG_DMA_ISADDR__
+#define NDS32_SR_DMA_ESADDR            __NDS32_REG_DMA_ESADDR__
+#define NDS32_SR_DMA_TCNT              __NDS32_REG_DMA_TCNT__
+#define NDS32_SR_DMA_STATUS            __NDS32_REG_DMA_STATUS__
+#define NDS32_SR_DMA_2DSET             __NDS32_REG_DMA_2DSET__
+#define NDS32_SR_DMA_2DSCTL            __NDS32_REG_DMA_2DSCTL__
+#define NDS32_SR_DMA_RCNT              __NDS32_REG_DMA_RCNT__
+#define NDS32_SR_DMA_HSTATUS           __NDS32_REG_DMA_HSTATUS__
+#define NDS32_SR_SP_USR1               __NDS32_REG_SP_USR1__
+#define NDS32_SR_SP_USR2               __NDS32_REG_SP_USR2__
+#define NDS32_SR_SP_USR3               __NDS32_REG_SP_USR3__
+#define NDS32_SR_SP_PRIV1              __NDS32_REG_SP_PRIV1__
+#define NDS32_SR_SP_PRIV2              __NDS32_REG_SP_PRIV2__
+#define NDS32_SR_SP_PRIV3              __NDS32_REG_SP_PRIV3__
+#define NDS32_SR_BG_REGION             __NDS32_REG_BG_REGION__
+#define NDS32_SR_SFCR                  __NDS32_REG_SFCR__
+#define NDS32_SR_SIGN                  __NDS32_REG_SIGN__
+#define NDS32_SR_ISIGN                 __NDS32_REG_ISIGN__
+#define NDS32_SR_P_ISIGN               __NDS32_REG_P_ISIGN__
+
+#define NDS32_USR_PC                    __NDS32_REG_PC__
+#define NDS32_USR_DMA_CFG               __NDS32_REG_DMA_CFG__
+#define NDS32_USR_DMA_GCSW              __NDS32_REG_DMA_GCSW__
+#define NDS32_USR_DMA_CHNSEL            __NDS32_REG_DMA_CHNSEL__
+#define NDS32_USR_DMA_ACT               __NDS32_REG_DMA_ACT__
+#define NDS32_USR_DMA_SETUP             __NDS32_REG_DMA_SETUP__
+#define NDS32_USR_DMA_ISADDR            __NDS32_REG_DMA_ISADDR__
+#define NDS32_USR_DMA_ESADDR            __NDS32_REG_DMA_ESADDR__
+#define NDS32_USR_DMA_TCNT              __NDS32_REG_DMA_TCNT__
+#define NDS32_USR_DMA_STATUS            __NDS32_REG_DMA_STATUS__
+#define NDS32_USR_DMA_2DSET             __NDS32_REG_DMA_2DSET__
+#define NDS32_USR_DMA_2DSCTL            __NDS32_REG_DMA_2DSCTL__
+#define NDS32_USR_PFMC0                 __NDS32_REG_PFMC0__
+#define NDS32_USR_PFMC1                 __NDS32_REG_PFMC1__
+#define NDS32_USR_PFMC2                 __NDS32_REG_PFMC2__
+#define NDS32_USR_PFM_CTL               __NDS32_REG_PFM_CTL__
+#define NDS32_USR_IFC_LP                __NDS32_REG_IFC_LP__
+#define NDS32_USR_ITB                   __NDS32_REG_ITB__
+
+#define NDS32_CCTL_L1D_VA_FILLCK        __NDS32_CCTL_L1D_VA_FILLCK__
+#define NDS32_CCTL_L1D_VA_ULCK          __NDS32_CCTL_L1D_VA_ULCK__
+#define NDS32_CCTL_L1I_VA_FILLCK        __NDS32_CCTL_L1I_VA_FILLCK__
+#define NDS32_CCTL_L1I_VA_ULCK          __NDS32_CCTL_L1I_VA_ULCK__
+
+#define NDS32_CCTL_L1D_IX_WBINVAL       __NDS32_CCTL_L1D_IX_WBINVAL__
+#define NDS32_CCTL_L1D_IX_INVAL         __NDS32_CCTL_L1D_IX_INVAL__
+#define NDS32_CCTL_L1D_IX_WB            __NDS32_CCTL_L1D_IX_WB__
+#define NDS32_CCTL_L1I_IX_INVAL         __NDS32_CCTL_L1I_IX_INVAL__
+
+#define NDS32_CCTL_L1D_VA_INVAL         __NDS32_CCTL_L1D_VA_INVAL__
+#define NDS32_CCTL_L1D_VA_WB            __NDS32_CCTL_L1D_VA_WB__
+#define NDS32_CCTL_L1D_VA_WBINVAL       __NDS32_CCTL_L1D_VA_WBINVAL__
+#define NDS32_CCTL_L1I_VA_INVAL         __NDS32_CCTL_L1I_VA_INVAL__
+
+#define NDS32_CCTL_L1D_IX_RTAG          __NDS32_CCTL_L1D_IX_RTAG__
+#define NDS32_CCTL_L1D_IX_RWD           __NDS32_CCTL_L1D_IX_RWD__
+#define NDS32_CCTL_L1I_IX_RTAG          __NDS32_CCTL_L1I_IX_RTAG__
+#define NDS32_CCTL_L1I_IX_RWD           __NDS32_CCTL_L1I_IX_RWD__
+
+#define NDS32_CCTL_L1D_IX_WTAG          __NDS32_CCTL_L1D_IX_WTAG__
+#define NDS32_CCTL_L1D_IX_WWD           __NDS32_CCTL_L1D_IX_WWD__
+#define NDS32_CCTL_L1I_IX_WTAG          __NDS32_CCTL_L1I_IX_WTAG__
+#define NDS32_CCTL_L1I_IX_WWD           __NDS32_CCTL_L1I_IX_WWD__
+
+#define NDS32_DPREF_SRD                 __NDS32_DPREF_SRD__
+#define NDS32_DPREF_MRD                 __NDS32_DPREF_MRD__
+#define NDS32_DPREF_SWR                 __NDS32_DPREF_SWR__
+#define NDS32_DPREF_MWR                 __NDS32_DPREF_MWR__
+#define NDS32_DPREF_PTE                 __NDS32_DPREF_PTE__
+#define NDS32_DPREF_CLWR                __NDS32_DPREF_CLWR__
+
+/* ------------------------------------------------------------------------ */
+
+/* Define user friendly macro.  */
+#define SIGNATURE_BEGIN	__nds32__signature_begin ()
+#define SIGNATURE_END	__nds32__signature_end ()
+
+/* Map __nds32__xxx() to __builtin_xxx() functions for compatibility.  */
+#define __nds32__llw(a) \
+  (__builtin_nds32_llw ((a)))
+#define __nds32__lwup(a) \
+  (__builtin_nds32_lwup ((a)))
+#define __nds32__lbup(a) \
+  (__builtin_nds32_lbup ((a)))
+#define __nds32__scw(a, b) \
+  (__builtin_nds32_scw ((a), (b)))
+#define __nds32__swup(a, b) \
+  (__builtin_nds32_swup ((a), (b)))
+#define __nds32__sbup(a, b) \
+  (__builtin_nds32_sbup ((a), (b)))
+
+#define __nds32__mfsr(srname) \
+  (__builtin_nds32_mfsr ((srname)))
+#define __nds32__mfusr(usrname) \
+  (__builtin_nds32_mfusr ((usrname)))
+#define __nds32__mtsr(val, srname) \
+  (__builtin_nds32_mtsr ((val), (srname)))
+#define __nds32__mtsr_isb(val, srname) \
+  (__builtin_nds32_mtsr_isb ((val), (srname)))
+#define __nds32__mtsr_dsb(val, srname) \
+  (__builtin_nds32_mtsr_dsb ((val), (srname)))
+#define __nds32__mtusr(val, usrname) \
+  (__builtin_nds32_mtusr ((val), (usrname)))
+
+#define __nds32__break(swid) \
+  (__builtin_nds32_break(swid))
+#define __nds32__cctlva_lck(subtype, va) \
+  (__builtin_nds32_cctl_va_lck ((subtype), (va)))
+#define __nds32__cctlidx_wbinval(subtype, idx) \
+  (__builtin_nds32_cctl_idx_wbinval ((subtype), (idx)))
+#define __nds32__cctlva_wbinval_alvl(subtype, va) \
+  (__builtin_nds32_cctl_va_wbinval_la ((subtype), (va)))
+#define __nds32__cctlva_wbinval_one_lvl(subtype, va) \
+  (__builtin_nds32_cctl_va_wbinval_l1 ((subtype), (va)))
+#define __nds32__cctlidx_read(subtype, idx) \
+  (__builtin_nds32_cctl_idx_read ((subtype), (idx)))
+#define __nds32__cctlidx_write(subtype, b, idxw) \
+  (__builtin_nds32_cctl_idx_write ((subtype), (b), (idxw)))
+#define __nds32__cctl_l1d_invalall()  \
+  (__builtin_nds32_cctl_l1d_invalall())
+#define __nds32__cctl_l1d_wball_alvl() \
+  (__builtin_nds32_cctl_l1d_wball_alvl())
+#define __nds32__cctl_l1d_wball_one_lvl() \
+  (__builtin_nds32_cctl_l1d_wball_one_lvl())
+
+#define __nds32__dsb() \
+  (__builtin_nds32_dsb())
+#define __nds32__isb() \
+  (__builtin_nds32_isb())
+#define __nds32__msync_store() \
+  (__builtin_nds32_msync_store())
+#define __nds32__msync_all() \
+  (__builtin_nds32_msync_all())
+#define __nds32__nop() \
+  (__builtin_nds32_nop())
+
+#define __nds32__standby_wait_done() \
+  (__builtin_nds32_standby_wait_done())
+#define __nds32__standby_no_wake_grant() \
+  (__builtin_nds32_standby_no_wake_grant())
+#define __nds32__standby_wake_grant() \
+  (__builtin_nds32_standby_wake_grant())
+#define __nds32__schedule_barrier() \
+  (__builtin_nds32_schedule_barrier())
+#define __nds32__setend_big() \
+  (__builtin_nds32_setend_big())
+#define __nds32__setend_little() \
+  (__builtin_nds32_setend_little())
+#define __nds32__setgie_en() \
+  (__builtin_nds32_setgie_en())
+#define __nds32__setgie_dis() \
+  (__builtin_nds32_setgie_dis())
+
+#define __nds32__jr_itoff(a) \
+  (__builtin_nds32_jr_itoff ((a)))
+#define __nds32__jr_toff(a) \
+  (__builtin_nds32_jr_toff ((a)))
+#define __nds32__jral_iton(a) \
+  (__builtin_nds32_jral_iton ((a)))
+#define __nds32__jral_ton(a) \
+  (__builtin_nds32_jral_ton ((a)))
+#define __nds32__ret_itoff(a) \
+  (__builtin_nds32_ret_itoff ((a)))
+#define __nds32__ret_toff(a) \
+  (__builtin_nds32_ret_toff ((a)))
+#define __nds32__svs(a, b) \
+  (__builtin_nds32_svs ((a), (b)))
+#define __nds32__sva(a, b) \
+  (__builtin_nds32_sva ((a), (b)))
+#define __nds32__dpref_qw(a, b, subtype) \
+  (__builtin_nds32_dpref_qw ((a), (b), (subtype)))
+#define __nds32__dpref_hw(a, b, subtype) \
+  (__builtin_nds32_dpref_hw ((a), (b), (subtype)))
+#define __nds32__dpref_w(a, b, subtype) \
+  (__builtin_nds32_dpref_w ((a), (b), (subtype)))
+#define __nds32__dpref_dw(a, b, subtype) \
+  (__builtin_nds32_dpref_dw ((a), (b), (subtype)))
+
+#define __nds32__teqz(a, swid) \
+  (__builtin_nds32_teqz ((a), (swid)))
+#define __nds32__tnez(a, swid) \
+  ( __builtin_nds32_tnez ((a), (swid)))
+#define __nds32__trap(swid) \
+  (__builtin_nds32_trap ((swid)))
+#define __nds32__isync(a) \
+  (__builtin_nds32_isync ((a)))
+#define __nds32__rotr(val, ror) \
+  (__builtin_nds32_rotr ((val), (ror)))
+#define __nds32__wsbh(a) \
+  (__builtin_nds32_wsbh ((a)))
+#define __nds32__syscall(a) \
+  (__builtin_nds32_syscall ((a)))
+#define __nds32__return_address() \
+  (__builtin_nds32_return_address())
+#define __nds32__get_current_sp() \
+  (__builtin_nds32_get_current_sp())
+#define __nds32__set_current_sp(a) \
+  (__builtin_nds32_set_current_sp ((a)))
+#define __nds32__abs(a) \
+  (__builtin_nds32_pe_abs ((a)))
+#define __nds32__ave(a, b) \
+  (__builtin_nds32_pe_ave ((a), (b)))
+#define __nds32__bclr(a, pos) \
+  (__builtin_nds32_pe_bclr ((a), (pos)))
+#define __nds32__bset(a, pos) \
+  (__builtin_nds32_pe_bset ((a), (pos)))
+#define __nds32__btgl(a, pos) \
+  (__builtin_nds32_pe_btgl ((a), (pos)))
+#define __nds32__btst(a, pos) \
+  (__builtin_nds32_pe_btst ((a), (pos)))
+
+#define __nds32__clip(a, imm) \
+  (__builtin_nds32_pe_clip ((a), (imm)))
+#define __nds32__clips(a, imm) \
+  (__builtin_nds32_pe_clips ((a), (imm)))
+#define __nds32__clz(a) \
+  (__builtin_nds32_pe_clz ((a)))
+#define __nds32__clo(a) \
+  (__builtin_nds32_pe_clo ((a)))
+#define __nds32__bse(r, a, b) \
+  (__builtin_nds32_pe2_bse ((r), (a), (b)))
+#define __nds32__bsp(r, a, b) \
+  (__builtin_nds32_pe2_bsp ((r), (a), (b)))
+#define __nds32__pbsad(a, b) \
+  (__builtin_nds32_pe2_pbsad ((a), (b)))
+#define __nds32__pbsada(acc, a, b) \
+  (__builtin_nds32_pe2_pbsada ((acc), (a), (b)))
+
+#define __nds32__ffb(a, b) \
+  (__builtin_nds32_se_ffb ((a), (b)))
+#define __nds32__ffmism(a, b) \
+  (__builtin_nds32_se_ffmism ((a), (b)))
+#define __nds32__flmism(a, b) \
+  (__builtin_nds32_se_flmism ((a), (b)))
+#define __nds32__fcpynsd(a, b) \
+  (__builtin_nds32_fcpynsd ((a), (b)))
+#define __nds32__fcpynss(a, b) \
+  (__builtin_nds32_fcpynss ((a), (b)))
+#define __nds32__fcpysd(a, b) \
+  (__builtin_nds32_fcpysd ((a), (b)))
+#define __nds32__fcpyss(a, b) \
+  (__builtin_nds32_fcpyss ((a), (b)))
+#define __nds32__fmfcsr() \
+  (__builtin_nds32_fmfcsr())
+#define __nds32__fmtcsr(fpcsr) \
+  (__builtin_nds32_fmtcsr ((fpcsr)))
+#define __nds32__fmfcfg() \
+  (__builtin_nds32_fmfcfg())
+
+#define __nds32__tlbop_trd(a) \
+  (__builtin_nds32_tlbop_trd ((a)))
+#define __nds32__tlbop_twr(a) \
+  (__builtin_nds32_tlbop_twr ((a)))
+#define __nds32__tlbop_rwr(a) \
+  (__builtin_nds32_tlbop_rwr ((a)))
+#define __nds32__tlbop_rwlk(a) \
+  (__builtin_nds32_tlbop_rwlk ((a)))
+#define __nds32__tlbop_unlk(a) \
+  (__builtin_nds32_tlbop_unlk ((a)))
+#define __nds32__tlbop_pb(a) \
+  (__builtin_nds32_tlbop_pb ((a)))
+#define __nds32__tlbop_inv(a) \
+  (__builtin_nds32_tlbop_inv ((a)))
+#define __nds32__tlbop_flua() \
+(__builtin_nds32_tlbop_flua())
+
+#define __nds32__kaddw(a, b) \
+  (__builtin_nds32_kaddw ((a), (b)))
+#define __nds32__kaddh(a, b) \
+  (__builtin_nds32_kaddh ((a), (b)))
+#define __nds32__ksubw(a, b) \
+  (__builtin_nds32_ksubw ((a), (b)))
+#define __nds32__ksubh(a, b) \
+  (__builtin_nds32_ksubh ((a), (b)))
+#define __nds32__kdmbb(a, b) \
+  (__builtin_nds32_kdmbb ((a), (b)))
+#define __nds32__v_kdmbb(a, b) \
+  (__builtin_nds32_v_kdmbb ((a), (b)))
+#define __nds32__kdmbt(a, b) \
+  (__builtin_nds32_kdmbt ((a), (b)))
+#define __nds32__v_kdmbt(a, b) \
+  (__builtin_nds32_v_kdmbt ((a), (b)))
+#define __nds32__kdmtb(a, b) \
+  (__builtin_nds32_kdmtb ((a), (b)))
+#define __nds32__v_kdmtb(a, b) \
+  (__builtin_nds32_v_kdmtb ((a), (b)))
+#define __nds32__kdmtt(a, b) \
+  (__builtin_nds32_kdmtt ((a), (b)))
+#define __nds32__v_kdmtt(a, b) \
+  (__builtin_nds32_v_kdmtt ((a), (b)))
+#define __nds32__khmbb(a, b) \
+  (__builtin_nds32_khmbb ((a), (b)))
+#define __nds32__v_khmbb(a, b) \
+  (__builtin_nds32_v_khmbb ((a), (b)))
+#define __nds32__khmbt(a, b) \
+  (__builtin_nds32_khmbt ((a), (b)))
+#define __nds32__v_khmbt(a, b) \
+  (__builtin_nds32_v_khmbt ((a), (b)))
+#define __nds32__khmtb(a, b) \
+  (__builtin_nds32_khmtb ((a), (b)))
+#define __nds32__v_khmtb(a, b) \
+  (__builtin_nds32_v_khmtb ((a), (b)))
+#define __nds32__khmtt(a, b) \
+  (__builtin_nds32_khmtt ((a), (b)))
+#define __nds32__v_khmtt(a, b) \
+  (__builtin_nds32_v_khmtt ((a), (b)))
+#define __nds32__kslraw(a, b) \
+  (__builtin_nds32_kslraw ((a), (b)))
+#define __nds32__kslraw_u(a, b) \
+  (__builtin_nds32_kslraw_u ((a), (b)))
+
+#define __nds32__rdov() \
+  (__builtin_nds32_rdov())
+#define __nds32__clrov() \
+  (__builtin_nds32_clrov())
+#define __nds32__gie_dis() \
+  (__builtin_nds32_gie_dis())
+#define __nds32__gie_en() \
+  (__builtin_nds32_gie_en())
+#define __nds32__enable_int(a) \
+  (__builtin_nds32_enable_int ((a)))
+#define __nds32__disable_int(a) \
+  (__builtin_nds32_disable_int ((a)))
+#define __nds32__set_pending_swint() \
+  (__builtin_nds32_set_pending_swint())
+#define __nds32__clr_pending_swint() \
+  (__builtin_nds32_clr_pending_swint())
+#define __nds32__clr_pending_hwint(a) \
+  (__builtin_nds32_clr_pending_hwint(a))
+#define __nds32__get_all_pending_int() \
+  (__builtin_nds32_get_all_pending_int())
+#define __nds32__get_pending_int(a) \
+  (__builtin_nds32_get_pending_int ((a)))
+#define __nds32__set_int_priority(a, b) \
+  (__builtin_nds32_set_int_priority ((a), (b)))
+#define __nds32__get_int_priority(a) \
+  (__builtin_nds32_get_int_priority ((a)))
+#define __nds32__set_trig_type_level(a) \
+  (__builtin_nds32_set_trig_level(a))
+#define __nds32__set_trig_type_edge(a) \
+  (__builtin_nds32_set_trig_edge(a))
+#define __nds32__get_trig_type(a) \
+  (__builtin_nds32_get_trig_type ((a)))
+
+#define __nds32__get_unaligned_hw(a) \
+  (__builtin_nds32_unaligned_load_hw ((a)))
+#define __nds32__get_unaligned_w(a) \
+  (__builtin_nds32_unaligned_load_w ((a)))
+#define __nds32__get_unaligned_dw(a) \
+  (__builtin_nds32_unaligned_load_dw ((a)))
+#define __nds32__put_unaligned_hw(a, data) \
+  (__builtin_nds32_unaligned_store_hw ((a), (data)))
+#define __nds32__put_unaligned_w(a, data) \
+  (__builtin_nds32_unaligned_store_w ((a), (data)))
+#define __nds32__put_unaligned_dw(a, data) \
+  (__builtin_nds32_unaligned_store_dw ((a), (data)))
+
+#define __nds32__signature_begin() \
+  (__builtin_nds32_signature_begin ())
+#define __nds32__signature_end() \
+  (__builtin_nds32_signature_end ())
+
+#define __nds32__add16(a, b) \
+  (__builtin_nds32_add16 ((a), (b)))
+#define __nds32__v_uadd16(a, b) \
+  (__builtin_nds32_v_uadd16 ((a), (b)))
+#define __nds32__v_sadd16(a, b) \
+  (__builtin_nds32_v_sadd16 ((a), (b)))
+#define __nds32__radd16(a, b) \
+  (__builtin_nds32_radd16 ((a), (b)))
+#define __nds32__v_radd16(a, b) \
+  (__builtin_nds32_v_radd16 ((a), (b)))
+#define __nds32__uradd16(a, b) \
+  (__builtin_nds32_uradd16 ((a), (b)))
+#define __nds32__v_uradd16(a, b) \
+  (__builtin_nds32_v_uradd16 ((a), (b)))
+#define __nds32__kadd16(a, b) \
+  (__builtin_nds32_kadd16 ((a), (b)))
+#define __nds32__v_kadd16(a, b) \
+  (__builtin_nds32_v_kadd16 ((a), (b)))
+#define __nds32__ukadd16(a, b) \
+  (__builtin_nds32_ukadd16 ((a), (b)))
+#define __nds32__v_ukadd16(a, b) \
+  (__builtin_nds32_v_ukadd16 ((a), (b)))
+#define __nds32__sub16(a, b) \
+  (__builtin_nds32_sub16 ((a), (b)))
+#define __nds32__v_usub16(a, b) \
+  (__builtin_nds32_v_usub16 ((a), (b)))
+#define __nds32__v_ssub16(a, b) \
+  (__builtin_nds32_v_ssub16 ((a), (b)))
+#define __nds32__rsub16(a, b) \
+  (__builtin_nds32_rsub16 ((a), (b)))
+#define __nds32__v_rsub16(a, b) \
+  (__builtin_nds32_v_rsub16 ((a), (b)))
+#define __nds32__ursub16(a, b) \
+  (__builtin_nds32_ursub16 ((a), (b)))
+#define __nds32__v_ursub16(a, b) \
+  (__builtin_nds32_v_ursub16 ((a), (b)))
+#define __nds32__ksub16(a, b) \
+  (__builtin_nds32_ksub16 ((a), (b)))
+#define __nds32__v_ksub16(a, b) \
+  (__builtin_nds32_v_ksub16 ((a), (b)))
+#define __nds32__uksub16(a, b) \
+  (__builtin_nds32_uksub16 ((a), (b)))
+#define __nds32__v_uksub16(a, b) \
+  (__builtin_nds32_v_uksub16 ((a), (b)))
+#define __nds32__cras16(a, b) \
+  (__builtin_nds32_cras16 ((a), (b)))
+#define __nds32__v_ucras16(a, b) \
+  (__builtin_nds32_v_ucras16 ((a), (b)))
+#define __nds32__v_scras16(a, b) \
+  (__builtin_nds32_v_scras16 ((a), (b)))
+#define __nds32__rcras16(a, b) \
+  (__builtin_nds32_rcras16 ((a), (b)))
+#define __nds32__v_rcras16(a, b) \
+  (__builtin_nds32_v_rcras16 ((a), (b)))
+#define __nds32__urcras16(a, b) \
+  (__builtin_nds32_urcras16 ((a), (b)))
+#define __nds32__v_urcras16(a, b) \
+  (__builtin_nds32_v_urcras16 ((a), (b)))
+#define __nds32__kcras16(a, b) \
+  (__builtin_nds32_kcras16 ((a), (b)))
+#define __nds32__v_kcras16(a, b) \
+  (__builtin_nds32_v_kcras16 ((a), (b)))
+#define __nds32__ukcras16(a, b) \
+  (__builtin_nds32_ukcras16 ((a), (b)))
+#define __nds32__v_ukcras16(a, b) \
+  (__builtin_nds32_v_ukcras16 ((a), (b)))
+#define __nds32__crsa16(a, b) \
+  (__builtin_nds32_crsa16 ((a), (b)))
+#define __nds32__v_ucrsa16(a, b) \
+  (__builtin_nds32_v_ucrsa16 ((a), (b)))
+#define __nds32__v_scrsa16(a, b) \
+  (__builtin_nds32_v_scrsa16 ((a), (b)))
+#define __nds32__rcrsa16(a, b) \
+  (__builtin_nds32_rcrsa16 ((a), (b)))
+#define __nds32__v_rcrsa16(a, b) \
+  (__builtin_nds32_v_rcrsa16 ((a), (b)))
+#define __nds32__urcrsa16(a, b) \
+  (__builtin_nds32_urcrsa16 ((a), (b)))
+#define __nds32__v_urcrsa16(a, b) \
+  (__builtin_nds32_v_urcrsa16 ((a), (b)))
+#define __nds32__kcrsa16(a, b) \
+  (__builtin_nds32_kcrsa16 ((a), (b)))
+#define __nds32__v_kcrsa16(a, b) \
+  (__builtin_nds32_v_kcrsa16 ((a), (b)))
+#define __nds32__ukcrsa16(a, b) \
+  (__builtin_nds32_ukcrsa16 ((a), (b)))
+#define __nds32__v_ukcrsa16(a, b) \
+  (__builtin_nds32_v_ukcrsa16 ((a), (b)))
+/* add8/sub8 family (plain and v_ forms): wrappers over same-named builtins.  */
+#define __nds32__add8(a, b) \
+  (__builtin_nds32_add8 ((a), (b)))
+#define __nds32__v_uadd8(a, b) \
+  (__builtin_nds32_v_uadd8 ((a), (b)))
+#define __nds32__v_sadd8(a, b) \
+  (__builtin_nds32_v_sadd8 ((a), (b)))
+#define __nds32__radd8(a, b) \
+  (__builtin_nds32_radd8 ((a), (b)))
+#define __nds32__v_radd8(a, b) \
+  (__builtin_nds32_v_radd8 ((a), (b)))
+#define __nds32__uradd8(a, b) \
+  (__builtin_nds32_uradd8 ((a), (b)))
+#define __nds32__v_uradd8(a, b) \
+  (__builtin_nds32_v_uradd8 ((a), (b)))
+#define __nds32__kadd8(a, b) \
+  (__builtin_nds32_kadd8 ((a), (b)))
+#define __nds32__v_kadd8(a, b) \
+  (__builtin_nds32_v_kadd8 ((a), (b)))
+#define __nds32__ukadd8(a, b) \
+  (__builtin_nds32_ukadd8 ((a), (b)))
+#define __nds32__v_ukadd8(a, b) \
+  (__builtin_nds32_v_ukadd8 ((a), (b)))
+#define __nds32__sub8(a, b) \
+  (__builtin_nds32_sub8 ((a), (b)))
+#define __nds32__v_usub8(a, b) \
+  (__builtin_nds32_v_usub8 ((a), (b)))
+#define __nds32__v_ssub8(a, b) \
+  (__builtin_nds32_v_ssub8 ((a), (b)))
+#define __nds32__rsub8(a, b) \
+  (__builtin_nds32_rsub8 ((a), (b)))
+#define __nds32__v_rsub8(a, b) \
+  (__builtin_nds32_v_rsub8 ((a), (b)))
+#define __nds32__ursub8(a, b) \
+  (__builtin_nds32_ursub8 ((a), (b)))
+#define __nds32__v_ursub8(a, b) \
+  (__builtin_nds32_v_ursub8 ((a), (b)))
+#define __nds32__ksub8(a, b) \
+  (__builtin_nds32_ksub8 ((a), (b)))
+#define __nds32__v_ksub8(a, b) \
+  (__builtin_nds32_v_ksub8 ((a), (b)))
+#define __nds32__uksub8(a, b) \
+  (__builtin_nds32_uksub8 ((a), (b)))
+#define __nds32__v_uksub8(a, b) \
+  (__builtin_nds32_v_uksub8 ((a), (b)))
+/* sra16/srl16/sll16/ksll16/kslra16 wrappers (plain, _u and v_ forms).  */
+#define __nds32__sra16(a, b) \
+  (__builtin_nds32_sra16 ((a), (b)))
+#define __nds32__v_sra16(a, b) \
+  (__builtin_nds32_v_sra16 ((a), (b)))
+#define __nds32__sra16_u(a, b) \
+  (__builtin_nds32_sra16_u ((a), (b)))
+#define __nds32__v_sra16_u(a, b) \
+  (__builtin_nds32_v_sra16_u ((a), (b)))
+#define __nds32__srl16(a, b) \
+  (__builtin_nds32_srl16 ((a), (b)))
+#define __nds32__v_srl16(a, b) \
+  (__builtin_nds32_v_srl16 ((a), (b)))
+#define __nds32__srl16_u(a, b) \
+  (__builtin_nds32_srl16_u ((a), (b)))
+#define __nds32__v_srl16_u(a, b) \
+  (__builtin_nds32_v_srl16_u ((a), (b)))
+#define __nds32__sll16(a, b) \
+  (__builtin_nds32_sll16 ((a), (b)))
+#define __nds32__v_sll16(a, b) \
+  (__builtin_nds32_v_sll16 ((a), (b)))
+#define __nds32__ksll16(a, b) \
+  (__builtin_nds32_ksll16 ((a), (b)))
+#define __nds32__v_ksll16(a, b) \
+  (__builtin_nds32_v_ksll16 ((a), (b)))
+#define __nds32__kslra16(a, b) \
+  (__builtin_nds32_kslra16 ((a), (b)))
+#define __nds32__v_kslra16(a, b) \
+  (__builtin_nds32_v_kslra16 ((a), (b)))
+#define __nds32__kslra16_u(a, b) \
+  (__builtin_nds32_kslra16_u ((a), (b)))
+#define __nds32__v_kslra16_u(a, b) \
+  (__builtin_nds32_v_kslra16_u ((a), (b)))
+/* cmpeq16/cmplt16/cmple16 wrappers over the same-named builtins.  */
+#define __nds32__cmpeq16(a, b) \
+  (__builtin_nds32_cmpeq16 ((a), (b)))
+#define __nds32__v_scmpeq16(a, b) \
+  (__builtin_nds32_v_scmpeq16 ((a), (b)))
+#define __nds32__v_ucmpeq16(a, b) \
+  (__builtin_nds32_v_ucmpeq16 ((a), (b)))
+#define __nds32__scmplt16(a, b) \
+  (__builtin_nds32_scmplt16 ((a), (b)))
+#define __nds32__v_scmplt16(a, b) \
+  (__builtin_nds32_v_scmplt16 ((a), (b)))
+#define __nds32__scmple16(a, b) \
+  (__builtin_nds32_scmple16 ((a), (b)))
+#define __nds32__v_scmple16(a, b) \
+  (__builtin_nds32_v_scmple16 ((a), (b)))
+#define __nds32__ucmplt16(a, b) \
+  (__builtin_nds32_ucmplt16 ((a), (b)))
+#define __nds32__v_ucmplt16(a, b) \
+  (__builtin_nds32_v_ucmplt16 ((a), (b)))
+#define __nds32__ucmple16(a, b) \
+  (__builtin_nds32_ucmple16 ((a), (b)))
+#define __nds32__v_ucmple16(a, b) \
+  (__builtin_nds32_v_ucmple16 ((a), (b)))
+/* cmpeq8/cmplt8/cmple8 wrappers over the same-named builtins.  */
+#define __nds32__cmpeq8(a, b) \
+  (__builtin_nds32_cmpeq8 ((a), (b)))
+#define __nds32__v_scmpeq8(a, b) \
+  (__builtin_nds32_v_scmpeq8 ((a), (b)))
+#define __nds32__v_ucmpeq8(a, b) \
+  (__builtin_nds32_v_ucmpeq8 ((a), (b)))
+#define __nds32__scmplt8(a, b) \
+  (__builtin_nds32_scmplt8 ((a), (b)))
+#define __nds32__v_scmplt8(a, b) \
+  (__builtin_nds32_v_scmplt8 ((a), (b)))
+#define __nds32__scmple8(a, b) \
+  (__builtin_nds32_scmple8 ((a), (b)))
+#define __nds32__v_scmple8(a, b) \
+  (__builtin_nds32_v_scmple8 ((a), (b)))
+#define __nds32__ucmplt8(a, b) \
+  (__builtin_nds32_ucmplt8 ((a), (b)))
+#define __nds32__v_ucmplt8(a, b) \
+  (__builtin_nds32_v_ucmplt8 ((a), (b)))
+#define __nds32__ucmple8(a, b) \
+  (__builtin_nds32_ucmple8 ((a), (b)))
+#define __nds32__v_ucmple8(a, b) \
+  (__builtin_nds32_v_ucmple8 ((a), (b)))
+/* min16/max16/clip16/khm16/khmx16/kabs16 wrappers; kabs16 takes only a.  */
+#define __nds32__smin16(a, b) \
+  (__builtin_nds32_smin16 ((a), (b)))
+#define __nds32__v_smin16(a, b) \
+  (__builtin_nds32_v_smin16 ((a), (b)))
+#define __nds32__umin16(a, b) \
+  (__builtin_nds32_umin16 ((a), (b)))
+#define __nds32__v_umin16(a, b) \
+  (__builtin_nds32_v_umin16 ((a), (b)))
+#define __nds32__smax16(a, b) \
+  (__builtin_nds32_smax16 ((a), (b)))
+#define __nds32__v_smax16(a, b) \
+  (__builtin_nds32_v_smax16 ((a), (b)))
+#define __nds32__umax16(a, b) \
+  (__builtin_nds32_umax16 ((a), (b)))
+#define __nds32__v_umax16(a, b) \
+  (__builtin_nds32_v_umax16 ((a), (b)))
+#define __nds32__sclip16(a, b) \
+  (__builtin_nds32_sclip16 ((a), (b)))
+#define __nds32__v_sclip16(a, b) \
+  (__builtin_nds32_v_sclip16 ((a), (b)))
+#define __nds32__uclip16(a, b) \
+  (__builtin_nds32_uclip16 ((a), (b)))
+#define __nds32__v_uclip16(a, b) \
+  (__builtin_nds32_v_uclip16 ((a), (b)))
+#define __nds32__khm16(a, b) \
+  (__builtin_nds32_khm16 ((a), (b)))
+#define __nds32__v_khm16(a, b) \
+  (__builtin_nds32_v_khm16 ((a), (b)))
+#define __nds32__khmx16(a, b) \
+  (__builtin_nds32_khmx16 ((a), (b)))
+#define __nds32__v_khmx16(a, b) \
+  (__builtin_nds32_v_khmx16 ((a), (b)))
+#define __nds32__kabs16(a) \
+  (__builtin_nds32_kabs16 ((a)))
+#define __nds32__v_kabs16(a) \
+  (__builtin_nds32_v_kabs16 ((a)))
+/* min8/max8/kabs8 wrappers; the kabs8 forms take a single argument.  */
+#define __nds32__smin8(a, b) \
+  (__builtin_nds32_smin8 ((a), (b)))
+#define __nds32__v_smin8(a, b) \
+  (__builtin_nds32_v_smin8 ((a), (b)))
+#define __nds32__umin8(a, b) \
+  (__builtin_nds32_umin8 ((a), (b)))
+#define __nds32__v_umin8(a, b) \
+  (__builtin_nds32_v_umin8 ((a), (b)))
+#define __nds32__smax8(a, b) \
+  (__builtin_nds32_smax8 ((a), (b)))
+#define __nds32__v_smax8(a, b) \
+  (__builtin_nds32_v_smax8 ((a), (b)))
+#define __nds32__umax8(a, b) \
+  (__builtin_nds32_umax8 ((a), (b)))
+#define __nds32__v_umax8(a, b) \
+  (__builtin_nds32_v_umax8 ((a), (b)))
+#define __nds32__kabs8(a) \
+  (__builtin_nds32_kabs8 ((a)))
+#define __nds32__v_kabs8(a) \
+  (__builtin_nds32_v_kabs8 ((a)))
+/* sunpkd8xx/zunpkd8xx wrappers; each takes a single argument.  */
+#define __nds32__sunpkd810(a) \
+  (__builtin_nds32_sunpkd810 ((a)))
+#define __nds32__v_sunpkd810(a) \
+  (__builtin_nds32_v_sunpkd810 ((a)))
+#define __nds32__sunpkd820(a) \
+  (__builtin_nds32_sunpkd820 ((a)))
+#define __nds32__v_sunpkd820(a) \
+  (__builtin_nds32_v_sunpkd820 ((a)))
+#define __nds32__sunpkd830(a) \
+  (__builtin_nds32_sunpkd830 ((a)))
+#define __nds32__v_sunpkd830(a) \
+  (__builtin_nds32_v_sunpkd830 ((a)))
+#define __nds32__sunpkd831(a) \
+  (__builtin_nds32_sunpkd831 ((a)))
+#define __nds32__v_sunpkd831(a) \
+  (__builtin_nds32_v_sunpkd831 ((a)))
+#define __nds32__zunpkd810(a) \
+  (__builtin_nds32_zunpkd810 ((a)))
+#define __nds32__v_zunpkd810(a) \
+  (__builtin_nds32_v_zunpkd810 ((a)))
+#define __nds32__zunpkd820(a) \
+  (__builtin_nds32_zunpkd820 ((a)))
+#define __nds32__v_zunpkd820(a) \
+  (__builtin_nds32_v_zunpkd820 ((a)))
+#define __nds32__zunpkd830(a) \
+  (__builtin_nds32_zunpkd830 ((a)))
+#define __nds32__v_zunpkd830(a) \
+  (__builtin_nds32_v_zunpkd830 ((a)))
+#define __nds32__zunpkd831(a) \
+  (__builtin_nds32_zunpkd831 ((a)))
+#define __nds32__v_zunpkd831(a) \
+  (__builtin_nds32_v_zunpkd831 ((a)))
+/* raddw/uraddw/rsubw/ursubw wrappers over the same-named builtins.  */
+#define __nds32__raddw(a, b) \
+  (__builtin_nds32_raddw ((a), (b)))
+#define __nds32__uraddw(a, b) \
+  (__builtin_nds32_uraddw ((a), (b)))
+#define __nds32__rsubw(a, b) \
+  (__builtin_nds32_rsubw ((a), (b)))
+#define __nds32__ursubw(a, b) \
+  (__builtin_nds32_ursubw ((a), (b)))
+/* sra_u, ksll and pk{bb,bt,tb,tt}16 wrappers over the same-named builtins.  */
+#define __nds32__sra_u(a, b) \
+  (__builtin_nds32_sra_u ((a), (b)))
+#define __nds32__ksll(a, b) \
+  (__builtin_nds32_ksll ((a), (b)))
+#define __nds32__pkbb16(a, b) \
+  (__builtin_nds32_pkbb16 ((a), (b)))
+#define __nds32__v_pkbb16(a, b) \
+  (__builtin_nds32_v_pkbb16 ((a), (b)))
+#define __nds32__pkbt16(a, b) \
+  (__builtin_nds32_pkbt16 ((a), (b)))
+#define __nds32__v_pkbt16(a, b) \
+  (__builtin_nds32_v_pkbt16 ((a), (b)))
+#define __nds32__pktb16(a, b) \
+  (__builtin_nds32_pktb16 ((a), (b)))
+#define __nds32__v_pktb16(a, b) \
+  (__builtin_nds32_v_pktb16 ((a), (b)))
+#define __nds32__pktt16(a, b) \
+  (__builtin_nds32_pktt16 ((a), (b)))
+#define __nds32__v_pktt16(a, b) \
+  (__builtin_nds32_v_pktt16 ((a), (b)))
+/* smmul/kwmmul take (a, b); kmmac/kmmsb take (r, a, b).  Plain and _u forms.  */
+#define __nds32__smmul(a, b) \
+  (__builtin_nds32_smmul ((a), (b)))
+#define __nds32__smmul_u(a, b) \
+  (__builtin_nds32_smmul_u ((a), (b)))
+#define __nds32__kmmac(r, a, b) \
+  (__builtin_nds32_kmmac ((r), (a), (b)))
+#define __nds32__kmmac_u(r, a, b) \
+  (__builtin_nds32_kmmac_u ((r), (a), (b)))
+#define __nds32__kmmsb(r, a, b) \
+  (__builtin_nds32_kmmsb ((r), (a), (b)))
+#define __nds32__kmmsb_u(r, a, b) \
+  (__builtin_nds32_kmmsb_u ((r), (a), (b)))
+#define __nds32__kwmmul(a, b) \
+  (__builtin_nds32_kwmmul ((a), (b)))
+#define __nds32__kwmmul_u(a, b) \
+  (__builtin_nds32_kwmmul_u ((a), (b)))
+/* smmwb/smmwt take (a, b); kmmawb/kmmawt take (r, a, b).  */
+#define __nds32__smmwb(a, b) \
+  (__builtin_nds32_smmwb ((a), (b)))
+#define __nds32__v_smmwb(a, b) \
+  (__builtin_nds32_v_smmwb ((a), (b)))
+#define __nds32__smmwb_u(a, b) \
+  (__builtin_nds32_smmwb_u ((a), (b)))
+#define __nds32__v_smmwb_u(a, b) \
+  (__builtin_nds32_v_smmwb_u ((a), (b)))
+#define __nds32__smmwt(a, b) \
+  (__builtin_nds32_smmwt ((a), (b)))
+#define __nds32__v_smmwt(a, b) \
+  (__builtin_nds32_v_smmwt ((a), (b)))
+#define __nds32__smmwt_u(a, b) \
+  (__builtin_nds32_smmwt_u ((a), (b)))
+#define __nds32__v_smmwt_u(a, b) \
+  (__builtin_nds32_v_smmwt_u ((a), (b)))
+#define __nds32__kmmawb(r, a, b) \
+  (__builtin_nds32_kmmawb ((r), (a), (b)))
+#define __nds32__v_kmmawb(r, a, b) \
+  (__builtin_nds32_v_kmmawb ((r), (a), (b)))
+#define __nds32__kmmawb_u(r, a, b) \
+  (__builtin_nds32_kmmawb_u ((r), (a), (b)))
+#define __nds32__v_kmmawb_u(r, a, b) \
+  (__builtin_nds32_v_kmmawb_u ((r), (a), (b)))
+#define __nds32__kmmawt(r, a, b) \
+  (__builtin_nds32_kmmawt ((r), (a), (b)))
+#define __nds32__v_kmmawt(r, a, b) \
+  (__builtin_nds32_v_kmmawt ((r), (a), (b)))
+#define __nds32__kmmawt_u(r, a, b) \
+  (__builtin_nds32_kmmawt_u ((r), (a), (b)))
+#define __nds32__v_kmmawt_u(r, a, b) \
+  (__builtin_nds32_v_kmmawt_u ((r), (a), (b)))
+/* smbb..smxds take (a, b); kmabb..kmsxda take (r, a, b).  */
+#define __nds32__smbb(a, b) \
+  (__builtin_nds32_smbb ((a), (b)))
+#define __nds32__v_smbb(a, b) \
+  (__builtin_nds32_v_smbb ((a), (b)))
+#define __nds32__smbt(a, b) \
+  (__builtin_nds32_smbt ((a), (b)))
+#define __nds32__v_smbt(a, b) \
+  (__builtin_nds32_v_smbt ((a), (b)))
+#define __nds32__smtt(a, b) \
+  (__builtin_nds32_smtt ((a), (b)))
+#define __nds32__v_smtt(a, b) \
+  (__builtin_nds32_v_smtt ((a), (b)))
+#define __nds32__kmda(a, b) \
+  (__builtin_nds32_kmda ((a), (b)))
+#define __nds32__v_kmda(a, b) \
+  (__builtin_nds32_v_kmda ((a), (b)))
+#define __nds32__kmxda(a, b) \
+  (__builtin_nds32_kmxda ((a), (b)))
+#define __nds32__v_kmxda(a, b) \
+  (__builtin_nds32_v_kmxda ((a), (b)))
+#define __nds32__smds(a, b) \
+  (__builtin_nds32_smds ((a), (b)))
+#define __nds32__v_smds(a, b) \
+  (__builtin_nds32_v_smds ((a), (b)))
+#define __nds32__smdrs(a, b) \
+  (__builtin_nds32_smdrs ((a), (b)))
+#define __nds32__v_smdrs(a, b) \
+  (__builtin_nds32_v_smdrs ((a), (b)))
+#define __nds32__smxds(a, b) \
+  (__builtin_nds32_smxds ((a), (b)))
+#define __nds32__v_smxds(a, b) \
+  (__builtin_nds32_v_smxds ((a), (b)))
+#define __nds32__kmabb(r, a, b) \
+  (__builtin_nds32_kmabb ((r), (a), (b)))
+#define __nds32__v_kmabb(r, a, b) \
+  (__builtin_nds32_v_kmabb ((r), (a), (b)))
+#define __nds32__kmabt(r, a, b) \
+  (__builtin_nds32_kmabt ((r), (a), (b)))
+#define __nds32__v_kmabt(r, a, b) \
+  (__builtin_nds32_v_kmabt ((r), (a), (b)))
+#define __nds32__kmatt(r, a, b) \
+  (__builtin_nds32_kmatt ((r), (a), (b)))
+#define __nds32__v_kmatt(r, a, b) \
+  (__builtin_nds32_v_kmatt ((r), (a), (b)))
+#define __nds32__kmada(r, a, b) \
+  (__builtin_nds32_kmada ((r), (a), (b)))
+#define __nds32__v_kmada(r, a, b) \
+  (__builtin_nds32_v_kmada ((r), (a), (b)))
+#define __nds32__kmaxda(r, a, b) \
+  (__builtin_nds32_kmaxda ((r), (a), (b)))
+#define __nds32__v_kmaxda(r, a, b) \
+  (__builtin_nds32_v_kmaxda ((r), (a), (b)))
+#define __nds32__kmads(r, a, b) \
+  (__builtin_nds32_kmads ((r), (a), (b)))
+#define __nds32__v_kmads(r, a, b) \
+  (__builtin_nds32_v_kmads ((r), (a), (b)))
+#define __nds32__kmadrs(r, a, b) \
+  (__builtin_nds32_kmadrs ((r), (a), (b)))
+#define __nds32__v_kmadrs(r, a, b) \
+  (__builtin_nds32_v_kmadrs ((r), (a), (b)))
+#define __nds32__kmaxds(r, a, b) \
+  (__builtin_nds32_kmaxds ((r), (a), (b)))
+#define __nds32__v_kmaxds(r, a, b) \
+  (__builtin_nds32_v_kmaxds ((r), (a), (b)))
+#define __nds32__kmsda(r, a, b) \
+  (__builtin_nds32_kmsda ((r), (a), (b)))
+#define __nds32__v_kmsda(r, a, b) \
+  (__builtin_nds32_v_kmsda ((r), (a), (b)))
+#define __nds32__kmsxda(r, a, b) \
+  (__builtin_nds32_kmsxda ((r), (a), (b)))
+#define __nds32__v_kmsxda(r, a, b) \
+  (__builtin_nds32_v_kmsxda ((r), (a), (b)))
+/* smal wrappers (plain and v_ forms) over the same-named builtins.  */
+#define __nds32__smal(a, b) \
+  (__builtin_nds32_smal ((a), (b)))
+#define __nds32__v_smal(a, b) \
+  (__builtin_nds32_v_smal ((a), (b)))
+/* bitrev/wext take (a, b); bpick/insb take (r, a, b).  */
+#define __nds32__bitrev(a, b) \
+  (__builtin_nds32_bitrev ((a), (b)))
+#define __nds32__wext(a, b) \
+  (__builtin_nds32_wext ((a), (b)))
+#define __nds32__bpick(r, a, b) \
+  (__builtin_nds32_bpick ((r), (a), (b)))
+#define __nds32__insb(r, a, b) \
+  (__builtin_nds32_insb ((r), (a), (b)))
+/* add64/sub64 family: wrappers over the same-named builtins.  */
+#define __nds32__sadd64(a, b) \
+  (__builtin_nds32_sadd64 ((a), (b)))
+#define __nds32__uadd64(a, b) \
+  (__builtin_nds32_uadd64 ((a), (b)))
+#define __nds32__radd64(a, b) \
+  (__builtin_nds32_radd64 ((a), (b)))
+#define __nds32__uradd64(a, b) \
+  (__builtin_nds32_uradd64 ((a), (b)))
+#define __nds32__kadd64(a, b) \
+  (__builtin_nds32_kadd64 ((a), (b)))
+#define __nds32__ukadd64(a, b) \
+  (__builtin_nds32_ukadd64 ((a), (b)))
+#define __nds32__ssub64(a, b) \
+  (__builtin_nds32_ssub64 ((a), (b)))
+#define __nds32__usub64(a, b) \
+  (__builtin_nds32_usub64 ((a), (b)))
+#define __nds32__rsub64(a, b) \
+  (__builtin_nds32_rsub64 ((a), (b)))
+#define __nds32__ursub64(a, b) \
+  (__builtin_nds32_ursub64 ((a), (b)))
+#define __nds32__ksub64(a, b) \
+  (__builtin_nds32_ksub64 ((a), (b)))
+#define __nds32__uksub64(a, b) \
+  (__builtin_nds32_uksub64 ((a), (b)))
+/* mar64 and msr64 families: wrappers that all take (r, a, b).  */
+#define __nds32__smar64(r, a, b) \
+  (__builtin_nds32_smar64 ((r), (a), (b)))
+#define __nds32__smsr64(r, a, b) \
+  (__builtin_nds32_smsr64 ((r), (a), (b)))
+#define __nds32__umar64(r, a, b) \
+  (__builtin_nds32_umar64 ((r), (a), (b)))
+#define __nds32__umsr64(r, a, b) \
+  (__builtin_nds32_umsr64 ((r), (a), (b)))
+#define __nds32__kmar64(r, a, b) \
+  (__builtin_nds32_kmar64 ((r), (a), (b)))
+#define __nds32__kmsr64(r, a, b) \
+  (__builtin_nds32_kmsr64 ((r), (a), (b)))
+#define __nds32__ukmar64(r, a, b) \
+  (__builtin_nds32_ukmar64 ((r), (a), (b)))
+#define __nds32__ukmsr64(r, a, b) \
+  (__builtin_nds32_ukmsr64 ((r), (a), (b)))
+/* smalbb..smslxda wrappers (plain and v_ forms); all take (r, a, b).  */
+#define __nds32__smalbb(r, a, b) \
+  (__builtin_nds32_smalbb ((r), (a), (b)))
+#define __nds32__v_smalbb(r, a, b) \
+  (__builtin_nds32_v_smalbb ((r), (a), (b)))
+#define __nds32__smalbt(r, a, b) \
+  (__builtin_nds32_smalbt ((r), (a), (b)))
+#define __nds32__v_smalbt(r, a, b) \
+  (__builtin_nds32_v_smalbt ((r), (a), (b)))
+#define __nds32__smaltt(r, a, b) \
+  (__builtin_nds32_smaltt ((r), (a), (b)))
+#define __nds32__v_smaltt(r, a, b) \
+  (__builtin_nds32_v_smaltt ((r), (a), (b)))
+#define __nds32__smalda(r, a, b) \
+  (__builtin_nds32_smalda ((r), (a), (b)))
+#define __nds32__v_smalda(r, a, b) \
+  (__builtin_nds32_v_smalda ((r), (a), (b)))
+#define __nds32__smalxda(r, a, b) \
+  (__builtin_nds32_smalxda ((r), (a), (b)))
+#define __nds32__v_smalxda(r, a, b) \
+  (__builtin_nds32_v_smalxda ((r), (a), (b)))
+#define __nds32__smalds(r, a, b) \
+  (__builtin_nds32_smalds ((r), (a), (b)))
+#define __nds32__v_smalds(r, a, b) \
+  (__builtin_nds32_v_smalds ((r), (a), (b)))
+#define __nds32__smaldrs(r, a, b) \
+  (__builtin_nds32_smaldrs ((r), (a), (b)))
+#define __nds32__v_smaldrs(r, a, b) \
+  (__builtin_nds32_v_smaldrs ((r), (a), (b)))
+#define __nds32__smalxds(r, a, b) \
+  (__builtin_nds32_smalxds ((r), (a), (b)))
+#define __nds32__v_smalxds(r, a, b) \
+  (__builtin_nds32_v_smalxds ((r), (a), (b)))
+#define __nds32__smslda(r, a, b) \
+  (__builtin_nds32_smslda ((r), (a), (b)))
+#define __nds32__v_smslda(r, a, b) \
+  (__builtin_nds32_v_smslda ((r), (a), (b)))
+#define __nds32__smslxda(r, a, b) \
+  (__builtin_nds32_smslxda ((r), (a), (b)))
+#define __nds32__v_smslxda(r, a, b) \
+  (__builtin_nds32_v_smslxda ((r), (a), (b)))
+/* smul16/smulx16/umul16/umulx16 wrappers over the same-named builtins.  */
+#define __nds32__smul16(a, b) \
+  (__builtin_nds32_smul16 ((a), (b)))
+#define __nds32__v_smul16(a, b) \
+  (__builtin_nds32_v_smul16 ((a), (b)))
+#define __nds32__smulx16(a, b) \
+  (__builtin_nds32_smulx16 ((a), (b)))
+#define __nds32__v_smulx16(a, b) \
+  (__builtin_nds32_v_smulx16 ((a), (b)))
+#define __nds32__umul16(a, b) \
+  (__builtin_nds32_umul16 ((a), (b)))
+#define __nds32__v_umul16(a, b) \
+  (__builtin_nds32_v_umul16 ((a), (b)))
+#define __nds32__umulx16(a, b) \
+  (__builtin_nds32_umulx16 ((a), (b)))
+#define __nds32__v_umulx16(a, b) \
+  (__builtin_nds32_v_umulx16 ((a), (b)))
+/* uclip32/sclip32 take (a, imm); kabs takes a single argument.  */
+#define __nds32__uclip32(a, imm) \
+  (__builtin_nds32_uclip32 ((a), (imm)))
+#define __nds32__sclip32(a, imm) \
+  (__builtin_nds32_sclip32 ((a), (imm)))
+#define __nds32__kabs(a) \
+  (__builtin_nds32_kabs ((a)))
+/* Argument-less wrapper for __builtin_nds32_no_ext_zol.  */
+#define __nds32__no_ext_zol() \
+  (__builtin_nds32_no_ext_zol())
+/* Argument-less wrappers: unaligned_feature, enable/disable_unaligned.  */
+#define __nds32__unaligned_feature() \
+  (__builtin_nds32_unaligned_feature())
+#define __nds32__enable_unaligned() \
+  (__builtin_nds32_enable_unaligned())
+#define __nds32__disable_unaligned() \
+  (__builtin_nds32_disable_unaligned())
+/* get_unaligned_{u16x2,s16x2,u8x4,s8x4} wrappers; each takes one argument.  */
+#define __nds32__get_unaligned_u16x2(a) \
+  (__builtin_nds32_get_unaligned_u16x2 ((a)))
+#define __nds32__get_unaligned_s16x2(a) \
+  (__builtin_nds32_get_unaligned_s16x2 ((a)))
+#define __nds32__get_unaligned_u8x4(a) \
+  (__builtin_nds32_get_unaligned_u8x4 ((a)))
+#define __nds32__get_unaligned_s8x4(a) \
+  (__builtin_nds32_get_unaligned_s8x4 ((a)))
+/* put_unaligned_{u16x2,s16x2,u8x4,s8x4} wrappers; each takes (a, data).  */
+#define __nds32__put_unaligned_u16x2(a, data) \
+  (__builtin_nds32_put_unaligned_u16x2 ((a), (data)))
+#define __nds32__put_unaligned_s16x2(a, data) \
+  (__builtin_nds32_put_unaligned_s16x2 ((a), (data)))
+#define __nds32__put_unaligned_u8x4(a, data) \
+  (__builtin_nds32_put_unaligned_u8x4 ((a), (data)))
+#define __nds32__put_unaligned_s8x4(a, data) \
+  (__builtin_nds32_put_unaligned_s8x4 ((a), (data)))
+/* Shorthand for __attribute__((signature)).  */
+#define NDS32ATTR_SIGNATURE              __attribute__((signature))
+
 #endif /* nds32_intrinsic.h */
diff --git a/gcc/config/nds32/nds32_isr.h b/gcc/config/nds32/nds32_isr.h
new file mode 100644
index 0000000..6fabd3e
--- /dev/null
+++ b/gcc/config/nds32/nds32_isr.h
@@ -0,0 +1,526 @@
+/* ISR definitions of Andes NDS32 cpu for GNU compiler
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _NDS32_ISR_H
+#define _NDS32_ISR_H
+
+/* Attribute of an interrupt or exception handler:
+
+   NDS32_READY_NESTED: Handler is interruptible if the user re-enables GIE.
+   NDS32_NESTED      : This handler is interruptible.  This is not suitable
+                       for exception handlers.
+   NDS32_NOT_NESTED  : This handler is NOT interruptible.  Users have to do
+                       some work if nesting is wanted.
+   NDS32_CRITICAL    : This handler is critical ISR, which means it is small
+                       and efficient.  */
+#define NDS32_READY_NESTED   0
+#define NDS32_NESTED         1
+#define NDS32_NOT_NESTED     2
+#define NDS32_CRITICAL       3
+
+/* Attribute of an interrupt or exception handler:
+
+   NDS32_SAVE_CALLER_REGS: Save partial registers (presumably caller-saved).
+   NDS32_SAVE_ALL_REGS   : Save all registers in a table.  */
+#define NDS32_SAVE_CALLER_REGS   0
+#define NDS32_SAVE_ALL_REGS      1
+
+/* There are two versions of the register table for interrupt and exception
+   handlers: one for 16-register CPUs, the other for 32-register CPUs.  These
+   structures are used for context switching or system call handling.  The
+   address of this data can be obtained from the handler's input argument.
+
+   For system call handling, r0 to r5 are used to pass arguments.  If more
+   arguments are used they are put on the stack and their starting address is
+   in sp.  Return value of system call can be put into r0 and r1 upon exit from
+   system call handler.  System call ID is in a system register and it can be
+   fetched via intrinsic function.  For more information please read ABI and
+   other related documents.
+
+   For context switching, at least 2 values need to be saved in the kernel:
+   IPC and the stack address of the current task.  Use an intrinsic function
+   to get IPC, and the input argument of the handler function + 8 to get the
+   stack address of the current task.  To do context switching, replace
+   new_sp with the stack address of the new task and replace the IPC system
+   register with the IPC of the new task, then just return from the handler.
+   The context switch will then happen.  */
+
+/* Register table, 32-register version: r0-r27, then fp, gp, lp, sp.  */
+typedef struct
+{
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  int r8;
+  int r9;
+  int r10;
+  int r11;
+  int r12;
+  int r13;
+  int r14;
+  int r15;
+  int r16;
+  int r17;
+  int r18;
+  int r19;
+  int r20;
+  int r21;
+  int r22;
+  int r23;
+  int r24;
+  int r25;
+  int r26;
+  int r27;
+  int fp;
+  int gp;
+  int lp;
+  int sp;
+} NDS32_GPR32;
+
+/* Register table, 16-register version: r0-r10, r15, then fp, gp, lp, sp.  */
+typedef struct
+{
+  int r0;
+  int r1;
+  int r2;
+  int r3;
+  int r4;
+  int r5;
+  int r6;
+  int r7;
+  int r8;
+  int r9;
+  int r10;
+  int r15;
+  int fp;
+  int gp;
+  int lp;
+  int sp;
+} NDS32_GPR16;
+
+
+/* Use NDS32_REG32_TAB or NDS32_REG16_TAB in your program to access the
+   register table, either by index (u.reg_a) or by name (u.reg_s).  */
+typedef struct
+{
+  union
+    {
+      int          reg_a[32] ;
+      NDS32_GPR32  reg_s ;
+    } u ;
+} NDS32_REG32_TAB;
+
+typedef struct
+{
+  union
+    {
+      int          reg_a[16] ;
+      NDS32_GPR16  reg_s ;
+    } u ;
+} NDS32_REG16_TAB;
+/* Presumably the lo/hi halves of d0 and d1; see dxr in NDS32_CONTEXT.  */
+typedef struct
+{
+  int    d0lo;
+  int    d0hi;
+  int    d1lo;
+  int    d1hi;
+} NDS32_DX_TAB;
+/* fsr0-fsr7 as floats; even/odd pairs are swapped unless __NDS32_EB__.  */
+typedef struct
+{
+#ifdef __NDS32_EB__
+  float    fsr0;
+  float    fsr1;
+  float    fsr2;
+  float    fsr3;
+  float    fsr4;
+  float    fsr5;
+  float    fsr6;
+  float    fsr7;
+#else
+  float    fsr1;
+  float    fsr0;
+  float    fsr3;
+  float    fsr2;
+  float    fsr5;
+  float    fsr4;
+  float    fsr7;
+  float    fsr6;
+#endif
+} NDS32_FSR8;
+/* dsr0-dsr3 as doubles; unioned with NDS32_FSR8 in NDS32_FPU8_TAB.  */
+typedef struct
+{
+  double   dsr0;
+  double   dsr1;
+  double   dsr2;
+  double   dsr3;
+} NDS32_DSR4;
+/* fsr0-fsr15 as floats; even/odd pairs are swapped unless __NDS32_EB__.  */
+typedef struct
+{
+#ifdef __NDS32_EB__
+  float    fsr0;
+  float    fsr1;
+  float    fsr2;
+  float    fsr3;
+  float    fsr4;
+  float    fsr5;
+  float    fsr6;
+  float    fsr7;
+  float    fsr8;
+  float    fsr9;
+  float    fsr10;
+  float    fsr11;
+  float    fsr12;
+  float    fsr13;
+  float    fsr14;
+  float    fsr15;
+#else
+  float    fsr1;
+  float    fsr0;
+  float    fsr3;
+  float    fsr2;
+  float    fsr5;
+  float    fsr4;
+  float    fsr7;
+  float    fsr6;
+  float    fsr9;
+  float    fsr8;
+  float    fsr11;
+  float    fsr10;
+  float    fsr13;
+  float    fsr12;
+  float    fsr15;
+  float    fsr14;
+#endif
+} NDS32_FSR16;
+/* dsr0-dsr7 as doubles; unioned with NDS32_FSR16 in NDS32_FPU16_TAB.  */
+typedef struct
+{
+  double   dsr0;
+  double   dsr1;
+  double   dsr2;
+  double   dsr3;
+  double   dsr4;
+  double   dsr5;
+  double   dsr6;
+  double   dsr7;
+} NDS32_DSR8;
+/* fsr0-fsr31 as floats; even/odd pairs are swapped unless __NDS32_EB__.  */
+typedef struct
+{
+#ifdef __NDS32_EB__
+  float    fsr0;
+  float    fsr1;
+  float    fsr2;
+  float    fsr3;
+  float    fsr4;
+  float    fsr5;
+  float    fsr6;
+  float    fsr7;
+  float    fsr8;
+  float    fsr9;
+  float    fsr10;
+  float    fsr11;
+  float    fsr12;
+  float    fsr13;
+  float    fsr14;
+  float    fsr15;
+  float    fsr16;
+  float    fsr17;
+  float    fsr18;
+  float    fsr19;
+  float    fsr20;
+  float    fsr21;
+  float    fsr22;
+  float    fsr23;
+  float    fsr24;
+  float    fsr25;
+  float    fsr26;
+  float    fsr27;
+  float    fsr28;
+  float    fsr29;
+  float    fsr30;
+  float    fsr31;
+#else
+  float    fsr1;
+  float    fsr0;
+  float    fsr3;
+  float    fsr2;
+  float    fsr5;
+  float    fsr4;
+  float    fsr7;
+  float    fsr6;
+  float    fsr9;
+  float    fsr8;
+  float    fsr11;
+  float    fsr10;
+  float    fsr13;
+  float    fsr12;
+  float    fsr15;
+  float    fsr14;
+  float    fsr17;
+  float    fsr16;
+  float    fsr19;
+  float    fsr18;
+  float    fsr21;
+  float    fsr20;
+  float    fsr23;
+  float    fsr22;
+  float    fsr25;
+  float    fsr24;
+  float    fsr27;
+  float    fsr26;
+  float    fsr29;
+  float    fsr28;
+  float    fsr31;
+  float    fsr30;
+#endif
+} NDS32_FSR32;
+/* dsr0-dsr15 as doubles; unioned with NDS32_FSR32 in NDS32_FPU32_TAB.  */
+typedef struct
+{
+  double   dsr0;
+  double   dsr1;
+  double   dsr2;
+  double   dsr3;
+  double   dsr4;
+  double   dsr5;
+  double   dsr6;
+  double   dsr7;
+  double   dsr8;
+  double   dsr9;
+  double   dsr10;
+  double   dsr11;
+  double   dsr12;
+  double   dsr13;
+  double   dsr14;
+  double   dsr15;
+} NDS32_DSR16;
+/* dsr0-dsr31 as doubles; unioned with NDS32_FSR32 in NDS32_FPU64_TAB.  */
+typedef struct
+{
+  double   dsr0;
+  double   dsr1;
+  double   dsr2;
+  double   dsr3;
+  double   dsr4;
+  double   dsr5;
+  double   dsr6;
+  double   dsr7;
+  double   dsr8;
+  double   dsr9;
+  double   dsr10;
+  double   dsr11;
+  double   dsr12;
+  double   dsr13;
+  double   dsr14;
+  double   dsr15;
+  double   dsr16;
+  double   dsr17;
+  double   dsr18;
+  double   dsr19;
+  double   dsr20;
+  double   dsr21;
+  double   dsr22;
+  double   dsr23;
+  double   dsr24;
+  double   dsr25;
+  double   dsr26;
+  double   dsr27;
+  double   dsr28;
+  double   dsr29;
+  double   dsr30;
+  double   dsr31;
+} NDS32_DSR32;
+/* NDS32_FSR8 and NDS32_DSR4 overlaid; fpr type for NDS32_EXT_FPU_CONFIG_0.  */
+typedef struct
+{
+  union
+    {
+      NDS32_FSR8   fsr_s ;
+      NDS32_DSR4   dsr_s ;
+    } u ;
+} NDS32_FPU8_TAB;
+/* NDS32_FSR16 and NDS32_DSR8 overlaid; fpr type for NDS32_EXT_FPU_CONFIG_1.  */
+typedef struct
+{
+  union
+    {
+      NDS32_FSR16  fsr_s ;
+      NDS32_DSR8   dsr_s ;
+    } u ;
+} NDS32_FPU16_TAB;
+/* NDS32_FSR32 and NDS32_DSR16 overlaid; fpr type for NDS32_EXT_FPU_CONFIG_2.  */
+typedef struct
+{
+  union
+    {
+      NDS32_FSR32  fsr_s ;
+      NDS32_DSR16  dsr_s ;
+    } u ;
+} NDS32_FPU32_TAB;
+/* NDS32_FSR32 and NDS32_DSR32 overlaid; fpr type for NDS32_EXT_FPU_CONFIG_3.  */
+typedef struct
+{
+  union
+    {
+      NDS32_FSR32  fsr_s ;
+      NDS32_DSR32  dsr_s ;
+    } u ;
+} NDS32_FPU64_TAB;
+/* Context layout: ipc, ipsw, optional FPU/DX/IFC members, then GPR table.  */
+typedef struct
+{
+  int    ipc;
+  int    ipsw;
+#if defined(NDS32_EXT_FPU_CONFIG_0)
+  NDS32_FPU8_TAB fpr;
+#elif defined(NDS32_EXT_FPU_CONFIG_1)
+  NDS32_FPU16_TAB fpr;
+#elif defined(NDS32_EXT_FPU_CONFIG_2)
+  NDS32_FPU32_TAB fpr;
+#elif defined(NDS32_EXT_FPU_CONFIG_3)
+  NDS32_FPU64_TAB fpr;
+#endif
+#if __NDS32_DX_REGS__
+  NDS32_DX_TAB dxr;
+#endif
+#if __NDS32_EXT_IFC__
+  int    ifc_lp;
+  int    filler;
+#endif
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS /* NOTE(review): 2nd name has no trailing "__"; confirm.  */
+  NDS32_REG16_TAB gpr;
+#else
+  NDS32_REG32_TAB gpr;
+#endif
+} NDS32_CONTEXT;
+
+/* Predefined Vector Definition.
+
+   For IVIC Mode: 9 to 14 are for hardware interrupts
+                  and 15 is for software interrupt.
+   For EVIC Mode: 9 to 72 are for hardware interrupts
+                  and software interrupt can be routed to any one of them.
+
+   You may want to define your hardware interrupts in the following way
+   for easy maintenance.
+
+     IVIC mode:
+       #define MY_HW_IVIC_TIMER NDS32_VECTOR_INTERRUPT_HW0 + 1
+       #define MY_HW_IVIC_USB   NDS32_VECTOR_INTERRUPT_HW0 + 3
+     EVIC mode:
+       #define MY_HW_EVIC_DMA   NDS32_VECTOR_INTERRUPT_HW0 + 2
+       #define MY_HW_EVIC_SWI   NDS32_VECTOR_INTERRUPT_HW0 + 10 */
+#define NDS32_VECTOR_RESET               0
+#define NDS32_VECTOR_TLB_FILL            1
+#define NDS32_VECTOR_PTE_NOT_PRESENT     2
+#define NDS32_VECTOR_TLB_MISC            3
+#define NDS32_VECTOR_TLB_VLPT_MISS       4
+#define NDS32_VECTOR_MACHINE_ERROR       5
+#define NDS32_VECTOR_DEBUG_RELATED       6
+#define NDS32_VECTOR_GENERAL_EXCEPTION   7
+#define NDS32_VECTOR_SYSCALL             8
+#define NDS32_VECTOR_INTERRUPT_HW0       9
+#define NDS32_VECTOR_INTERRUPT_HW1       10
+#define NDS32_VECTOR_INTERRUPT_HW2       11
+#define NDS32_VECTOR_INTERRUPT_HW3       12
+#define NDS32_VECTOR_INTERRUPT_HW4       13
+#define NDS32_VECTOR_INTERRUPT_HW5       14
+#define NDS32_VECTOR_INTERRUPT_HW6       15
+#define NDS32_VECTOR_SWI                 15  /* THIS IS FOR IVIC MODE ONLY */
+#define NDS32_VECTOR_INTERRUPT_HW7       16
+#define NDS32_VECTOR_INTERRUPT_HW8       17
+#define NDS32_VECTOR_INTERRUPT_HW9       18
+#define NDS32_VECTOR_INTERRUPT_HW10      19
+#define NDS32_VECTOR_INTERRUPT_HW11      20
+#define NDS32_VECTOR_INTERRUPT_HW12      21
+#define NDS32_VECTOR_INTERRUPT_HW13      22
+#define NDS32_VECTOR_INTERRUPT_HW14      23
+#define NDS32_VECTOR_INTERRUPT_HW15      24
+#define NDS32_VECTOR_INTERRUPT_HW16      25
+#define NDS32_VECTOR_INTERRUPT_HW17      26
+#define NDS32_VECTOR_INTERRUPT_HW18      27
+#define NDS32_VECTOR_INTERRUPT_HW19      28
+#define NDS32_VECTOR_INTERRUPT_HW20      29
+#define NDS32_VECTOR_INTERRUPT_HW21      30
+#define NDS32_VECTOR_INTERRUPT_HW22      31
+#define NDS32_VECTOR_INTERRUPT_HW23      32
+#define NDS32_VECTOR_INTERRUPT_HW24      33
+#define NDS32_VECTOR_INTERRUPT_HW25      34
+#define NDS32_VECTOR_INTERRUPT_HW26      35
+#define NDS32_VECTOR_INTERRUPT_HW27      36
+#define NDS32_VECTOR_INTERRUPT_HW28      37
+#define NDS32_VECTOR_INTERRUPT_HW29      38
+#define NDS32_VECTOR_INTERRUPT_HW30      39
+#define NDS32_VECTOR_INTERRUPT_HW31      40
+#define NDS32_VECTOR_INTERRUPT_HW32      41
+#define NDS32_VECTOR_INTERRUPT_HW33      42
+#define NDS32_VECTOR_INTERRUPT_HW34      43
+#define NDS32_VECTOR_INTERRUPT_HW35      44
+#define NDS32_VECTOR_INTERRUPT_HW36      45
+#define NDS32_VECTOR_INTERRUPT_HW37      46
+#define NDS32_VECTOR_INTERRUPT_HW38      47
+#define NDS32_VECTOR_INTERRUPT_HW39      48
+#define NDS32_VECTOR_INTERRUPT_HW40      49
+#define NDS32_VECTOR_INTERRUPT_HW41      50
+#define NDS32_VECTOR_INTERRUPT_HW42      51
+#define NDS32_VECTOR_INTERRUPT_HW43      52
+#define NDS32_VECTOR_INTERRUPT_HW44      53
+#define NDS32_VECTOR_INTERRUPT_HW45      54
+#define NDS32_VECTOR_INTERRUPT_HW46      55
+#define NDS32_VECTOR_INTERRUPT_HW47      56
+#define NDS32_VECTOR_INTERRUPT_HW48      57
+#define NDS32_VECTOR_INTERRUPT_HW49      58
+#define NDS32_VECTOR_INTERRUPT_HW50      59
+#define NDS32_VECTOR_INTERRUPT_HW51      60
+#define NDS32_VECTOR_INTERRUPT_HW52      61
+#define NDS32_VECTOR_INTERRUPT_HW53      62
+#define NDS32_VECTOR_INTERRUPT_HW54      63
+#define NDS32_VECTOR_INTERRUPT_HW55      64
+#define NDS32_VECTOR_INTERRUPT_HW56      65
+#define NDS32_VECTOR_INTERRUPT_HW57      66
+#define NDS32_VECTOR_INTERRUPT_HW58      67
+#define NDS32_VECTOR_INTERRUPT_HW59      68
+#define NDS32_VECTOR_INTERRUPT_HW60      69
+#define NDS32_VECTOR_INTERRUPT_HW61      70
+#define NDS32_VECTOR_INTERRUPT_HW62      71
+#define NDS32_VECTOR_INTERRUPT_HW63      72
+/* Attribute shorthands; EXCEPT/EXCEPTION and INTERRUPT/ISR are synonyms.  */
+#define NDS32ATTR_RESET(option)          __attribute__((reset(option)))
+#define NDS32ATTR_EXCEPT(type)           __attribute__((exception(type)))
+#define NDS32ATTR_EXCEPTION(type)        __attribute__((exception(type)))
+#define NDS32ATTR_INTERRUPT(type)        __attribute__((interrupt(type)))
+#define NDS32ATTR_ISR(type)              __attribute__((interrupt(type)))
+
+#endif /* nds32_isr.h */
diff --git a/gcc/config/nds32/pipelines.md b/gcc/config/nds32/pipelines.md
index f7e2fa8..6cd854d 100644
--- a/gcc/config/nds32/pipelines.md
+++ b/gcc/config/nds32/pipelines.md
@@ -18,12 +18,65 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.

-(define_automaton "nds32_machine")
+;; ------------------------------------------------------------------------
+;; Include N7 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-n7.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include N8 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-n8.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include E8 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-e8.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include N9/N10 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-n9-3r2w.md")
+(include "nds32-n9-2r1w.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include N10 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-n10.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include Graywolf pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-graywolf.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include N12/N13 pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-n13.md")
+
+
+;; ------------------------------------------------------------------------
+;; Include Panther pipeline settings.
+;; ------------------------------------------------------------------------
+(include "nds32-panther.md")
+
+
+;; ------------------------------------------------------------------------
+;; Define simple pipeline settings.
+;; ------------------------------------------------------------------------
+
+(define_automaton "nds32_simple_machine")

-(define_cpu_unit "general_unit" "nds32_machine")
+(define_cpu_unit "simple_unit" "nds32_simple_machine")

 (define_insn_reservation "simple_insn" 1
-			 (eq_attr "type" "unknown,load,store,move,alu,compare,branch,call,misc")
-			 "general_unit")
+  (eq_attr "pipeline_model" "simple")
+  "simple_unit")

 ;; ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/predicates.md b/gcc/config/nds32/predicates.md
index 05a039d..71a3615 100644
--- a/gcc/config/nds32/predicates.md
+++ b/gcc/config/nds32/predicates.md
@@ -24,25 +24,93 @@
 (define_predicate "nds32_greater_less_comparison_operator"
   (match_code "gt,ge,lt,le"))

+(define_predicate "nds32_float_comparison_operator"
+  (match_code "eq,ne,le,lt,ge,gt,ordered,unordered,ungt,unge,unlt,unle"))
+
+(define_predicate "nds32_movecc_comparison_operator"
+  (match_code "eq,ne,le,leu,ge,geu"))
+
 (define_special_predicate "nds32_logical_binary_operator"
   (match_code "and,ior,xor"))

+(define_special_predicate "nds32_conditional_call_comparison_operator"
+  (match_code "lt,ge"))
+
+(define_special_predicate "nds32_have_33_inst_operator"
+  (match_code "mult,and,ior,xor"))
+
 (define_predicate "nds32_symbolic_operand"
-  (match_code "const,symbol_ref,label_ref"))
+  (and (match_code "const,symbol_ref,label_ref")
+       (match_test "!(TARGET_ICT_MODEL_LARGE
+		      && nds32_indirect_call_referenced_p (op))")))
+
+(define_predicate "nds32_nonunspec_symbolic_operand"
+  (and (match_code "const,symbol_ref,label_ref")
+       (match_test "!flag_pic && nds32_const_unspec_p (op)
+		    && !(TARGET_ICT_MODEL_LARGE
+			 && nds32_indirect_call_referenced_p (op))")))
+
+(define_predicate "nds32_label_operand"
+  (match_code "label_ref"))

 (define_predicate "nds32_reg_constant_operand"
-  (ior (match_operand 0 "register_operand")
-       (match_operand 0 "const_int_operand")))
+  (match_code "reg,const_int"))

 (define_predicate "nds32_rimm15s_operand"
   (ior (match_operand 0 "register_operand")
        (and (match_operand 0 "const_int_operand")
 	    (match_test "satisfies_constraint_Is15 (op)"))))

+(define_predicate "nds32_rimm11s_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_operand 0 "const_int_operand")
+	    (match_test "satisfies_constraint_Is11 (op)"))))
+
+(define_predicate "nds32_imm_0_1_operand"
+  (and (match_operand 0 "const_int_operand")
+       (ior (match_test "satisfies_constraint_Iv00 (op)")
+	    (match_test "satisfies_constraint_Iv01 (op)"))))
+
+(define_predicate "nds32_imm_1_2_operand"
+  (and (match_operand 0 "const_int_operand")
+       (ior (match_test "satisfies_constraint_Iv01 (op)")
+	    (match_test "satisfies_constraint_Iv02 (op)"))))
+
+(define_predicate "nds32_imm_1_2_4_8_operand"
+  (and (match_operand 0 "const_int_operand")
+       (ior (ior (match_test "satisfies_constraint_Iv01 (op)")
+		 (match_test "satisfies_constraint_Iv02 (op)"))
+	    (ior (match_test "satisfies_constraint_Iv04 (op)")
+		 (match_test "satisfies_constraint_Iv08 (op)")))))
+
+(define_predicate "nds32_imm2u_operand"
+  (and (match_operand 0 "const_int_operand")
+       (match_test "satisfies_constraint_Iu02 (op)")))
+
+(define_predicate "nds32_imm4u_operand"
+  (and (match_operand 0 "const_int_operand")
+       (match_test "satisfies_constraint_Iu04 (op)")))
+
 (define_predicate "nds32_imm5u_operand"
   (and (match_operand 0 "const_int_operand")
        (match_test "satisfies_constraint_Iu05 (op)")))

+(define_predicate "nds32_imm6u_operand"
+  (and (match_operand 0 "const_int_operand")
+       (match_test "satisfies_constraint_Iu06 (op)")))
+
+(define_predicate "nds32_rimm4u_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "nds32_imm4u_operand")))
+
+(define_predicate "nds32_rimm5u_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "nds32_imm5u_operand")))
+
+(define_predicate "nds32_rimm6u_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "nds32_imm6u_operand")))
+
 (define_predicate "nds32_move_operand"
   (and (match_operand 0 "general_operand")
        (not (match_code "high,const,symbol_ref,label_ref")))
@@ -57,12 +125,121 @@
   return true;
 })

+(define_predicate "nds32_vmove_operand"
+  (and (match_operand 0 "general_operand")
+       (not (match_code "high,const,symbol_ref,label_ref")))
+{
+  /* If the constant vector op satisfies neither CVs2 nor CVhi,
+     we can not perform move behavior by a single instruction.  */
+  if (GET_CODE (op) == CONST_VECTOR
+      && !satisfies_constraint_CVs2 (op)
+      && !satisfies_constraint_CVhi (op))
+    return false;
+
+  return true;
+})
+
+(define_predicate "nds32_and_operand"
+  (match_code "reg,const_int")
+{
+  return (REG_P (op) && GET_MODE (op) == mode)
+	 || satisfies_constraint_Izeb (op)
+	 || satisfies_constraint_Izeh (op)
+	 || satisfies_constraint_Ixls (op)
+	 || satisfies_constraint_Ix11 (op)
+	 || satisfies_constraint_Ibms (op)
+	 || satisfies_constraint_Ifex (op)
+	 || satisfies_constraint_Iu15 (op)
+	 || satisfies_constraint_Ii15 (op)
+	 || satisfies_constraint_Ic15 (op);
+})
+
+(define_predicate "nds32_ior_operand"
+  (match_code "reg,const_int")
+{
+  return (REG_P (op) && GET_MODE (op) == mode)
+	 || satisfies_constraint_Iu15 (op)
+	 || satisfies_constraint_Ie15 (op);
+})
+
+(define_predicate "nds32_xor_operand"
+  (match_code "reg,const_int")
+{
+  return (REG_P (op) && GET_MODE (op) == mode)
+	 || GET_CODE (op) == SUBREG
+	 || satisfies_constraint_Iu15 (op)
+	 || satisfies_constraint_It15 (op);
+})
+
+(define_predicate "nds32_general_register_operand"
+  (match_code "reg,subreg")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  return (REG_P (op)
+	  && (REGNO (op) >= FIRST_PSEUDO_REGISTER
+	      || REGNO (op) <= NDS32_LAST_GPR_REGNUM));
+})
+
+(define_predicate "nds32_fpu_register_operand"
+  (match_code "reg,subreg")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  return (REG_P (op)
+	  && NDS32_IS_FPR_REGNUM (REGNO (op)));
+})
+
+(define_predicate "fpu_reg_or_memory_operand"
+  (ior (match_operand 0 "nds32_fpu_register_operand")
+       (match_operand 0 "memory_operand")))
+
+(define_predicate "nds32_call_address_operand"
+  (ior (match_operand 0 "nds32_symbolic_operand")
+       (match_operand 0 "nds32_general_register_operand")))
+
+(define_predicate "nds32_insv_operand"
+  (match_code "const_int")
+{
+  return INTVAL (op) == 0
+	 || INTVAL (op) == 8
+	 || INTVAL (op) == 16
+	 || INTVAL (op) == 24;
+})
+
+(define_predicate "nds32_lmw_smw_base_operand"
+  (and (match_code "mem")
+       (match_test "nds32_valid_smw_lwm_base_p (op)")))
+
+(define_predicate "float_even_register_operand"
+  (and (match_code "reg")
+       (and (match_test "REGNO (op) >= NDS32_FIRST_FPR_REGNUM")
+	    (match_test "REGNO (op) <= NDS32_LAST_FPR_REGNUM")
+	    (match_test "(REGNO (op) & 1) == 0"))))
+
+(define_predicate "float_odd_register_operand"
+  (and (match_code "reg")
+       (and (match_test "REGNO (op) >= NDS32_FIRST_FPR_REGNUM")
+	    (match_test "REGNO (op) <= NDS32_LAST_FPR_REGNUM")
+	    (match_test "(REGNO (op) & 1) != 0"))))
+
 (define_special_predicate "nds32_load_multiple_operation"
   (match_code "parallel")
 {
   /* To verify 'load' operation, pass 'true' for the second argument.
      See the implementation in nds32.c for details.  */
-  return nds32_valid_multiple_load_store (op, true);
+  return nds32_valid_multiple_load_store_p (op, true, false);
+})
+
+(define_special_predicate "nds32_load_multiple_and_update_address_operation"
+  (match_code "parallel")
+{
+  /* To verify 'load' operation, pass 'true' for the second argument,
+     to verify 'update address' operation, pass 'true' for the third argument.
+     See the implementation in nds32.c for details.  */
+  return nds32_valid_multiple_load_store_p (op, true, true);
 })

 (define_special_predicate "nds32_store_multiple_operation"
@@ -70,7 +247,16 @@
 {
   /* To verify 'store' operation, pass 'false' for the second argument.
      See the implementation in nds32.c for details.  */
-  return nds32_valid_multiple_load_store (op, false);
+  return nds32_valid_multiple_load_store_p (op, false, false);
+})
+
+(define_special_predicate "nds32_store_multiple_and_update_address_operation"
+  (match_code "parallel")
+{
+  /* To verify 'store' operation, pass 'false' for the second argument,
+     to verify 'update address' operation, pass 'true' for the third argument.
+     See the implementation in nds32.c for details.  */
+  return nds32_valid_multiple_load_store_p (op, false, true);
 })

 (define_special_predicate "nds32_stack_push_operation"
diff --git a/gcc/config/nds32/t-elf b/gcc/config/nds32/t-elf
new file mode 100644
index 0000000..a63a310
--- /dev/null
+++ b/gcc/config/nds32/t-elf
@@ -0,0 +1,42 @@
+# The multilib settings of Andes NDS32 cpu for GNU compiler
+# Copyright (C) 2012-2016 Free Software Foundation, Inc.
+# Contributed by Andes Technology Corporation.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3, or (at your
+# option) any later version.
+#
+# GCC is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+# License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# We also define a macro MULTILIB_DEFAULTS in nds32.h that tells the
+# driver program which options are defaults for this target and thus
+# do not need to be handled specially.
+MULTILIB_OPTIONS += mcmodel=small/mcmodel=medium/mcmodel=large mvh
+
+ifneq ($(filter graywolf,$(TM_MULTILIB_CONFIG)),)
+MULTILIB_OPTIONS += mcpu=graywolf
+endif
+
+ifneq ($(filter dsp,$(TM_MULTILIB_CONFIG)),)
+MULTILIB_OPTIONS += mext-dsp
+endif
+
+ifneq ($(filter zol,$(TM_MULTILIB_CONFIG)),)
+MULTILIB_OPTIONS += mext-zol
+endif
+
+ifneq ($(filter v3m+,$(TM_MULTILIB_CONFIG)),)
+MULTILIB_OPTIONS += march=v3m+
+endif
+
+# ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/t-mlibs b/gcc/config/nds32/t-linux
similarity index 94%
rename from gcc/config/nds32/t-mlibs
rename to gcc/config/nds32/t-linux
index 5cb13f7..a4d8ab3 100644
--- a/gcc/config/nds32/t-mlibs
+++ b/gcc/config/nds32/t-linux
@@ -21,6 +21,6 @@
 # We also define a macro MULTILIB_DEFAULTS in nds32.h that tells the
 # driver program which options are defaults for this target and thus
 # do not need to be handled specially.
-MULTILIB_OPTIONS = mcmodel=small/mcmodel=medium/mcmodel=large
+MULTILIB_OPTIONS +=

 # ------------------------------------------------------------------------
diff --git a/gcc/config/nds32/t-nds32 b/gcc/config/nds32/t-nds32
index cf3aea6..e34b844 100644
--- a/gcc/config/nds32/t-nds32
+++ b/gcc/config/nds32/t-nds32
@@ -1,51 +1,294 @@
-# General rules that all nds32/ targets must have.
+# Dependency rules of Andes NDS32 cpu for GNU compiler
 # Copyright (C) 2012-2016 Free Software Foundation, Inc.
 # Contributed by Andes Technology Corporation.
 #
 # This file is part of GCC.
 #
-# GCC is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
+# GCC is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3, or (at your
+# option) any later version.
 #
-# GCC is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+# GCC is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+# License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.

-nds32-cost.o: $(srcdir)/config/nds32/nds32-cost.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)

-nds32-intrinsic.o: $(srcdir)/config/nds32/nds32-intrinsic.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-md-auxiliary.o: $(srcdir)/config/nds32/nds32-md-auxiliary.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-md-auxiliary.c

-nds32-isr.o: $(srcdir)/config/nds32/nds32-isr.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-memory-manipulation.o: $(srcdir)/config/nds32/nds32-memory-manipulation.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-memory-manipulation.c

-nds32-md-auxiliary.o: $(srcdir)/config/nds32/nds32-md-auxiliary.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-predicates.o: $(srcdir)/config/nds32/nds32-predicates.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-predicates.c

-nds32-pipelines-auxiliary.o: $(srcdir)/config/nds32/nds32-pipelines-auxiliary.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-intrinsic.o: $(srcdir)/config/nds32/nds32-intrinsic.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-intrinsic.c

-nds32-predicates.o: $(srcdir)/config/nds32/nds32-predicates.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-pipelines-auxiliary.o: \
+  $(srcdir)/config/nds32/nds32-pipelines-auxiliary.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-pipelines-auxiliary.c

-nds32-memory-manipulation.o: $(srcdir)/config/nds32/nds32-memory-manipulation.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-isr.o: \
+  $(srcdir)/config/nds32/nds32-isr.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-isr.c

-nds32-fp-as-gp.o: $(srcdir)/config/nds32/nds32-fp-as-gp.c
-	$(COMPILE) $<
-	$(POSTCOMPILE)
+nds32-cost.o: \
+  $(srcdir)/config/nds32/nds32-cost.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-cost.c
+
+nds32-fp-as-gp.o: \
+  $(srcdir)/config/nds32/nds32-fp-as-gp.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-fp-as-gp.c
+
+nds32-load-store-opt.o: \
+  $(srcdir)/config/nds32/nds32-load-store-opt.c \
+  $(srcdir)/config/nds32/nds32-load-store-opt.h \
+  $(srcdir)/config/nds32/nds32-reg-utils.h \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-load-store-opt.c
+
+nds32-soft-fp-comm.o: \
+  $(srcdir)/config/nds32/nds32-soft-fp-comm.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-soft-fp-comm.c
+
+nds32-regrename.o: \
+  $(srcdir)/config/nds32/nds32-regrename.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-regrename.c
+
+nds32-gcse.o: \
+  $(srcdir)/config/nds32/nds32-gcse.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-gcse.c
+
+nds32-relax-opt.o: \
+  $(srcdir)/config/nds32/nds32-relax-opt.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-relax-opt.c
+
+nds32-cprop-acc.o: \
+  $(srcdir)/config/nds32/nds32-cprop-acc.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-cprop-acc.c
+
+nds32-sign-conversion.o: \
+  $(srcdir)/config/nds32/nds32-sign-conversion.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(GIMPLE_H) $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-sign-conversion.c
+
+nds32-scalbn-transform.o: \
+  $(srcdir)/config/nds32/nds32-scalbn-transform.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(GIMPLE_H) $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-scalbn-transform.c
+
+nds32-abi-compatible.o: \
+  $(srcdir)/config/nds32/nds32-abi-compatible.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(GIMPLE_H) $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-abi-compatible.c
+
+nds32-lmwsmw.o: \
+  $(srcdir)/config/nds32/nds32-lmwsmw.c \
+  $(srcdir)/config/nds32/nds32-load-store-opt.h \
+  $(srcdir)/config/nds32/nds32-reg-utils.h \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-lmwsmw.c
+
+nds32-reg-utils.o: \
+  $(srcdir)/config/nds32/nds32-reg-utils.c \
+  $(srcdir)/config/nds32/nds32-reg-utils.h \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-reg-utils.c
+
+nds32-const-remater.o: \
+  $(srcdir)/config/nds32/nds32-const-remater.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-const-remater.c
+
+nds32-utils.o: \
+  $(srcdir)/config/nds32/nds32-utils.c \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+  $(RTL_H) $(TREE_H) $(HASH_TABLE_H) $(OBSTACK_H) $(REGS_H) hard-reg-set.h \
+  insn-config.h conditions.h output.h dumpfile.h \
+  $(INSN_ATTR_H) $(FLAGS_H) reload.h $(FUNCTION_H) \
+  $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
+  $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
+  $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/nds32/nds32-utils.c
diff --git a/gcc/configure b/gcc/configure
index 954673c..ca21885 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -27327,7 +27327,7 @@ esac
 # version to the per-target configury.
 case "$cpu_type" in
   aarch64 | alpha | arm | avr | bfin | cris | i386 | m32c | m68k | microblaze \
-  | mips | nios2 | pa | rs6000 | score | sparc | spu | tilegx | tilepro \
+  | mips | nds32 | nios2 | pa | rs6000 | score | sparc | spu | tilegx | tilepro \
   | visium | xstormy16 | xtensa)
     insn="nop"
     ;;
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 4c65d44..d7a5efc 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -4667,7 +4667,7 @@ esac
 # version to the per-target configury.
 case "$cpu_type" in
   aarch64 | alpha | arm | avr | bfin | cris | i386 | m32c | m68k | microblaze \
-  | mips | nios2 | pa | rs6000 | score | sparc | spu | tilegx | tilepro \
+  | mips | nds32 | nios2 | pa | rs6000 | score | sparc | spu | tilegx | tilepro \
   | visium | xstormy16 | xtensa)
     insn="nop"
     ;;
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index ee2715d..37fa3b5 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -13587,38 +13587,33 @@ builtin is exact.

 These built-in functions are available for the NDS32 target:

-@deftypefn {Built-in Function} void __builtin_nds32_isync (int *@var{addr})
+@table @code
+@item void __builtin_nds32_isync (int *@var{addr})
 Insert an ISYNC instruction into the instruction stream where
 @var{addr} is an instruction address for serialization.
-@end deftypefn

-@deftypefn {Built-in Function} void __builtin_nds32_isb (void)
+@item void __builtin_nds32_isb (void)
 Insert an ISB instruction into the instruction stream.
-@end deftypefn

-@deftypefn {Built-in Function} int __builtin_nds32_mfsr (int @var{sr})
+@item int __builtin_nds32_mfsr (int @var{sr})
 Return the content of a system register which is mapped by @var{sr}.
-@end deftypefn

-@deftypefn {Built-in Function} int __builtin_nds32_mfusr (int @var{usr})
+@item int __builtin_nds32_mfusr (int @var{usr})
 Return the content of a user space register which is mapped by @var{usr}.
-@end deftypefn

-@deftypefn {Built-in Function} void __builtin_nds32_mtsr (int @var{value}, int @var{sr})
+@item void __builtin_nds32_mtsr (int @var{value}, int @var{sr})
 Move the @var{value} to a system register which is mapped by @var{sr}.
-@end deftypefn

-@deftypefn {Built-in Function} void __builtin_nds32_mtusr (int @var{value}, int @var{usr})
+@item void __builtin_nds32_mtusr (int @var{value}, int @var{usr})
 Move the @var{value} to a user space register which is mapped by @var{usr}.
-@end deftypefn

-@deftypefn {Built-in Function} void __builtin_nds32_setgie_en (void)
+@item void __builtin_nds32_setgie_en (void)
 Enable global interrupt.
-@end deftypefn

-@deftypefn {Built-in Function} void __builtin_nds32_setgie_dis (void)
+@item void __builtin_nds32_setgie_dis (void)
 Disable global interrupt.
-@end deftypefn
+
+@end table

 @node picoChip Built-in Functions
 @subsection picoChip Built-in Functions
diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index b60b53a..fc23722 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -2109,7 +2109,7 @@ supported since version 4.7.2 and is the default in 4.8.0 and newer.

 @item --with-nds32-lib=@var{library}
 Specifies that @var{library} setting is used for building @file{libgcc.a}.
-Currently, the valid @var{library} is @samp{newlib} or @samp{mculib}.
+Currently, the valid @var{library} values are @samp{newlib} and @samp{mculib}.
 This option is only supported for the NDS32 target.

 @item --with-build-time-tools=@var{dir}
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2ed9285..75e0042 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -904,13 +904,19 @@ Objective-C and Objective-C++ Dialects}.
 -mreduced-regs -mfull-regs @gol
 -mcmov -mno-cmov @gol
 -mperf-ext -mno-perf-ext @gol
+-mperf2-ext -mno-perf2-ext @gol
+-mstring-ext -mno-string-ext @gol
 -mv3push -mno-v3push @gol
 -m16bit -mno-16bit @gol
+-mgp-direct -mno-gp-direct @gol
 -misr-vector-size=@var{num} @gol
 -mcache-block-size=@var{num} @gol
 -march=@var{arch} @gol
--mcmodel=@var{code-model} @gol
--mctor-dtor -mrelax}
+-mcpu=@var{cpu} @gol
+-mmemory-model=@var{model} @gol
+-mconfig-register-ports=@var{ports} @gol
+-mforce-fp-as-gp -mforbid-fp-as-gp @gol
+-mex9 -mctor-dtor -mrelax}

 @emph{Nios II Options}
 @gccoptlist{-G @var{num} -mgpopt=@var{option} -mgpopt -mno-gpopt @gol
@@ -5006,7 +5012,7 @@ example, warn if an unsigned variable is compared against zero with
 @opindex Wbad-function-cast
 @opindex Wno-bad-function-cast
 Warn when a function call is cast to a non-matching type.
-For example, warn if a call to a function returning an integer type
+For example, warn if a call to a function returning an integer type
 is cast to a pointer type.

 @item -Wc90-c99-compat @r{(C and Objective-C only)}
@@ -19089,6 +19095,22 @@ Generate performance extension instructions.
 @opindex mno-perf-ext
 Do not generate performance extension instructions.

+@item -mperf2-ext
+@opindex mperf2-ext
+Generate performance extension version 2 instructions.
+
+@item -mno-perf2-ext
+@opindex mno-perf2-ext
+Do not generate performance extension version 2 instructions.
+
+@item -mstring-ext
+@opindex mstring-ext
+Generate string extension instructions.
+
+@item -mno-string-ext
+@opindex mno-string-ext
+Do not generate string extension instructions.
+
 @item -mv3push
 @opindex mv3push
 Generate v3 push25/pop25 instructions.
@@ -19105,6 +19127,14 @@ Generate 16-bit instructions.
 @opindex mno-16-bit
 Do not generate 16-bit instructions.

+@item -mgp-direct
+@opindex mgp-direct
+Generate GP base instructions directly.
+
+@item -mno-gp-direct
+@opindex mno-gp-direct
+Do not generate GP base instructions directly.
+
 @item -misr-vector-size=@var{num}
 @opindex misr-vector-size
 Specify the size of each interrupt vector, which must be 4 or 16.
@@ -19118,20 +19148,33 @@ which must be a power of 2 between 4 and 512.
 @opindex march
 Specify the name of the target architecture.

-@item -mcmodel=@var{code-model}
-@opindex mcmodel
-Set the code model to one of
-@table @asis
-@item @samp{small}
-All the data and read-only data segments must be within 512KB addressing space.
-The text segment must be within 16MB addressing space.
-@item @samp{medium}
-The data segment must be within 512KB while the read-only data segment can be
-within 4GB addressing space.  The text segment should be still within 16MB
-addressing space.
-@item @samp{large}
-All the text and data segments can be within 4GB addressing space.
-@end table
+@item -mcpu=@var{cpu}
+@opindex mcpu
+Specify the CPU for the pipeline model.
+
+@item -mmemory-model=@var{model}
+@opindex mmemory-model
+Specify fast or slow memory model.
+
+@item -mconfig-register-ports=@var{ports}
+@opindex mconfig-register-ports
+Specify the number of read/write ports for n9/n10 cores.
+The value should be @samp{3r2w} or @samp{2r1w}.
+
+@item -mforce-fp-as-gp
+@opindex mforce-fp-as-gp
+Prevent $fp from being allocated during register allocation so that the
+compiler is able to force the fp-as-gp optimization.
+
+@item -mforbid-fp-as-gp
+@opindex mforbid-fp-as-gp
+Forbid using $fp to access static and global variables.
+This option strictly forbids fp-as-gp optimization
+regardless of @option{-mforce-fp-as-gp}.
+
+@item -mex9
+@opindex mex9
+Use special directives to guide the linker in performing ex9 optimization.

 @item -mctor-dtor
 @opindex mctor-dtor
@@ -19159,55 +19202,15 @@ Put global and static objects less than or equal to @var{num} bytes
 into the small data or BSS sections instead of the normal data or BSS
 sections.  The default value of @var{num} is 8.

-@item -mgpopt=@var{option}
 @item -mgpopt
 @itemx -mno-gpopt
 @opindex mgpopt
 @opindex mno-gpopt
-Generate (do not generate) GP-relative accesses.  The following
-@var{option} names are recognized:
-
-@table @samp
-
-@item none
-Do not generate GP-relative accesses.
-
-@item local
-Generate GP-relative accesses for small data objects that are not
-external, weak, or uninitialized common symbols.
-Also use GP-relative addressing for objects that
-have been explicitly placed in a small data section via a @code{section}
-attribute.
-
-@item global
-As for @samp{local}, but also generate GP-relative accesses for
-small data objects that are external, weak, or common.  If you use this option,
-you must ensure that all parts of your program (including libraries) are
-compiled with the same @option{-G} setting.
-
-@item data
-Generate GP-relative accesses for all data objects in the program.  If you
-use this option, the entire data and BSS segments
-of your program must fit in 64K of memory and you must use an appropriate
-linker script to allocate them within the addressable range of the
-global pointer.
-
-@item all
-Generate GP-relative addresses for function pointers as well as data
-pointers.  If you use this option, the entire text, data, and BSS segments
-of your program must fit in 64K of memory and you must use an appropriate
-linker script to allocate them within the addressable range of the
-global pointer.
-
-@end table
-
-@option{-mgpopt} is equivalent to @option{-mgpopt=local}, and
-@option{-mno-gpopt} is equivalent to @option{-mgpopt=none}.
-
-The default is @option{-mgpopt} except when @option{-fpic} or
-@option{-fPIC} is specified to generate position-independent code.
-Note that the Nios II ABI does not permit GP-relative accesses from
-shared libraries.
+Generate (do not generate) GP-relative accesses for objects in the
+small data or BSS sections.  The default is @option{-mgpopt} except
+when @option{-fpic} or @option{-fPIC} is specified to generate
+position-independent code.  Note that the Nios II ABI does not permit
+GP-relative accesses from shared libraries.

 You may need to specify @option{-mno-gpopt} explicitly when building
 programs that include large amounts of small data, including large
diff --git a/gcc/gcc.c b/gcc/gcc.c
index 0f042b0..5c43f33 100644
--- a/gcc/gcc.c
+++ b/gcc/gcc.c
@@ -1288,7 +1288,7 @@ static const struct compiler default_compilers[] =
   {".zip", "#Java", 0, 0, 0}, {".jar", "#Java", 0, 0, 0},
   {".go", "#Go", 0, 1, 0},
   /* Next come the entries for C.  */
-  {".c", "@c", 0, 0, 1},
+  {".c", "@nds32_c", 0, 0, 1},
   {"@c",
    /* cc1 has an integrated ISO C preprocessor.  We should invoke the
       external preprocessor if -save-temps is given.  */
@@ -1303,6 +1303,38 @@ static const struct compiler default_compilers[] =
       %{!save-temps*:%{!traditional-cpp:%{!no-integrated-cpp:\
 	  cc1 %(cpp_unique_options) %(cc1_options)}}}\
       %{!fsyntax-only:%(invoke_as)}}}}", 0, 0, 1},
+  {"@nds32_c",
+   /* cc1 has an integrated ISO C preprocessor.  We should invoke the
+      external preprocessor if -save-temps is given.  */
+     "%{E|M|MM:%(trad_capable_cpp) %(cpp_options) %(cpp_debug_options)}\
+      %{mace:\
+	  %{!E:%{!M:%{!MM:\
+	      %{traditional:\
+%eGNU C no longer supports -traditional without -E}\
+	  %{save-temps*|traditional-cpp|no-integrated-cpp:%(trad_capable_cpp) \
+	      %(cpp_options) -o %{save-temps*:%b.i} %{!save-temps*:%g.i} \n\
+		cs2 %{mace-s2s*} %{save-temps*:%b.i} %{!save-temps*:%g.i} \
+		    -o %{save-temps*:%b.ace.i} %{!save-temps*:%g.ace.i} --\n\
+		cc1 -fpreprocessed %{save-temps*:%b.ace.i} %{!save-temps*:%g.ace.i} \
+	      %(cc1_options)}\
+	  %{!save-temps*:%{!traditional-cpp:%{!no-integrated-cpp:\
+	      %(trad_capable_cpp) %(cpp_options) -o %u.i\n}}}\
+	  %{!save-temps*:%{!traditional-cpp:%{!no-integrated-cpp:\
+	      cs2 %{mace-s2s*} %U.i -o %u.ace.i --\n}}}\
+	  %{!save-temps*:%{!traditional-cpp:%{!no-integrated-cpp:\
+	      cc1 -fpreprocessed %U.ace.i %(cc1_options)}}}\
+	  %{!fsyntax-only:%(invoke_as)}}}}}\
+      %{!mace:\
+	  %{!E:%{!M:%{!MM:\
+	      %{traditional:\
+%eGNU C no longer supports -traditional without -E}\
+	  %{save-temps*|traditional-cpp|no-integrated-cpp:%(trad_capable_cpp) \
+	      %(cpp_options) -o %{save-temps*:%b.i} %{!save-temps*:%g.i} \n\
+		cc1 -fpreprocessed %{save-temps*:%b.i} %{!save-temps*:%g.i} \
+	      %(cc1_options)}\
+	  %{!save-temps*:%{!traditional-cpp:%{!no-integrated-cpp:\
+	      cc1 %(cpp_unique_options) %(cc1_options)}}}\
+	  %{!fsyntax-only:%(invoke_as)}}}}}", 0, 0, 1},
   {"-",
    "%{!E:%e-E or -x required when input is from standard input}\
     %(trad_capable_cpp) %(cpp_options) %(cpp_debug_options)", 0, 0, 0},
diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c
index 4d26e2f..60f934c 100644
--- a/gcc/loop-unroll.c
+++ b/gcc/loop-unroll.c
@@ -1132,7 +1132,9 @@ decide_unroll_stupid (struct loop *loop, int flags)
      of mispredicts.
      TODO: this heuristic needs tunning; call inside the loop body
      is also relatively good reason to not unroll.  */
-  if (num_loop_branches (loop) > 1)
+  unsigned branch_count = PARAM_VALUE (PARAM_MAX_LOOP_UNROLL_BRANCH);
+
+  if (num_loop_branches (loop) > branch_count)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
diff --git a/gcc/opt-read.awk b/gcc/opt-read.awk
index b304ccb..2e6e8df 100644
--- a/gcc/opt-read.awk
+++ b/gcc/opt-read.awk
@@ -99,6 +99,7 @@ BEGIN {
 			val_flags = "0"
 			val_flags = val_flags \
 			  test_flag("Canonical", props, "| CL_ENUM_CANONICAL") \
+			  test_flag("Undocumented", props, "| CL_UNDOCUMENTED") \
 			  test_flag("DriverOnly", props, "| CL_ENUM_DRIVER_ONLY")
 			enum_data[enum_name] = enum_data[enum_name] \
 			  "  { " quote string quote ", " value ", " val_flags \
diff --git a/gcc/opts.c b/gcc/opts.c
index 0f9431a..da75332 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1271,6 +1271,10 @@ print_filtered_help (unsigned int include_flags,
 	{
 	  unsigned int len = strlen (cl_enums[i].values[j].arg);

+	  /* Skip undocumented enum values.  */
+	  if (cl_enums[i].values[j].flags & CL_UNDOCUMENTED)
+	     continue;
+
 	  if (pos > 4 && pos + 1 + len <= columns)
 	    {
 	      printf (" %s", cl_enums[i].values[j].arg);
diff --git a/gcc/params.def b/gcc/params.def
index dbff305..44847b3 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -297,6 +297,11 @@ DEFPARAM(PARAM_MAX_UNROLL_TIMES,
 	"max-unroll-times",
 	"The maximum number of unrollings of a single loop.",
 	8, 0, 0)
+/* Maximum number of branches a loop may contain and still be unrolled.  */
+DEFPARAM (PARAM_MAX_LOOP_UNROLL_BRANCH,
+         "max-unroll-loop-branch",
+         "Maximum number of loop branch count",
+         1, 1, 20)
 /* The maximum number of insns of a peeled loop.  */
 DEFPARAM(PARAM_MAX_PEELED_INSNS,
 	"max-peeled-insns",
diff --git a/gcc/testsuite/g++.dg/init/array15.C b/gcc/testsuite/g++.dg/init/array15.C
index 17160d0..280fe69 100644
--- a/gcc/testsuite/g++.dg/init/array15.C
+++ b/gcc/testsuite/g++.dg/init/array15.C
@@ -1,4 +1,6 @@
 // { dg-do run }
+// { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } }
+// { dg-options "-mcmodel=large" { target nds32*-*-elf* } }

 // Copyright (C) 2004 Free Software Foundation, Inc.
 // Contributed by Nathan Sidwell 8 Dec 2004 <nathan@codesourcery.com>
diff --git a/gcc/testsuite/g++.dg/init/array16.C b/gcc/testsuite/g++.dg/init/array16.C
index 188d1a8..83c0d47 100644
--- a/gcc/testsuite/g++.dg/init/array16.C
+++ b/gcc/testsuite/g++.dg/init/array16.C
@@ -2,6 +2,7 @@
 // have "compile" for some targets and "run" for others.
 // { dg-do run { target { ! mmix-*-* } } }
 // { dg-options "-mstructure-size-boundary=8" { target arm*-*-* } }
+// { dg-skip-if "" { nds32_gp_direct } }

 // Copyright (C) 2004 Free Software Foundation, Inc.
 // Contributed by Nathan Sidwell 8 Dec 2004 <nathan@codesourcery.com>
diff --git a/gcc/testsuite/g++.dg/torture/type-generic-1.C b/gcc/testsuite/g++.dg/torture/type-generic-1.C
index 4d82592..5ae789c 100644
--- a/gcc/testsuite/g++.dg/torture/type-generic-1.C
+++ b/gcc/testsuite/g++.dg/torture/type-generic-1.C
@@ -4,6 +4,7 @@
 /* { dg-do run } */
 /* { dg-add-options ieee } */
 /* { dg-skip-if "No Inf/NaN support" { spu-*-* } } */
+/* { dg-skip-if "No Denormalized support" { nds32_ext_fpu } } */

 #include "../../gcc.dg/tg-tests.h"

diff --git a/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c b/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
index 228c5d9..d2d3e51 100644
--- a/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
+++ b/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "too complex for avr" { avr-*-* } { "*" } { "" } } */
+/* { dg-skip-if "lto may cause internal compiler error on cygwin with gcc-4.9" { nds32*-*-* } { "*" } { "" } } */
 /* { dg-skip-if "ptxas times out" { nvptx-*-* } { "*" } { "" } } */
 /* { dg-timeout-factor 4.0 } */
 #define LIM1(x) x##0, x##1, x##2, x##3, x##4, x##5, x##6, x##7, x##8, x##9,
diff --git a/gcc/testsuite/gcc.c-torture/execute/20010122-1.c b/gcc/testsuite/gcc.c-torture/execute/20010122-1.c
index 4eeb8c7..6cd02bc 100644
--- a/gcc/testsuite/gcc.c-torture/execute/20010122-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/20010122-1.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "requires frame pointers" { *-*-* } "-fomit-frame-pointer" "" } */
+/* { dg-additional-options "-malways-save-lp" { target nds32*-*-* } } */
 /* { dg-require-effective-target return_address } */

 extern void exit (int);
diff --git a/gcc/testsuite/gcc.c-torture/execute/920501-8.x b/gcc/testsuite/gcc.c-torture/execute/920501-8.x
new file mode 100644
index 0000000..96f05bc
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/920501-8.x
@@ -0,0 +1,11 @@
+# Please see Andes Bugzilla #11005 for the details.
+if { [istarget "nds32*-*-*"] } {
+	# The nds32 mculib toolchains require
+	# "-u_printf_float" and "-u_scanf_float" options
+	# to fully support printf and scanf functionality.
+	# These options are supposed to be harmless to newlib toolchain.
+	set additional_flags "-u_printf_float -u_scanf_float"
+}
+
+return 0
+
diff --git a/gcc/testsuite/gcc.c-torture/execute/930513-1.x b/gcc/testsuite/gcc.c-torture/execute/930513-1.x
new file mode 100644
index 0000000..96f05bc
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/930513-1.x
@@ -0,0 +1,11 @@
+# Please see Andes Bugzilla #11005 for the details.
+if { [istarget "nds32*-*-*"] } {
+	# The nds32 mculib toolchains require
+	# "-u_printf_float" and "-u_scanf_float" options
+	# to fully support printf and scanf functionality.
+	# These options are supposed to be harmless to newlib toolchain.
+	set additional_flags "-u_printf_float -u_scanf_float"
+}
+
+return 0
+
diff --git a/gcc/testsuite/gcc.c-torture/execute/ieee/ieee.exp b/gcc/testsuite/gcc.c-torture/execute/ieee/ieee.exp
index 009984e..19cfcca 100644
--- a/gcc/testsuite/gcc.c-torture/execute/ieee/ieee.exp
+++ b/gcc/testsuite/gcc.c-torture/execute/ieee/ieee.exp
@@ -30,6 +30,10 @@ load_lib c-torture.exp
 # Disable tests on machines with no hardware support for IEEE arithmetic.
 if { [istarget "vax-*-*"] || [ istarget "powerpc-*-*spe"] || [istarget "pdp11-*-*"] } { return }

+# Since we cannot use dg-skip-if or dg-require-effective-target for individual
+# test cases in the ieee category, we disable all ieee tests on nds32 fpu toolchains.
+if { [istarget "nds32*-*-*"] && [check_effective_target_nds32_ext_fpu] } { return }
+
 if $tracelevel then {
     strace $tracelevel
 }
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr60822.c b/gcc/testsuite/gcc.c-torture/execute/pr60822.c
index dcd2447..a305df3 100644
--- a/gcc/testsuite/gcc.c-torture/execute/pr60822.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr60822.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target int32plus } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */
 struct X {
     char fill0[800000];
     int a;
diff --git a/gcc/testsuite/gcc.c-torture/execute/struct-ret-1.x b/gcc/testsuite/gcc.c-torture/execute/struct-ret-1.x
new file mode 100644
index 0000000..96f05bc
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/struct-ret-1.x
@@ -0,0 +1,11 @@
+# Please see Andes Bugzilla #11005 for the details.
+if { [istarget "nds32*-*-*"] } {
+	# The nds32 mculib toolchains require
+	# "-u_printf_float" and "-u_scanf_float" options
+	# to fully support printf and scanf functionality.
+	# These options are supposed to be harmless to newlib toolchain.
+	set additional_flags "-u_printf_float -u_scanf_float"
+}
+
+return 0
+
diff --git a/gcc/testsuite/gcc.dg/constructor-1.c b/gcc/testsuite/gcc.dg/constructor-1.c
index 73e9fc3..827987e 100644
--- a/gcc/testsuite/gcc.dg/constructor-1.c
+++ b/gcc/testsuite/gcc.dg/constructor-1.c
@@ -1,6 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2" } */
 /* { dg-skip-if "" { ! global_constructor } { "*" } { "" } } */
+/* { dg-options "-O2 -mctor-dtor" { target { nds32*-*-* } } } */

 /* The ipa-split pass pulls the body of the if(!x) block
    into a separate function to make foo a better inlining
diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-0.c b/gcc/testsuite/gcc.dg/graphite/interchange-0.c
index d56be46..b83535c 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-0.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-0.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-1.c b/gcc/testsuite/gcc.dg/graphite/interchange-1.c
index b65d486..2d77f0e 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-1.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-1.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 /* Formerly known as ltrans-1.c */

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-10.c b/gcc/testsuite/gcc.dg/graphite/interchange-10.c
index a955644..2021de2 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-10.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-10.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-11.c b/gcc/testsuite/gcc.dg/graphite/interchange-11.c
index 6102822..5abb316 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-11.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-11.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-15.c b/gcc/testsuite/gcc.dg/graphite/interchange-15.c
index 7410f29..1f71f06 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-15.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-15.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-2.c b/gcc/testsuite/gcc.dg/graphite/interchange-2.c
index 936ee00..0041649 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-2.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-2.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 /* Formerly known as ltrans-2.c */

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-3.c b/gcc/testsuite/gcc.dg/graphite/interchange-3.c
index 4aec824..6635529 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-3.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-3.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 /* Formerly known as ltrans-3.c */

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-4.c b/gcc/testsuite/gcc.dg/graphite/interchange-4.c
index 463ecb5..359f0ac 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-4.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-4.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 /* Formerly known as ltrans-4.c */

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-5.c b/gcc/testsuite/gcc.dg/graphite/interchange-5.c
index e5aaa64..892257e 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-5.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-5.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 /* Formerly known as ltrans-5.c */

diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-mvt.c b/gcc/testsuite/gcc.dg/graphite/interchange-mvt.c
index c6543ec..51c6ee5 100644
--- a/gcc/testsuite/gcc.dg/graphite/interchange-mvt.c
+++ b/gcc/testsuite/gcc.dg/graphite/interchange-mvt.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/pr46185.c b/gcc/testsuite/gcc.dg/graphite/pr46185.c
index 36d46a4..738c9a8 100644
--- a/gcc/testsuite/gcc.dg/graphite/pr46185.c
+++ b/gcc/testsuite/gcc.dg/graphite/pr46185.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
 /* { dg-options "-O2 -floop-interchange -ffast-math -fno-ipa-cp" } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/uns-interchange-15.c b/gcc/testsuite/gcc.dg/graphite/uns-interchange-15.c
index fe2669f..dd77aa3 100644
--- a/gcc/testsuite/gcc.dg/graphite/uns-interchange-15.c
+++ b/gcc/testsuite/gcc.dg/graphite/uns-interchange-15.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/graphite/uns-interchange-mvt.c b/gcc/testsuite/gcc.dg/graphite/uns-interchange-mvt.c
index 211c9ab..c7defb4 100644
--- a/gcc/testsuite/gcc.dg/graphite/uns-interchange-mvt.c
+++ b/gcc/testsuite/gcc.dg/graphite/uns-interchange-mvt.c
@@ -1,4 +1,6 @@
 /* { dg-require-effective-target size32plus } */
+/* { dg-require-effective-target nds32_full_addr_space { target nds32*-*-elf* } } */
+/* { dg-additional-options "-mcmodel=large" { target nds32*-*-elf* } } */

 #define DEBUG 0
 #if DEBUG
diff --git a/gcc/testsuite/gcc.dg/initpri1.c b/gcc/testsuite/gcc.dg/initpri1.c
index 794ea2b..10b3a24 100644
--- a/gcc/testsuite/gcc.dg/initpri1.c
+++ b/gcc/testsuite/gcc.dg/initpri1.c
@@ -1,4 +1,5 @@
 /* { dg-do run { target init_priority } } */
+/* { dg-options "-mctor-dtor" { target { nds32*-*-* } } } */

 extern void abort ();

diff --git a/gcc/testsuite/gcc.dg/initpri2.c b/gcc/testsuite/gcc.dg/initpri2.c
index fa9fda0..1418411 100644
--- a/gcc/testsuite/gcc.dg/initpri2.c
+++ b/gcc/testsuite/gcc.dg/initpri2.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target init_priority } } */
+/* { dg-options "-mctor-dtor" { target { nds32*-*-* } } } */

 /* Priorities must be in the range [0, 65535].  */
 void c1()
diff --git a/gcc/testsuite/gcc.dg/initpri3.c b/gcc/testsuite/gcc.dg/initpri3.c
index 1633da0..e1b8cf6 100644
--- a/gcc/testsuite/gcc.dg/initpri3.c
+++ b/gcc/testsuite/gcc.dg/initpri3.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target init_priority } } */
 /* { dg-require-effective-target lto } */
 /* { dg-options "-flto -O3" } */
+/* { dg-options "-flto -O3 -mctor-dtor" { target { nds32*-*-* } } } */

 extern void abort ();

diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-sra-1.c b/gcc/testsuite/gcc.dg/ipa/ipa-sra-1.c
index 4db904b..2290d8b 100644
--- a/gcc/testsuite/gcc.dg/ipa/ipa-sra-1.c
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-sra-1.c
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -fipa-sra -fdump-tree-eipa_sra-details"  } */
+/* { dg-additional-options "-u_printf_float -u_scanf_float" { target nds32*-*-* } } */

 struct bovid
 {
diff --git a/gcc/testsuite/gcc.dg/lower-subreg-1.c b/gcc/testsuite/gcc.dg/lower-subreg-1.c
index 47057fe..25439b1 100644
--- a/gcc/testsuite/gcc.dg/lower-subreg-1.c
+++ b/gcc/testsuite/gcc.dg/lower-subreg-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! { mips64 || { aarch64*-*-* arm*-*-* ia64-*-* sparc*-*-* spu-*-* tilegx-*-* } } } } } */
+/* { dg-do compile { target { ! { mips64 || { aarch64*-*-* arm*-*-* ia64-*-* sparc*-*-* spu-*-* tilegx-*-* nds32*-*-* } } } } } */
 /* { dg-options "-O -fdump-rtl-subreg1" } */
 /* { dg-additional-options "-mno-stv" { target ia32 } } */
 /* { dg-skip-if "" { { i?86-*-* x86_64-*-* } && x32 } { "*" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/pr28796-2.c b/gcc/testsuite/gcc.dg/pr28796-2.c
index f56a5d4..fff71bc 100644
--- a/gcc/testsuite/gcc.dg/pr28796-2.c
+++ b/gcc/testsuite/gcc.dg/pr28796-2.c
@@ -2,6 +2,7 @@
 /* { dg-options "-O2 -funsafe-math-optimizations -fno-finite-math-only -DUNSAFE" } */
 /* { dg-add-options ieee } */
 /* { dg-skip-if "No Inf/NaN support" { spu-*-* } } */
+/* { dg-skip-if "No Denormalized support" { nds32_ext_fpu } } */

 #include "tg-tests.h"

diff --git a/gcc/testsuite/gcc.dg/sibcall-10.c b/gcc/testsuite/gcc.dg/sibcall-10.c
index d98b43a..bb0e24c 100644
--- a/gcc/testsuite/gcc.dg/sibcall-10.c
+++ b/gcc/testsuite/gcc.dg/sibcall-10.c
@@ -5,7 +5,7 @@
    Copyright (C) 2002 Free Software Foundation Inc.
    Contributed by Hans-Peter Nilsson  <hp@bitrange.com>  */

-/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* nds32*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
+/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
 /* -mlongcall disables sibcall patterns.  */
 /* { dg-skip-if "" { powerpc*-*-* } { "-mlongcall" } { "" } } */
 /* { dg-options "-O2 -foptimize-sibling-calls" } */
diff --git a/gcc/testsuite/gcc.dg/sibcall-3.c b/gcc/testsuite/gcc.dg/sibcall-3.c
index eafe8dd..f188a18 100644
--- a/gcc/testsuite/gcc.dg/sibcall-3.c
+++ b/gcc/testsuite/gcc.dg/sibcall-3.c
@@ -5,7 +5,7 @@
    Copyright (C) 2002 Free Software Foundation Inc.
    Contributed by Hans-Peter Nilsson  <hp@bitrange.com>  */

-/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* nds32*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
+/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
 /* -mlongcall disables sibcall patterns.  */
 /* { dg-skip-if "" { powerpc*-*-* } { "-mlongcall" } { "" } } */
 /* { dg-options "-O2 -foptimize-sibling-calls" } */
diff --git a/gcc/testsuite/gcc.dg/sibcall-4.c b/gcc/testsuite/gcc.dg/sibcall-4.c
index 1e039c6..a8c844a 100644
--- a/gcc/testsuite/gcc.dg/sibcall-4.c
+++ b/gcc/testsuite/gcc.dg/sibcall-4.c
@@ -5,7 +5,7 @@
    Copyright (C) 2002 Free Software Foundation Inc.
    Contributed by Hans-Peter Nilsson  <hp@bitrange.com>  */

-/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* nds32*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
+/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
 /* -mlongcall disables sibcall patterns.  */
 /* { dg-skip-if "" { powerpc*-*-* } { "-mlongcall" } { "" } } */
 /* { dg-options "-O2 -foptimize-sibling-calls" } */
diff --git a/gcc/testsuite/gcc.dg/sibcall-9.c b/gcc/testsuite/gcc.dg/sibcall-9.c
index 34e7053..71c3251 100644
--- a/gcc/testsuite/gcc.dg/sibcall-9.c
+++ b/gcc/testsuite/gcc.dg/sibcall-9.c
@@ -5,7 +5,7 @@
    Copyright (C) 2002 Free Software Foundation Inc.
    Contributed by Hans-Peter Nilsson  <hp@bitrange.com>  */

-/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* nds32*-*-* nvptx-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
+/* { dg-do run { xfail { { cris-*-* crisv32-*-* h8300-*-* hppa*64*-*-* m32r-*-* mcore-*-* mn10300-*-* msp430*-*-* nvptx-*-* xstormy16-*-* v850*-*-* vax-*-* xtensa*-*-* } || { arm*-*-* && { ! arm32 } } } } } */
 /* -mlongcall disables sibcall patterns.  */
 /* { dg-skip-if "" { powerpc*-*-* } { "-mlongcall" } { "" } } */
 /* { dg-options "-O2 -foptimize-sibling-calls" } */
diff --git a/gcc/testsuite/gcc.dg/stack-usage-1.c b/gcc/testsuite/gcc.dg/stack-usage-1.c
index 7864c6a..c768ca2 100644
--- a/gcc/testsuite/gcc.dg/stack-usage-1.c
+++ b/gcc/testsuite/gcc.dg/stack-usage-1.c
@@ -2,6 +2,7 @@
 /* { dg-options "-fstack-usage" } */
 /* nvptx doesn't have a reg allocator, and hence no stack usage data.  */
 /* { dg-skip-if "" { nvptx-*-* } { "*" } { "" } } */
+/* { dg-options "-fstack-usage -fno-omit-frame-pointer" { target { nds32*-*-* } } } */

 /* This is aimed at testing basic support for -fstack-usage in the back-ends.
    See the SPARC back-end for example (grep flag_stack_usage_info in sparc.c).
diff --git a/gcc/testsuite/gcc.dg/torture/type-generic-1.c b/gcc/testsuite/gcc.dg/torture/type-generic-1.c
index 3897818..6815e8b 100644
--- a/gcc/testsuite/gcc.dg/torture/type-generic-1.c
+++ b/gcc/testsuite/gcc.dg/torture/type-generic-1.c
@@ -3,6 +3,7 @@

 /* { dg-do run } */
 /* { dg-skip-if "No Inf/NaN support" { spu-*-* } } */
+/* { dg-skip-if "No Denormalized support" { nds32_ext_fpu } } */
 /* { dg-options "-DUNSAFE" { target tic6x*-*-* visium-*-* } } */
 /* { dg-add-options ieee } */

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-cse-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-cse-2.c
index 1a4bfe6..78c948a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-cse-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-cse-2.c
@@ -25,4 +25,4 @@ foo ()
    but the loop reads only one element at a time, and DOM cannot resolve these.
    The same happens on powerpc depending on the SIMD support available.  */

-/* { dg-final { scan-tree-dump "return 28;" "optimized" { xfail { { alpha*-*-* hppa*64*-*-* powerpc64*-*-* } || { sparc*-*-* && lp64 } } } } } */
+/* { dg-final { scan-tree-dump "return 28;" "optimized" { xfail { { alpha*-*-* hppa*64*-*-* powerpc64*-*-* nds32*-*-*} || { sparc*-*-* && lp64 } } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp88.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp88.c
index f70b311..8a1081c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/vrp88.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp88.c
@@ -33,6 +33,6 @@ bitmap_single_bit_set_p (const_bitmap a)
 }

 /* Verify that VRP simplified an "if" statement.  */
-/* { dg-final { scan-tree-dump "Folded into: if.*" "vrp1"} } */
+/* { dg-final { scan-tree-dump "Folded into: if.*" "vrp1" { xfail *-*-* } } } */


diff --git a/gcc/testsuite/gcc.target/nds32/basic-main.c b/gcc/testsuite/gcc.target/nds32/basic-main.c
index 6fdbc35..7341fb5 100644
--- a/gcc/testsuite/gcc.target/nds32/basic-main.c
+++ b/gcc/testsuite/gcc.target/nds32/basic-main.c
@@ -1,9 +1,10 @@
 /* This is a basic main function test program.  */

-/* { dg-do run }  */
-/* { dg-options "-O0" }  */
+/* { dg-do run } */
+/* { dg-options "-O0" } */

-int main(void)
+int
+main (void)
 {
   return 0;
 }
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-abs.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-abs.c
new file mode 100644
index 0000000..8cadcfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-abs.c
@@ -0,0 +1,20 @@
+/* This is a test program for abs instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int a = -4;
+  int abs = __nds32__abs (a);
+
+  if (abs != 4)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-ave.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-ave.c
new file mode 100644
index 0000000..d2c87db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-ave.c
@@ -0,0 +1,21 @@
+/* This is a test program for ave instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int a = 4;
+  int b = 2;
+  int ave = __nds32__ave (a, b);
+
+  if (ave != 3)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-bclr.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-bclr.c
new file mode 100644
index 0000000..0e6c1e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-bclr.c
@@ -0,0 +1,20 @@
+/* This is a test program for bclr instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int a = 1;
+  int c = __nds32__bclr (a, 0);
+
+  if (c != 0)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-bset.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-bset.c
new file mode 100644
index 0000000..1bd8513
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-bset.c
@@ -0,0 +1,20 @@
+/* This is a test program for bset instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int c = 0;
+  c = __nds32__bset (c, 0);
+
+  if (c != 1)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-btgl.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-btgl.c
new file mode 100644
index 0000000..a1dbc00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-btgl.c
@@ -0,0 +1,20 @@
+/* This is a test program for btgl instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int a = 1;			/* Only bit 0 is set.  */
+  int c = __nds32__btgl (a, 0);	/* Toggle bit 0 of A.  */
+
+  if (c != 0)			/* Toggling the only set bit must give 0.  */
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-btst.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-btst.c
new file mode 100644
index 0000000..c001f94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-btst.c
@@ -0,0 +1,20 @@
+/* This is a test program for btst instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int c = 1;
+  c = __nds32__btst (c, 0);
+
+  if (c != 1)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-clip.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clip.c
new file mode 100644
index 0000000..d63b298
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clip.c
@@ -0,0 +1,20 @@
+/* This is a test program for clip instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int c = 33;
+  c = __nds32__clip (c, 5);
+
+  if (c != 31)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-clips.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clips.c
new file mode 100644
index 0000000..3e3f663
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clips.c
@@ -0,0 +1,20 @@
+/* This is a test program for clips instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int a = -33;
+  int c = __nds32__clips (a, 5);
+
+  if (c != -32)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-clo.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clo.c
new file mode 100644
index 0000000..d672a33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clo.c
@@ -0,0 +1,20 @@
+/* This is a test program for clo instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int c = 0xFFFF0000;
+  c =  __nds32__clo (c);
+
+  if (c != 16)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE1-clz.c b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clz.c
new file mode 100644
index 0000000..17e6318
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE1-clz.c
@@ -0,0 +1,20 @@
+/* This is a test program for clz instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int c = 0x0000FFFF;
+  c =  __nds32__clz (c);
+
+  if (c != 16)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE2-bse.c b/gcc/testsuite/gcc.target/nds32/builtin-PE2-bse.c
new file mode 100644
index 0000000..c769fea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE2-bse.c
@@ -0,0 +1,28 @@
+/* This is a test program for bse instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf2 } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int a = 0xF0F0F0F0;
+  unsigned int b = 0x00000300;
+  unsigned int r = 0;
+
+  unsigned int verify_b = 0x00000300;
+  unsigned int verify_r = 0;
+
+  __nds32__bse (&r, a, &b);
+  a = 0xF0F0F0F0;
+  asm volatile ("bse %0, %2, %1": "+&r" (verify_r), "+&r" (verify_b) : "r" (a));
+
+  if ((verify_b == b) && (verify_r == r))
+    exit (0);
+  else
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE2-bsp.c b/gcc/testsuite/gcc.target/nds32/builtin-PE2-bsp.c
new file mode 100644
index 0000000..d798719
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE2-bsp.c
@@ -0,0 +1,26 @@
+/* This is a test program for bsp instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf2 } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int a = 0x0000000F;
+  unsigned int b = 0x00000300;
+  unsigned int r = 0;
+  unsigned int verify_b = 0x00000300;
+  unsigned int verify_r = 0;
+
+  __nds32__bsp (&r, a, &b);
+  asm volatile ("bsp %0, %2, %1": "+&r" (verify_r), "+&r" (verify_b) : "r" (a));
+
+  if ((verify_b == b) && (verify_r == r))
+    exit (0);
+  else
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsad.c b/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsad.c
new file mode 100644
index 0000000..bc4fe42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsad.c
@@ -0,0 +1,21 @@
+/* This is a test program for pbsad instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf2 } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int a = 0x09070605;
+  unsigned int b = 0x04020301;
+  unsigned int r = __nds32__pbsad (a, b);
+
+  if (r != 17)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsada.c b/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsada.c
new file mode 100644
index 0000000..6ed1b08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-PE2-pbsada.c
@@ -0,0 +1,23 @@
+/* This is a test program for pbsada instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_perf2 } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int a = 0x09070605;
+  unsigned int b = 0x04020301;
+  unsigned int r = 1;
+
+  r = __nds32__pbsada(r, a, b);
+
+  if (r != 18)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-add16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add16.c
new file mode 100644
index 0000000..0eec324
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add16.c
@@ -0,0 +1,49 @@
+/* This is a test program for add16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int add16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__add16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_uadd16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_uadd16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sadd16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_sadd16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = add16 (0x0001f000, 0x00011000);
+  uint16x2_t v_ua = v_uadd16 ((uint16x2_t) {0xf000, 0xf000},
+			      (uint16x2_t) {0x1000, 0x2000});
+  int16x2_t v_sa = v_sadd16 ((int16x2_t) {0xf777, 0xf111},
+			     (int16x2_t) {0x1000, 0x2000});
+
+  if (a != 0x00020000)
+    abort ();
+  else if (v_ua[0] != 0x0000
+	   || v_ua[1] != 0x1000)
+    abort ();
+  else if (v_sa[0] != 0x0777
+	   || v_sa[1] != 0x1111)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-add64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add64.c
new file mode 100644
index 0000000..b761b7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add64.c
@@ -0,0 +1,36 @@
+/* This is a test program for add64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long sadd64 (long long ra, long long rb)
+{
+  return __nds32__sadd64 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+unsigned long long uadd64 (unsigned long long ra, unsigned long long rb)
+{
+  return __nds32__uadd64 (ra, rb);
+}
+
+int
+main ()
+{
+  long long sa = sadd64 (0x1122334400000000ll, 0x55667788ll);
+  unsigned long long ua = uadd64 (0xffff00000000ull, 0x55667788ull);
+
+  if (sa != 0x1122334455667788ll)
+    abort ();
+  else if (ua != 0xffff55667788ull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-add8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add8.c
new file mode 100644
index 0000000..77e686c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-add8.c
@@ -0,0 +1,53 @@
+/* This is a test program for add8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int add8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__add8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_uadd8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_uadd8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int8x4_t v_sadd8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_sadd8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = add8 (0x11223344, 0x55667788);
+  uint8x4_t v_ua = v_uadd8 ((uint8x4_t) {0xff, 0xee, 0xdd, 0xcc},
+			    (uint8x4_t) {0x1, 0xee, 0xdd, 0xcc});
+  int8x4_t v_sa = v_sadd8 ((int8x4_t) {0x80, 0x7f, 0xbb, 0xaa},
+			   (int8x4_t) {0x80, 0x7f, 0xbb, 0xaa});
+
+  if (a != 0x6688aacc)
+    abort ();
+  else if (v_ua[0] != 0
+	   || v_ua[1] != 0xdc
+	   || v_ua[2] != 0xba
+	   || v_ua[3] != 0x98)
+    abort ();
+  else if (v_sa[0] != 0
+	   || v_sa[1] != (char) 0xfe
+	   || v_sa[2] != 0x76
+	   || v_sa[3] != 0x54)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-bitrev.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-bitrev.c
new file mode 100644
index 0000000..2c8c297
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-bitrev.c
@@ -0,0 +1,27 @@
+/* This is a test program for bitrev instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int bitrev (unsigned int ra, unsigned int rb)
+{
+  return __nds32__bitrev (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = bitrev (0xd, 1);
+
+  if (a != 0x2)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-bpick.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-bpick.c
new file mode 100644
index 0000000..78893cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-bpick.c
@@ -0,0 +1,27 @@
+/* This is a test program for bpick instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int bpick (unsigned int ra, unsigned int rb, unsigned int rc)
+{
+  return __nds32__bpick (ra, rb, rc);
+}
+
+int
+main ()
+{
+  unsigned int a = bpick (0x11223344, 0x11332244, 0);
+
+  if (a != 0x11332244)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq16.c
new file mode 100644
index 0000000..c37abf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq16.c
@@ -0,0 +1,49 @@
+/* This is a test program for cmpeq16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int cmpeq16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__cmpeq16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_scmpeq16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_scmpeq16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ucmpeq16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ucmpeq16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = cmpeq16 (0xffff0000, 0xffff0001);
+  uint16x2_t v_sa = v_scmpeq16 ((int16x2_t) {0x7fff, 0x8000},
+				(int16x2_t) {0x8000, 0x8000});
+  uint16x2_t v_ua = v_ucmpeq16 ((uint16x2_t) {0x7fff, 0x8000},
+				(uint16x2_t) {0x8000, 0x8000});
+
+  if (a != 0xffff0000)
+    abort ();
+  else if (v_sa[0] != 0
+	   || v_sa[1] != 0xffff)
+    abort ();
+  else if (v_ua[0] != 0
+	   || v_ua[1] != 0xffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq8.c
new file mode 100644
index 0000000..a692dac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cmpeq8.c
@@ -0,0 +1,53 @@
+/* This is a test program for cmpeq8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int cmpeq8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__cmpeq8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_scmpeq8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_scmpeq8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_ucmpeq8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_ucmpeq8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = cmpeq8 (0xffff0000, 0xffff0101);
+  uint8x4_t v_sa = v_scmpeq8 ((int8x4_t) { 0x7f, 0x7f, 0x01, 0x01},
+			      (int8x4_t) { 0x7f, 0x7f, 0x00, 0x00});
+  uint8x4_t v_ua = v_ucmpeq8 ((uint8x4_t) { 0x7f, 0x7f, 0x01, 0x01},
+			      (uint8x4_t) { 0x7f, 0x7f, 0x00, 0x00});
+
+  if (a != 0xffff0000)
+    abort ();
+  else if (v_sa[0] != 0xff
+           || v_sa[1] != 0xff
+           || v_sa[2] != 0
+	   || v_sa[3] != 0)
+    abort ();
+  else if (v_ua[0] != 0xff
+           || v_ua[1] != 0xff
+           || v_ua[2] != 0
+	   || v_ua[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-cras16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cras16.c
new file mode 100644
index 0000000..7d6da46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-cras16.c
@@ -0,0 +1,58 @@
+/* This is a test program for cras16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int cras16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__cras16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ucras16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ucras16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_scras16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_scras16 (ra, rb);
+}
+
+int
+main ()
+{
+
+#ifdef __NDS32_EL__
+  uint16x2_t v_ua_p = {1, 0};
+  int16x2_t v_sa_p = {0x1000, 0x111};
+#else
+  uint16x2_t v_ua_p = {0x2469, 0xe000};
+  int16x2_t v_sa_p = {0x3000, 0xe111};
+#endif
+
+  unsigned int a = cras16 (0x0001f000, 0x0001f000);
+  uint16x2_t v_ua = v_ucras16 ((uint16x2_t) {0x1235, 0xf000},
+			       (uint16x2_t) {0x1000, 0x1234});
+  int16x2_t v_sa = v_scras16 ((int16x2_t) {0x2000, 0xf111},
+			      (int16x2_t) {0x1000, 0x1000});
+
+  if (a != 0xf001efff)
+    abort ();
+  else if (v_ua[0] != v_ua_p[0]
+	   || v_ua[1] != v_ua_p[1])
+    abort ();
+  else if (v_sa[0] != v_sa_p[0]
+	   || v_sa[1] != v_sa_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-crsa16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-crsa16.c
new file mode 100644
index 0000000..de99c3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-crsa16.c
@@ -0,0 +1,57 @@
+/* This is a test program for crsa16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int crsa16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__crsa16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ucrsa16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ucrsa16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_scrsa16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_scrsa16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t v_ua_p = {0x2469, 0xe000};
+  int16x2_t v_sa_p = {0x3000, 0x110};
+#else
+  uint16x2_t v_ua_p = {1, 0};
+  int16x2_t v_sa_p = {0x1000, 0x112};
+#endif
+
+  unsigned int a = crsa16 (0x0001f000, 0x0001f000);
+  uint16x2_t v_ua = v_ucrsa16 ((uint16x2_t) {0x1235, 0xf000},
+			       (uint16x2_t) {0x1000, 0x1234});
+  int16x2_t v_sa = v_scrsa16 ((int16x2_t) {0x2000, 0x0111},
+			      (int16x2_t) {0x0001, 0x1000});
+
+  if (a != 0x1001f001)
+    abort ();
+  else if (v_ua[0] != v_ua_p[0]
+	   || v_ua[1] != v_ua_p[1])
+    abort ();
+  else if (v_sa[0] != v_sa_p[0]
+	   || v_sa[1] != v_sa_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-insb.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-insb.c
new file mode 100644
index 0000000..ebd0348
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-insb.c
@@ -0,0 +1,27 @@
+/* This is a test program for insb instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int insb (unsigned int ra, unsigned int rb)
+{
+  return __nds32__insb (ra, rb, 1);
+}
+
+int
+main ()
+{
+  unsigned int a = insb (0x11220044, 0x33);
+
+  if (a != 0x11223344)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbb16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbb16.c
new file mode 100644
index 0000000..23d92e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbb16.c
@@ -0,0 +1,44 @@
+/* This is a test program for pkbb16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int pkbb16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__pkbb16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_pkbb16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_pkbb16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0xcccc, 0xaaaa};
+#else
+  uint16x2_t va_p = {0xbbbb, 0xdddd};
+#endif
+
+  unsigned int a = pkbb16 (0x11223344, 0x55667788);
+  uint16x2_t va = v_pkbb16 ((uint16x2_t) {0xaaaa, 0xbbbb},
+			    (uint16x2_t) {0xcccc, 0xdddd});
+
+  if (a != 0x33447788)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbt16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbt16.c
new file mode 100644
index 0000000..6c34420
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pkbt16.c
@@ -0,0 +1,44 @@
+/* This is a test program for pkbt16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int pkbt16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__pkbt16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_pkbt16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_pkbt16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0xdddd, 0xaaaa};
+#else
+  uint16x2_t va_p = {0xbbbb, 0xcccc};
+#endif
+
+  unsigned int a = pkbt16 (0x11223344, 0x55667788);
+  uint16x2_t va = v_pkbt16 ((uint16x2_t) {0xaaaa, 0xbbbb},
+			    (uint16x2_t) {0xcccc, 0xdddd});
+
+  if (a != 0x33445566)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktb16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktb16.c
new file mode 100644
index 0000000..0aab5df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktb16.c
@@ -0,0 +1,44 @@
+/* This is a test program for pktb16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int pktb16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__pktb16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_pktb16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_pktb16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0xcccc, 0xbbbb};
+#else
+  uint16x2_t va_p = {0xaaaa, 0xdddd};
+#endif
+
+  unsigned int a = pktb16 (0x11223344, 0x55667788);
+  uint16x2_t va = v_pktb16 ((uint16x2_t) {0xaaaa, 0xbbbb},
+			    (uint16x2_t) {0xcccc, 0xdddd});
+
+  if (a != 0x11227788)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktt16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktt16.c
new file mode 100644
index 0000000..745cde5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-pktt16.c
@@ -0,0 +1,44 @@
+/* This is a test program for pktt16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int pktt16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__pktt16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_pktt16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_pktt16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0xdddd, 0xbbbb};
+#else
+  uint16x2_t va_p = {0xaaaa, 0xcccc};
+#endif
+
+  unsigned int a = pktt16 (0x11223344, 0x55667788);
+  uint16x2_t va = v_pktt16 ((uint16x2_t) {0xaaaa, 0xbbbb},
+			    (uint16x2_t) {0xcccc, 0xdddd});
+
+  if (a != 0x11225566)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd16.c
new file mode 100644
index 0000000..5271b41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd16.c
@@ -0,0 +1,38 @@
+/* This is a test program for radd16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int radd16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__radd16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_radd16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_radd16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = radd16 (0x7fff7fff, 0x7fff7fff);
+  int16x2_t va = v_radd16 ((int16x2_t) {0x8000, 0x4000},
+			   (int16x2_t) {0x8000, 0x8000});
+
+  if (a != 0x7fff7fff)
+    abort ();
+  else if (va[0] != (short) 0x8000
+	   || va[1] != (short) 0xe000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd64.c
new file mode 100644
index 0000000..3e82ff5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd64.c
@@ -0,0 +1,27 @@
+/* This is a test program for radd64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long radd64 (long long ra, long long rb)
+{
+  return __nds32__radd64 (ra, rb);
+}
+
+int
+main ()
+{
+  long long a = radd64 (0xf000000000000000ll, 0xf000000000000000ll);
+
+  if (a != 0xf000000000000000ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd8.c
new file mode 100644
index 0000000..10735a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-radd8.c
@@ -0,0 +1,40 @@
+/* This is a test program for radd8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int radd8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__radd8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int8x4_t v_radd8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_radd8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = radd8 (0x11223344, 0x55667788);
+  int8x4_t va = v_radd8 ((int8x4_t) {0x7f, 0x80, 0x80, 0xaa},
+			 (int8x4_t) {0x7f, 0x80, 0x40, 0xaa});
+
+  if (a != 0x334455e6)
+    abort ();
+  else if (va[0] != 0x7f
+	   || va[1] != (char) 0x80
+	   || va[2] != (char) 0xe0
+	   || va[3] != (char) 0xaa)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-raddw.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-raddw.c
new file mode 100644
index 0000000..190a477
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-raddw.c
@@ -0,0 +1,27 @@
+/* This is a test program for raddw instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int raddw (int ra, int rb)
+{
+  return __nds32__raddw (ra, rb);
+}
+
+int
+main ()
+{
+  int a = raddw (0x80000000, 0x80000000);
+
+  if (a != 0x80000000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcras16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcras16.c
new file mode 100644
index 0000000..2a2288a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcras16.c
@@ -0,0 +1,44 @@
+/* This is a test program for rcras16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int rcras16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__rcras16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_rcras16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_rcras16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0x7fff, 0x8000};
+#else
+  int16x2_t va_p = {0xffff, 0};
+#endif
+
+  unsigned int a = rcras16 (0x0fff0000, 0x00000fff);
+  int16x2_t va = v_rcras16 ((int16x2_t) {0x7fff, 0x8000},
+			    (int16x2_t) {0x8000, 0x8000});
+
+  if (a != 0x0fff0000)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcrsa16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcrsa16.c
new file mode 100644
index 0000000..ebcc0f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rcrsa16.c
@@ -0,0 +1,44 @@
+/* This is a test program for rcrsa16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int rcrsa16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__rcrsa16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_rcrsa16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_rcrsa16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0x8000, 0x8000};
+#else
+  int16x2_t va_p = {0, 0xffff};
+#endif
+
+  unsigned int a = rcrsa16 (0x7fff7fff, 0x7fff8000);
+  int16x2_t va = v_rcrsa16 ((int16x2_t) {0x8000, 0x8000},
+			    (int16x2_t) {0x7fff, 0x8000});
+
+  if (a != 0x7fff7fff)
+    abort ();
+  else if (va[0] != va_p [0]
+	   || va[1] != va_p [1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub16.c
new file mode 100644
index 0000000..f9fcc86
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub16.c
@@ -0,0 +1,38 @@
+/* This is a test program for rsub16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int rsub16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__rsub16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_rsub16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_rsub16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = rsub16 (0x7fff7fff, 0x80008000);
+  int16x2_t va = v_rsub16 ((int16x2_t) {0x8000, 0x8000},
+			   (int16x2_t) {0x7fff, 0x4000});
+
+  if (a != 0x7fff7fff)
+    abort ();
+  else if (va[0] != (short) 0x8000
+	   || va[1] != (short) 0xa000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub64.c
new file mode 100644
index 0000000..227eba7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub64.c
@@ -0,0 +1,27 @@
+/* This is a test program for rsub64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long rsub64 (long long ra, long long rb)
+{
+  return __nds32__rsub64 (ra, rb);
+}
+
+int
+main ()
+{
+  long long a = rsub64 (0xe, 0xf);
+
+  if (a != 0xffffffffffffffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub8.c
new file mode 100644
index 0000000..0f1dddc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsub8.c
@@ -0,0 +1,40 @@
+/* This is a test program for rsub8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int rsub8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__rsub8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int8x4_t v_rsub8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_rsub8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = rsub8 (0x55667788, 0x11223344);
+  int8x4_t va = v_rsub8 ((int8x4_t) {0x7f, 0x80, 0x80, 0xaa},
+			 (int8x4_t) {0x80, 0x7f, 0x40, 0xaa});
+
+  if (a != 0x222222a2)
+    abort ();
+  else if (va[0] != 0x7f
+	   || va[1] != (char) 0x80
+	   || va[2] != (char) 0xa0
+	   || va[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsubw.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsubw.c
new file mode 100644
index 0000000..b70a229
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-rsubw.c
@@ -0,0 +1,27 @@
+/* This is a test program for rsubw instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int rsubw (int ra, int rb)
+{
+  return __nds32__rsubw (ra, rb);
+}
+
+int
+main ()
+{
+  int a = rsubw (0x80000000, 0x7fffffff);
+
+  if (a != 0x80000000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple16.c
new file mode 100644
index 0000000..95251d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple16.c
@@ -0,0 +1,37 @@
+/* This is a test program for scmple16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int scmple16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__scmple16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_scmple16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_scmple16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = scmple16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_scmple16 ((int16x2_t) {0x7fff, 0x7ffe},
+			      (int16x2_t) {0x7ffe, 0x7fff});
+
+  /* Lanes whose first operand is the smaller one must be all-ones.  */
+  if (a != 0xffff0000
+      || va[0] != 0
+      || va[1] != 0xffff)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple8.c
new file mode 100644
index 0000000..6c0033d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmple8.c
@@ -0,0 +1,40 @@
+/* This is a test program for scmple8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int scmple8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__scmple8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_scmple8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_scmple8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = scmple8 (0xfefe0101, 0xffff0000);
+  uint8x4_t va = v_scmple8 ((int8x4_t) {0x7e, 0x7e, 0x01, 0x01},
+			    (int8x4_t) {0x7f, 0x7f, 0x00, 0x00});
+
+  /* Lanes whose first operand is the smaller one are expected to be
+     all-ones; the remaining lanes are expected to be zero.  */
+  if (a != 0xffff0000
+      || va[0] != 0xff
+      || va[1] != 0xff
+      || va[2] != 0
+      || va[3] != 0)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt16.c
new file mode 100644
index 0000000..5797711
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt16.c
@@ -0,0 +1,38 @@
+/* This is a test program for scmplt16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int scmplt16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__scmplt16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_scmplt16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_scmplt16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = scmplt16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_scmplt16 ((int16x2_t) {0x7fff, 0x7ffe},
+			      (int16x2_t) {0x7ffe, 0x7fff});
+
+  /* Lanes whose first operand is the smaller one are expected to be
+     all-ones; the remaining lanes are expected to be zero.  */
+  if (a != 0xffff0000
+      || va[0] != 0
+      || va[1] != 0xffff)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt8.c
new file mode 100644
index 0000000..3e52006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-scmplt8.c
@@ -0,0 +1,40 @@
+/* This is a test program for scmplt8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int scmplt8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__scmplt8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_scmplt8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_scmplt8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = scmplt8 (0xfefe0101, 0xffff0000);
+  uint8x4_t va = v_scmplt8 ((int8x4_t) {0x7e, 0x7e, 0x01, 0x01},
+			    (int8x4_t) {0x7f, 0x7f, 0x00, 0x00});
+
+  /* Lanes whose first operand is the smaller one are expected to be
+     all-ones; the remaining lanes are expected to be zero.  */
+  if (a != 0xffff0000
+      || va[0] != 0xff
+      || va[1] != 0xff
+      || va[2] != 0
+      || va[3] != 0)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sll16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sll16.c
new file mode 100644
index 0000000..5ab9506
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sll16.c
@@ -0,0 +1,37 @@
+/* This is a test program for sll16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sll16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__sll16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_sll16 (uint16x2_t ra, unsigned int rb)
+{
+  return __nds32__v_sll16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = sll16 (0x0f00f000, 4);
+  uint16x2_t va = v_sll16 ((uint16x2_t) {0x7fff, 0x8000}, 4);
+
+  if (a != 0xf0000000)
+    abort ();
+  else if (va[0] != 0xfff0
+	   || va[1] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smal.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smal.c
new file mode 100644
index 0000000..f7e54b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smal.c
@@ -0,0 +1,36 @@
+/* This is a test program for smal instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smal (long long ra, unsigned int rb)
+{
+  return __nds32__smal (ra, rb);
+}
+
+static __attribute__ ((noinline))
+long long v_smal (long long ra, int16x2_t rb)
+{
+  return __nds32__v_smal (ra, rb);
+}
+
+int
+main ()
+{
+  long long a = smal (0xfffff0000ll, 0x0001ffff);
+  long long va = v_smal (0xffffff0000ll,
+			 (int16x2_t) {0x0002, 0xffff});
+  if (a != 0xffffeffffll)
+    abort ();
+  else if (va != 0xfffffefffell)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbb.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbb.c
new file mode 100644
index 0000000..c39a889
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbb.c
@@ -0,0 +1,45 @@
+/* This is a test program for smalbb instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalbb (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalbb (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalbb (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalbb (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345679075ca9d3ll;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345679075ca9d3ll;
+#else
+  long long va_p = 0x12345678ffffffffll;
+#endif
+
+  long long a = smalbb (0x12345678ffffffffll, 0x00006789, 0x00001234);
+  long long va = v_smalbb (0x12345678ffffffffll, (int16x2_t) {0x6789, 0},
+						 (int16x2_t) {0x1234, 0});
+
+  if (a != a_p || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbt.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbt.c
new file mode 100644
index 0000000..06577fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalbt.c
@@ -0,0 +1,45 @@
+/* This is a test program for smalbt instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalbt (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalbt (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalbt (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalbt (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345679075ca9d3ll;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345679075ca9d3ll;
+#else
+  long long va_p = 0x12345678ffffffffll;
+#endif
+
+  long long a = smalbt (0x12345678ffffffffll, 0x00006789, 0x12340000);
+  long long va = v_smalbt (0x12345678ffffffffll, (int16x2_t) {0x6789, 0},
+						 (int16x2_t) {0, 0x1234});
+
+  if (a != a_p || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalda.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalda.c
new file mode 100644
index 0000000..33b4b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalda.c
@@ -0,0 +1,38 @@
+/* This is a test program for smalda instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalda (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalda (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalda (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalda (t, a, b);
+}
+
+
+int
+main ()
+{
+  long long a = smalda (0x12345678ffffffffll, 0x67890000, 0x12340000);
+  long long va = v_smalda (0x12345678ffffffffll, (int16x2_t) {0, 0x6789},
+						 (int16x2_t) {0, 0x1234});
+
+  if (a != 0x12345679075CA9D3ll)
+    abort ();
+  else if (va != 0x12345679075CA9D3ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaldrs.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaldrs.c
new file mode 100644
index 0000000..48255b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaldrs.c
@@ -0,0 +1,46 @@
+/* This is a test program for smaldrs instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smaldrs (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smaldrs (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smaldrs (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smaldrs (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345678ffffaaaall;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345678ffffaaaall;
+#else
+  long long va_p = 0x1234567900005554ll;
+#endif
+
+  long long a = smaldrs (0x12345678ffffffffll, 0x67890001, 0x00011234);
+  long long va = v_smaldrs (0x12345678ffffffffll, (int16x2_t) {0x0001, 0x6789},
+						  (int16x2_t) {0x1234, 0x0001});
+
+  if (a != a_p
+      || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalds.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalds.c
new file mode 100644
index 0000000..5a89ea6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalds.c
@@ -0,0 +1,46 @@
+/* This is a test program for smalds instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalds (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalds (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalds (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalds (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345678ffffaaaall;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345678ffffaaaall;
+#else
+  long long va_p = 0x1234567900005554ll;
+#endif
+
+  long long a = smalds (0x12345678ffffffffll, 0x12340001, 0x00016789);
+  long long va = v_smalds (0x12345678ffffffffll, (int16x2_t) {0x0001, 0x1234},
+						 (int16x2_t) {0x6789, 0x0001});
+
+  if (a != a_p
+      || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaltt.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaltt.c
new file mode 100644
index 0000000..709607a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smaltt.c
@@ -0,0 +1,46 @@
+/* This is a test program for smaltt instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smaltt (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smaltt (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smaltt (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smaltt (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345679075ca9d3ll;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345679075ca9d3ll;
+#else
+  long long va_p = 0x12345678ffffffffll;
+#endif
+
+  long long a = smaltt (0x12345678ffffffffll, 0x67890000, 0x12340000);
+  long long va = v_smaltt (0x12345678ffffffffll, (int16x2_t) {0, 0x6789},
+						 (int16x2_t) {0, 0x1234});
+
+  if (a != a_p
+      || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxda.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxda.c
new file mode 100644
index 0000000..0f90250
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxda.c
@@ -0,0 +1,38 @@
+/* This is a test program for smalxda instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalxda (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalxda (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalxda (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalxda (t, a, b);
+}
+
+
+int
+main ()
+{
+  long long a = smalxda (0x12345678ffffffffll, 0x67890000, 0x00001234);
+  long long va = v_smalxda (0x12345678ffffffffll, (int16x2_t) {0, 0x6789},
+						  (int16x2_t) {0x1234, 0});
+
+  /* Spell the 64-bit expectations with an explicit "ll" suffix, as the
+     accumulator arguments above already do.  */
+  if (a != 0x12345679075CA9D3ll
+      || va != 0x12345679075CA9D3ll)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxds.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxds.c
new file mode 100644
index 0000000..ee2e098
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smalxds.c
@@ -0,0 +1,46 @@
+/* This is a test program for smalxds instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smalxds (long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__smalxds (t, a, b);
+}
+
+static __attribute__ ((noinline))
+long long v_smalxds (long long t, int16x2_t a, int16x2_t b)
+{
+  return __nds32__v_smalxds (t, a, b);
+}
+
+
+int
+main ()
+{
+  /* A_P is identical for both endiannesses; only the expected vector
+     result VA_P is endian-dependent.  */
+  long long a_p = 0x12345678ffffaaaall;
+#ifdef __NDS32_EL__
+  long long va_p = 0x12345678ffffaaaall;
+#else
+  long long va_p = 0x1234567900005554ll;
+#endif
+
+  long long a = smalxds (0x12345678ffffffffll, 0x12340001, 0x67890001);
+  long long va = v_smalxds (0x12345678ffffffffll, (int16x2_t) {0x0001, 0x1234},
+						  (int16x2_t) {0x0001, 0x6789});
+
+  if (a != a_p
+      || va != va_p)
+    abort ();
+
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smar64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smar64.c
new file mode 100644
index 0000000..59c6f1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smar64.c
@@ -0,0 +1,27 @@
+/* This is a test program for smar64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smar64 (long long t, int a, int b)
+{
+  return __nds32__smar64 (t, a, b);
+}
+
+int
+main ()
+{
+  long long a = smar64 (0xf000000000000000ll, 0x12345678, 0x23);
+
+  if (a != 0xf00000027d27d268ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax16.c
new file mode 100644
index 0000000..72bf957
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax16.c
@@ -0,0 +1,37 @@
+/* This is a test program for smax16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int smax16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smax16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_smax16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smax16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = smax16 (0xfffe0001, 0xffff0000);
+  int16x2_t va = v_smax16 ((int16x2_t) {0x7fff, 0},
+			   (int16x2_t) {0x7ffe, 1});
+
+  /* Every lane is expected to hold the larger of its two operands.  */
+  if (a != 0xffff0001
+      || va[0] != 0x7fff
+      || va[1] != 1)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax8.c
new file mode 100644
index 0000000..128bf19
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smax8.c
@@ -0,0 +1,41 @@
+/* This is a test program for smax8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int smax8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smax8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int8x4_t v_smax8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_smax8 (ra, rb);
+}
+
+
+int
+main ()
+{
+  unsigned int a = smax8 (0xffff0000, 0xfefe0001);
+  int8x4_t va = v_smax8 ((int8x4_t) {0x7f, 0x7f, 0x01, 0x01},
+			 (int8x4_t) {0x7e, 0x7e, 0x00, 0x00});
+
+  /* Every lane is expected to hold the larger of its two operands;
+     equal operands (0x00 vs 0x00 in A) must pass through unchanged.  */
+  if (a != 0xffff0001
+      || va[0] != 0x7f
+      || va[1] != 0x7f
+      || va[2] != 1
+      || va[3] != 1)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbb.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbb.c
new file mode 100644
index 0000000..25759bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbb.c
@@ -0,0 +1,44 @@
+/* This is a test program for smbb instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smbb (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smbb (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smbb (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smbb (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 1;
+#else
+  int va_p = 2;
+#endif
+
+  int a = smbb (0x80000002, 0x80000001);
+
+  int va = v_smbb ((int16x2_t) {0xffff, 0x0002},
+		   (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 2)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbt.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbt.c
new file mode 100644
index 0000000..7ed2c22
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smbt.c
@@ -0,0 +1,44 @@
+/* This is a test program for smbt instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smbt (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smbt (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smbt (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smbt (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 0xffffffff;
+#else
+  int va_p = 0xfffffffe;
+#endif
+
+  int a = smbt (0x80000002, 0x80000001);
+
+  int va = v_smbt ((int16x2_t) {0xffff, 0x0002},
+		   (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0xffff0000)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smdrs.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smdrs.c
new file mode 100644
index 0000000..4224b04
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smdrs.c
@@ -0,0 +1,43 @@
+/* This is a test program for smdrs instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smdrs (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smdrs (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smdrs (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smdrs (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 0xffffffff;
+#else
+  int va_p = 1;
+#endif
+
+  int a = smdrs (0x80000002, 0x80000001);
+  int va = v_smdrs ((int16x2_t) {0xffff, 0x0002},
+		    (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0xc0000002)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smds.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smds.c
new file mode 100644
index 0000000..9875efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smds.c
@@ -0,0 +1,43 @@
+/* This is a test program for smds instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smds (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smds (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smds (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smds (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 1;
+#else
+  int va_p = 0xffffffff;
+#endif
+
+  int a = smds (0x80000002, 0x80000001);
+  int va = v_smds ((int16x2_t) {0xffff, 0x0002},
+		   (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0x3ffffffe)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smin16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smin16.c
new file mode 100644
index 0000000..60deb4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smin16.c
@@ -0,0 +1,37 @@
+/* This is a test program for smin16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int smin16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smin16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_smin16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smin16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = smin16 (0xfffe0001, 0xffff0000);
+  int16x2_t va = v_smin16 ((int16x2_t) {0x7fff, 0},
+			   (int16x2_t) {0x7ffe, 1});
+
+  /* Every lane is expected to hold the smaller of its two operands.  */
+  if (a != 0xfffe0000
+      || va[0] != 0x7ffe
+      || va[1] != 0)
+    abort ();
+  exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmul.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmul.c
new file mode 100644
index 0000000..5735efa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmul.c
@@ -0,0 +1,27 @@
+/* This is a test program for smmul instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmul (int ra, int rb)
+{
+  return __nds32__smmul (ra, rb);
+}
+
+int
+main ()
+{
+  int a = smmul (0x80000000, 0x80000000);
+
+  if (a != 0x40000000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmulu.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmulu.c
new file mode 100644
index 0000000..fbe0b15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmulu.c
@@ -0,0 +1,27 @@
+/* This is a test program for smmul.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmul_u (int ra, int rb)
+{
+  return __nds32__smmul_u (ra, rb);
+}
+
+int
+main ()
+{
+  int a = smmul_u (0x80000002, 0x80000001);
+
+  if (a != 0x3fffffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwb.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwb.c
new file mode 100644
index 0000000..9160b9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwb.c
@@ -0,0 +1,43 @@
+/* This is a test program for smmwb instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmwb (int ra, unsigned int rb)
+{
+  return __nds32__smmwb (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smmwb (int ra, int16x2_t rb)
+{
+  return __nds32__v_smmwb (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 0;
+#else
+  int va_p = 0xffffffff;
+#endif
+
+  int a = smmwb (0x80000002, 0x80000001);
+
+  int va = v_smmwb (0xffff0002, (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0xffff8000)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwbu.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwbu.c
new file mode 100644
index 0000000..46ebed2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwbu.c
@@ -0,0 +1,43 @@
+/* This is a test program for smmwb.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmwb_u (int ra, unsigned int rb)
+{
+  return __nds32__smmwb_u (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smmwb_u (int ra, int16x2_t rb)
+{
+  return __nds32__v_smmwb_u (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 1;
+#else
+  int va_p = 0xffffffff;
+#endif
+
+  int a = smmwb_u (0x80000002, 0x80000001);
+
+  int va = v_smmwb_u (0xffff0002, (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0xffff8000)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwt.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwt.c
new file mode 100644
index 0000000..45d4792
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwt.c
@@ -0,0 +1,43 @@
+/* This is a test program for smmwt instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmwt (int ra, unsigned int rb)
+{
+  return __nds32__smmwt (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smmwt (int ra, int16x2_t rb)
+{
+  return __nds32__v_smmwt (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 0xffffffff;
+#else
+  int va_p = 0;
+#endif
+
+  int a = smmwt (0x80000002, 0x80000001);
+
+  int va = v_smmwt (0xffff0002, (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0x3fffffff)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwtu.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwtu.c
new file mode 100644
index 0000000..3b4b487
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smmwtu.c
@@ -0,0 +1,43 @@
+/* This is a test program for smmwt.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smmwt_u (int ra, unsigned int rb)
+{
+  return __nds32__smmwt_u (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smmwt_u (int ra, int16x2_t rb)
+{
+  return __nds32__v_smmwt_u (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 0xffffffff;
+#else
+  int va_p = 1;
+#endif
+
+  int a = smmwt_u (0x80000002, 0x80000001);
+
+  int va = v_smmwt_u (0xffff0002, (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0x3fffffff)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslda.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslda.c
new file mode 100644
index 0000000..be2ac27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslda.c
@@ -0,0 +1,37 @@
+/* This is a test program for smslda instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smslda (long long rt, unsigned int ra, unsigned int rb)
+{
+  return __nds32__smslda (rt, ra, rb);
+}
+
+static __attribute__ ((noinline))
+long long v_smslda (long long rt, int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smslda (rt, ra, rb);
+}
+
+int
+main ()
+{
+  long long a = smslda (0xff0000000000ll, 0xffffffff, 0x2);
+  long long va = v_smslda (0x100000000ll,
+			   (int16x2_t) {0xf000, 0}, (int16x2_t) {0, 3});
+
+  if (a != 0xff0000000002ll)
+    abort ();
+  else if (va != 0x100000000ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslxda.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslxda.c
new file mode 100644
index 0000000..f276a2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smslxda.c
@@ -0,0 +1,37 @@
+/* This is a test program for smslxda instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smslxda (long long rt, unsigned int ra, unsigned int rb)
+{
+  return __nds32__smslxda (rt, ra, rb);
+}
+
+static __attribute__ ((noinline))
+long long v_smslxda (long long rt, int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smslxda (rt, ra, rb);
+}
+
+int
+main ()
+{
+  long long a = smslxda (0xff0000000000ll, 0xffffffff, 0x2);
+  long long va = v_smslxda (0x100000000ll,
+			    (int16x2_t) {0xf000, 0}, (int16x2_t) {0, 3});
+
+  if (a != 0xff0000000002ll)
+    abort ();
+  else if (va != 0x100003000ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smsr64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smsr64.c
new file mode 100644
index 0000000..64a84e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smsr64.c
@@ -0,0 +1,27 @@
+/* This is a test program for smsr64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long smsr64 (long long t, int a, int b)
+{
+  return __nds32__smsr64 (t, a, b);
+}
+
+int
+main ()
+{
+  long long a = smsr64 (0x5000000300000000ll, 0x12345678, 0x23);
+
+  if (a != 0x5000000082D82D98ll)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smtt.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smtt.c
new file mode 100644
index 0000000..bfb30f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smtt.c
@@ -0,0 +1,44 @@
+/* This is a test program for smtt instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smtt (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smtt (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smtt (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smtt (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int va_p = 2;
+#else
+  int va_p = 1;
+#endif
+
+  int a = smtt (0x80000002, 0x80000001);
+
+  int va = v_smtt ((int16x2_t) {0xffff, 0x0002},
+		   (int16x2_t) {0xffff, 0x0001});
+
+  if (a != 0x40000000)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smul16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smul16.c
new file mode 100644
index 0000000..bb3fad4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smul16.c
@@ -0,0 +1,38 @@
+/* This is a test program for smul16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long smul16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smul16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int32x2_t v_smul16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smul16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = smul16 (0xffff0000, 0x0001ffff);
+  int32x2_t va = v_smul16 ((int16x2_t) {0xffff, 0},
+			   (int16x2_t) {0x0001, 0xffff});
+
+  if (a != 0xffffffff00000000)
+    abort ();
+  else if (va[0] != 0xffffffff
+           || va[1] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smulx16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smulx16.c
new file mode 100644
index 0000000..0e65a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smulx16.c
@@ -0,0 +1,37 @@
+/* This is a test program for smulx16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long smulx16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smulx16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int32x2_t v_smulx16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smulx16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = smulx16 (0xffff0000, 0xffff0001);
+  int32x2_t va = v_smulx16 ((int16x2_t) {0xffff, 0xffff},
+			    (int16x2_t) {1, 0});
+  if (a != 0xffffffff00000000)
+    abort ();
+  else if (va[0] != 0
+           || va[1] != 0xffffffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-smxds.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smxds.c
new file mode 100644
index 0000000..e429aa3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-smxds.c
@@ -0,0 +1,45 @@
+/* This is a test program for smxds instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int smxds (unsigned int ra, unsigned int rb)
+{
+  return __nds32__smxds (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int v_smxds (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_smxds (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int a_p = 0x8000;
+  int va_p = 0xffffffff;
+#else
+  int a_p = 0x8000;
+  int va_p = 1;
+#endif
+
+  int a = smxds (0x80000002, 0x80000001);
+  int va = v_smxds ((int16x2_t) {0xffff, 0x0002},
+		    (int16x2_t) {0xffff, 0x0001});
+
+  if (a != a_p)
+    abort ();
+  else if (va != va_p)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16.c
new file mode 100644
index 0000000..7d85032
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16.c
@@ -0,0 +1,37 @@
+/* This is a test program for sra16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sra16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__sra16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sra16 (int16x2_t ra, unsigned int rb)
+{
+  return __nds32__v_sra16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = sra16 (0x0ffff000, 4);
+  int16x2_t va = v_sra16 ((int16x2_t) {0x7fff, 0x8000}, 4);
+
+  if (a != 0x00ffff00)
+    abort ();
+  else if (va[0] != 0x7ff
+	   || va[1] != (short) 0xf800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16u.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16u.c
new file mode 100644
index 0000000..5bc127c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sra16u.c
@@ -0,0 +1,37 @@
+/* This is a test program for sra16.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sra16u (unsigned int ra, unsigned int rb)
+{
+  return __nds32__sra16_u (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sra16u (int16x2_t ra, unsigned int rb)
+{
+  return __nds32__v_sra16_u (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = sra16u (0x0ffff000, 4);
+  int16x2_t va = v_sra16u ((int16x2_t) {0x7fff, 0x8000}, 4);
+
+  if (a != 0x100ff00)
+    abort ();
+  else if (va[0] != 0x800
+	   || va[1] != (short) 0xf800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16.c
new file mode 100644
index 0000000..f3c6e16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16.c
@@ -0,0 +1,39 @@
+/* This is a test program for srai16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srai16 (unsigned int ra)
+{
+  return __nds32__sra16 (ra, 4);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_srai16 (int16x2_t ra)
+{
+  return __nds32__v_sra16 (ra, 4);
+}
+
+int
+main ()
+{
+  /* The expected values below are those of a per-halfword arithmetic
+     right shift by 4: negative halfwords keep their sign bits.  */
+  unsigned int a = srai16 (0x0ffff000);
+  int16x2_t va = v_srai16 ((int16x2_t) {0x7fff, 0x8000});
+
+  if (a != 0x00ffff00)
+    abort ();
+  else if (va[0] != 0x7ff
+	   || va[1] != (short) 0xf800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16u.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16u.c
new file mode 100644
index 0000000..380bd2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srai16u.c
@@ -0,0 +1,37 @@
+/* This is a test program for srai16.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srai16u (unsigned int ra)
+{
+  return __nds32__sra16_u (ra, 4);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_srai16u (int16x2_t ra)
+{
+  return __nds32__v_sra16_u (ra, 4);
+}
+
+int
+main ()
+{
+  unsigned int a = srai16u (0x0ffff000);
+  int16x2_t va = v_srai16u ((int16x2_t) {0x7fff, 0x8000});
+
+  if (a != 0x100ff00)
+    abort ();
+  else if (va[0] != 0x800
+	   || va[1] != (short) 0xf800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sraiu.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sraiu.c
new file mode 100644
index 0000000..4090762
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sraiu.c
@@ -0,0 +1,27 @@
+/* This is a test program for srai.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int sraiu (int ra)
+{
+  return __nds32__sra_u (ra, 8);
+}
+
+int
+main ()
+{
+  int a = sraiu (0xf00ff);
+
+  if (a != 0xf01)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srau.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srau.c
new file mode 100644
index 0000000..e3a3137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srau.c
@@ -0,0 +1,27 @@
+/* This is a test program for sra.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+int srau (int ra, unsigned int rb)
+{
+  return __nds32__sra_u (ra, rb);
+}
+
+int
+main ()
+{
+  int a = srau (0xf00ff, 8);
+
+  if (a != 0xf01)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16.c
new file mode 100644
index 0000000..8aa9c59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16.c
@@ -0,0 +1,37 @@
+/* This is a test program for srl16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srl16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__srl16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_srl16 (uint16x2_t ra, unsigned int rb)
+{
+  return __nds32__v_srl16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = srl16 (0x0f00f000, 4);
+  uint16x2_t va = v_srl16 ((uint16x2_t) {0x7fff, 0x8000}, 4);
+
+  if (a != 0xf00f00)
+    abort ();
+  else if (va[0] != 0x7ff
+	   || va[1] != 0x0800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16u.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16u.c
new file mode 100644
index 0000000..3f4ac5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srl16u.c
@@ -0,0 +1,37 @@
+/* This is a test program for srl16.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srl16_u (unsigned int ra, unsigned int rb)
+{
+  return __nds32__srl16_u (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_srl16_u (uint16x2_t ra, unsigned int rb)
+{
+  return __nds32__v_srl16_u (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = srl16_u (0x0f00f000, 4);
+  uint16x2_t va = v_srl16_u ((uint16x2_t) {0x7fff, 0x8000}, 4);
+
+  if (a != 0xf00f00)
+    abort ();
+  else if (va[0] != 0x800
+	   || va[1] != 0x800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16.c
new file mode 100644
index 0000000..200bf8c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16.c
@@ -0,0 +1,37 @@
+/* This is a test program for srli16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srli16 (unsigned int ra)
+{
+  return __nds32__srl16 (ra, 4);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_srli16 (uint16x2_t ra)
+{
+  return __nds32__v_srl16 (ra, 4);
+}
+
+int
+main ()
+{
+  unsigned int a = srli16 (0x0f00f000);
+  uint16x2_t va = v_srli16 ((uint16x2_t) {0x7fff, 0x8000});
+
+  if (a != 0xf00f00)
+    abort ();
+  else if (va[0] != 0x7ff
+	   || va[1] != 0x0800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16u.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16u.c
new file mode 100644
index 0000000..808319b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-srli16u.c
@@ -0,0 +1,37 @@
+/* This is a test program for srli16.u instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int srli16_u (unsigned int ra)
+{
+  return __nds32__srl16_u (ra, 4);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_srli16_u (uint16x2_t ra)
+{
+  return __nds32__v_srl16_u (ra, 4);
+}
+
+int
+main ()
+{
+  unsigned int a = srli16_u (0x0f00f000);
+  uint16x2_t va = v_srli16_u ((uint16x2_t) {0x7fff, 0x8000});
+
+  if (a != 0xf00f00)
+    abort ();
+  else if (va[0] != 0x800
+	   || va[1] != 0x800)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub16.c
new file mode 100644
index 0000000..eff5f92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub16.c
@@ -0,0 +1,49 @@
+/* This is a test program for sub16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sub16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__sub16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_usub16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_usub16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_ssub16 (int16x2_t ra, int16x2_t rb)
+{
+  return __nds32__v_ssub16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = sub16 (0x00010000, 0x00010001);
+  uint16x2_t v_ua = v_usub16 ((uint16x2_t) {0x1000, 0x0001},
+			      (uint16x2_t) {0xf000, 0x0000});
+  int16x2_t v_sa = v_ssub16 ((int16x2_t) {0x7777, 0x2111},
+			     (int16x2_t) {0x1000, 0x2000});
+
+  if (a != 0x0000ffff)
+    abort ();
+  else if (v_ua[0] != 0x2000
+	   || v_ua[1] != 0x0001)
+    abort ();
+  else if (v_sa[0] != 0x6777
+	   || v_sa[1] != 0x0111)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub64.c
new file mode 100644
index 0000000..efdd879
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub64.c
@@ -0,0 +1,36 @@
+/* This is a test program for sub64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+long long ssub64 (long long ra, long long rb)
+{
+  return __nds32__ssub64 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+unsigned long long usub64 (unsigned long long ra, unsigned long long rb)
+{
+  return __nds32__usub64 (ra, rb);
+}
+
+int
+main ()
+{
+  long long sa = ssub64 (0x100000000ll, 0xffffffffll);
+  unsigned long long ua = usub64 (0xf00000000ull, 0x1111ull);
+
+  if (sa != 1ll)
+    abort ();
+  else if (ua != 0xeffffeeefull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub8.c
new file mode 100644
index 0000000..b21f8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sub8.c
@@ -0,0 +1,53 @@
+/* This is a test program for sub8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sub8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__sub8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_usub8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_usub8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+int8x4_t v_ssub8 (int8x4_t ra, int8x4_t rb)
+{
+  return __nds32__v_ssub8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = sub8 (0x55667788, 0x11223344);
+  uint8x4_t v_ua = v_usub8 ((uint8x4_t) {0xff, 0xee, 0xee, 0xcc},
+			    (uint8x4_t) {0x1, 0xee, 0xdd, 0xdd});
+  int8x4_t v_sa = v_ssub8 ((int8x4_t) {0x81, 0x0, 0xdd, 0xaa},
+			   (int8x4_t) {0x80, 0x1, 0xcc, 0xaa});
+
+  if (a != 0x44444444)
+    abort ();
+  else if (v_ua[0] != 0xfe
+	   || v_ua[1] != 0
+	   || v_ua[2] != 0x11
+	   || v_ua[3] != 0xef)
+    abort ();
+  else if (v_sa[0] != 1
+	   || v_sa[1] != (char) 0xff
+	   || v_sa[2] != 0x11
+	   || v_sa[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd810.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd810.c
new file mode 100644
index 0000000..29fff3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd810.c
@@ -0,0 +1,43 @@
+/* This is a test program for sunpkd810 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sunpkd810 (unsigned int a)
+{
+  return __nds32__sunpkd810 (a);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sunpkd810 (int8x4_t a)
+{
+  return __nds32__v_sunpkd810 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xfff8, 0x56};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = sunpkd810 (0x000056f8);
+  int16x2_t va = v_sunpkd810 ((int8x4_t) {0xf8, 0x56, 0, 0});
+
+  if (a != 0x0056fff8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd820.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd820.c
new file mode 100644
index 0000000..43f969a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd820.c
@@ -0,0 +1,43 @@
+/* This is a test program for sunpkd820 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sunpkd820 (unsigned int a)
+{
+  return __nds32__sunpkd820 (a);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sunpkd820 (int8x4_t a)
+{
+  return __nds32__v_sunpkd820 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xfff8, 0x34};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = sunpkd820 (0x003400f8);
+  int16x2_t va = v_sunpkd820 ((int8x4_t) {0xf8, 0, 0x34, 0});
+
+  if (a != 0x0034fff8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd830.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd830.c
new file mode 100644
index 0000000..76540b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd830.c
@@ -0,0 +1,37 @@
+/* This is a test program for sunpkd830 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sunpkd830 (unsigned int a)
+{
+  return __nds32__sunpkd830 (a);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sunpkd830 (int8x4_t a)
+{
+  return __nds32__v_sunpkd830 (a);
+}
+
+int
+main ()
+{
+  unsigned int a = sunpkd830 (0x120000f8);
+  int16x2_t va = v_sunpkd830 ((int8x4_t) {0xf8, 0x00, 0, 0x12});
+
+  if (a != 0x0012fff8)
+    abort ();
+  else if (va[0] != (short) 0xfff8
+           || va[1] != 0x0012)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd831.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd831.c
new file mode 100644
index 0000000..05149e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-sunpkd831.c
@@ -0,0 +1,43 @@
+/* This is a test program for sunpkd831 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int sunpkd831 (unsigned int a)
+{
+  return __nds32__sunpkd831 (a);
+}
+
+static __attribute__ ((noinline))
+int16x2_t v_sunpkd831 (int8x4_t a)
+{
+  return __nds32__v_sunpkd831 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xfff8, 0x12};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = sunpkd831 (0x1200f800);
+  int16x2_t va = v_sunpkd831 ((int8x4_t) {0, 0xf8, 0, 0x12});
+
+  if (a != 0x0012fff8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple16.c
new file mode 100644
index 0000000..17b5344
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple16.c
@@ -0,0 +1,37 @@
+/* This is a test program for ucmple16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ucmple16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ucmple16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ucmple16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ucmple16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ucmple16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_ucmple16 ((uint16x2_t) {0x7fff, 0x7ffe},
+			      (uint16x2_t) {0x7ffe, 0x7fff});
+  if (a != 0xffff0000)
+    abort ();
+  else if (va[0] != 0
+           || va[1] != 0xffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple8.c
new file mode 100644
index 0000000..561b500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmple8.c
@@ -0,0 +1,40 @@
+/* This is a test program for ucmple8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ucmple8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ucmple8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_ucmple8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_ucmple8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ucmple8 (0xfefe0101, 0xffff0000);
+  uint8x4_t va = v_ucmple8 ((uint8x4_t) {0x7e, 0x7e, 0x01, 0x01},
+			    (uint8x4_t) {0x7f, 0x7f, 0x00, 0x00});
+
+  if (a != 0xffff0000)
+    abort ();
+  else if (va[0] != 0xff
+           || va[1] != 0xff
+           || va[2] != 0
+	   || va[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt16.c
new file mode 100644
index 0000000..820ce1e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt16.c
@@ -0,0 +1,37 @@
+/* This is a test program for ucmplt16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ucmplt16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ucmplt16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ucmplt16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ucmplt16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ucmplt16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_ucmplt16 ((uint16x2_t) {0x7fff, 0x7ffe},
+			      (uint16x2_t) {0x7ffe, 0x7fff});
+  if (a != 0xffff0000)
+    abort ();
+  else if (va[0] != 0
+           || va[1] != 0xffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt8.c
new file mode 100644
index 0000000..8001586
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ucmplt8.c
@@ -0,0 +1,40 @@
+/* This is a test program for ucmplt8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ucmplt8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ucmplt8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_ucmplt8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_ucmplt8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ucmplt8 (0xfefe0101, 0xffff0000);
+  uint8x4_t va = v_ucmplt8 ((uint8x4_t) {0x7e, 0x7e, 0x01, 0x01},
+			    (uint8x4_t) {0x7f, 0x7f, 0x00, 0x00});
+
+  if (a != 0xffff0000)
+    abort ();
+  else if (va[0] != 0xff
+           || va[1] != 0xff
+           || va[2] != 0
+	   || va[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umar64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umar64.c
new file mode 100644
index 0000000..ac32ae1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umar64.c
@@ -0,0 +1,27 @@
+/* This is a test program for umar64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long umar64 (unsigned long long t,unsigned int a,unsigned int b)
+{
+  return __nds32__umar64 (t, a, b);
+}
+
+int
+main ()
+{
+  unsigned long long a = umar64 (0xf000000000000000ull, 0x12345678, 0x23);
+
+  if (a != 0xf00000027d27d268ull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax16.c
new file mode 100644
index 0000000..99a43d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax16.c
@@ -0,0 +1,37 @@
+/* This is a test program for umax16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int umax16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__umax16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_umax16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_umax16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = umax16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_umax16 ((uint16x2_t) {0xffff, 0},
+			    (uint16x2_t) {0xfffe, 1});
+  if (a != 0xffff0001)
+    abort ();
+  else if (va[0] != 0xffff
+           || va[1] != 1)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax8.c
new file mode 100644
index 0000000..23904b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umax8.c
@@ -0,0 +1,41 @@
+/* This is a test program for umax8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int umax8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__umax8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_umax8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_umax8 (ra, rb);
+}
+
+
+int
+main ()
+{
+  unsigned int a = umax8 (0xffff0000, 0xfffe0001);
+  uint8x4_t va = v_umax8 ((uint8x4_t) {0xff, 0xff, 0x01, 0x01},
+			  (uint8x4_t) {0xfe, 0xfe, 0x00, 0x00});
+
+  if (a != 0xffff0001)
+    abort ();
+  else if (va[0] != 0xff
+           || va[1] != 0xff
+           || va[2] != 1
+	   || va[3] != 1)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umin16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umin16.c
new file mode 100644
index 0000000..eec7058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umin16.c
@@ -0,0 +1,37 @@
+/* This is a test program for umin16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int umin16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__umin16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_umin16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_umin16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = umin16 (0xfffe0001, 0xffff0000);
+  uint16x2_t va = v_umin16 ((uint16x2_t) {0x7fff, 0},
+			    (uint16x2_t) {0x7ffe, 1});
+  if (a != 0xfffe0000)
+    abort ();
+  else if (va[0] != 0x7ffe
+           || va[1] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umsr64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umsr64.c
new file mode 100644
index 0000000..3fb20bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umsr64.c
@@ -0,0 +1,27 @@
+/* This is a test program for umsr64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long umsr64 (unsigned long long t, unsigned int a, unsigned int b)
+{
+  return __nds32__umsr64 (t, a, b);
+}
+
+int
+main ()
+{
+  unsigned long long a = umsr64 (0x5000000300000000ull, 0x12345678, 0x23);
+
+  if (a != 0x5000000082D82D98ull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umul16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umul16.c
new file mode 100644
index 0000000..ddfb6be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umul16.c
@@ -0,0 +1,37 @@
+/* This is a test program for umul16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long umul16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__umul16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint32x2_t v_umul16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_umul16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = umul16 (0xffff0000, 0x0001ffff);
+  uint32x2_t va = v_umul16 ((uint16x2_t) {0xffff, 0},
+			    (uint16x2_t) {0x0001, 0xffff});
+  if (a != 0xffff00000000)
+    abort ();
+  else if (va[0] != 0xffff
+           || va[1] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-umulx16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umulx16.c
new file mode 100644
index 0000000..c57d304
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-umulx16.c
@@ -0,0 +1,37 @@
+/* This is a test program for umulx16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long umulx16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__umulx16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint32x2_t v_umulx16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_umulx16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = umulx16 (0xffff0000, 0xffff0001);
+  uint32x2_t va = v_umulx16 ((uint16x2_t) {0xffff, 0xffff},
+			     (uint16x2_t) {1, 0});
+  if (a != 0xffff00000000)
+    abort ();
+  else if (va[0] != 0
+           || va[1] != 0xffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd16.c
new file mode 100644
index 0000000..82c7be7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd16.c
@@ -0,0 +1,38 @@
+/* This is a test program for uradd16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int uradd16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__uradd16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_uradd16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_uradd16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = uradd16 (0x7fff7fff, 0x7fff7fff);
+  uint16x2_t va = v_uradd16 ((uint16x2_t) {0x8000, 0x4000},
+			     (uint16x2_t) {0x8000, 0x8000});
+
+  if (a != 0x7fff7fff)
+    abort ();
+  else if (va[0] != 0x8000
+	   || va[1] != 0x6000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd64.c
new file mode 100644
index 0000000..51ee961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd64.c
@@ -0,0 +1,27 @@
+/* This is a test program for uradd64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long uradd64 (unsigned long long ra, unsigned long long rb)
+{
+  return __nds32__uradd64 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = uradd64 (0xf000000000000000ull, 0xf000000000000000ull);
+
+  if (a != 0xf000000000000000ull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd8.c
new file mode 100644
index 0000000..d4f91d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uradd8.c
@@ -0,0 +1,40 @@
+/* This is a test program for uradd8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int uradd8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__uradd8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_uradd8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_uradd8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = uradd8 (0x11223344, 0x55667788);
+  uint8x4_t va = v_uradd8 ((uint8x4_t) {0x7f, 0x80, 0x40, 0xaa},
+			   (uint8x4_t) {0x7f, 0x80, 0x80, 0xaa});
+
+  if (a != 0x33445566)
+    abort ();
+  else if (va[0] != 0x7f
+	   || va[1] != 0x80
+	   || va[2] != 0x60
+	   || va[3] != 0xaa)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-uraddw.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uraddw.c
new file mode 100644
index 0000000..9fc76b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-uraddw.c
@@ -0,0 +1,27 @@
+/* This is a test program for uraddw instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int uraddw (unsigned int ra, unsigned int rb)
+{
+  return __nds32__uraddw (ra, rb);
+}
+
+int
+main ()
+{
+  /* main must return int; the test expects uraddw (x, x) == x here.  */
+  unsigned int a = uraddw (0x80000000, 0x80000000);
+  if (a != 0x80000000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcras16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcras16.c
new file mode 100644
index 0000000..1330374
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcras16.c
@@ -0,0 +1,44 @@
+/* This is a test program for urcras16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int urcras16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__urcras16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_urcras16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_urcras16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0xffff, 0x8000};
+#else
+  uint16x2_t va_p = {0x7fff, 0};
+#endif
+
+  unsigned int a = urcras16 (0x7fff7fff, 0x80007fff);
+  uint16x2_t va = v_urcras16 ((uint16x2_t) {0x7fff, 0x8000},
+			      (uint16x2_t) {0x8000, 0x8000});
+
+  if (a != 0x7fffffff)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcrsa16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcrsa16.c
new file mode 100644
index 0000000..806fa7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-urcrsa16.c
@@ -0,0 +1,44 @@
+/* This is a test program for urcrsa16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int urcrsa16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__urcrsa16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_urcrsa16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_urcrsa16 (ra, rb);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  uint16x2_t va_p = {0x8000, 0xffff};
+#else
+  uint16x2_t va_p = {0, 0x7fff};
+#endif
+
+  unsigned int a = urcrsa16 (0x7fff7fff, 0x7fff8000);
+  uint16x2_t va = v_urcrsa16 ((uint16x2_t) {0x8000, 0x7fff},
+			      (uint16x2_t) {0x8000, 0x8000});
+
+  if (a != 0xffff7fff)
+    abort ();
+  else if (va[0] != va_p[0]
+	   || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub16.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub16.c
new file mode 100644
index 0000000..9e87234
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub16.c
@@ -0,0 +1,38 @@
+/* This is a test program for ursub16 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ursub16 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ursub16 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_ursub16 (uint16x2_t ra, uint16x2_t rb)
+{
+  return __nds32__v_ursub16 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ursub16 (0x7fff7fff, 0x80008000);
+  uint16x2_t va = v_ursub16 ((uint16x2_t) {0x8000, 0x8000},
+			     (uint16x2_t) {0x7fff, 0x4000});
+
+  if (a != 0xffffffff)
+    abort ();
+  else if (va[0] != 0
+	   || va[1] != 0x2000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub64.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub64.c
new file mode 100644
index 0000000..e1f7b15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub64.c
@@ -0,0 +1,27 @@
+/* This is a test program for ursub64 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned long long ursub64 (unsigned long long ra, unsigned long long rb)
+{
+  return __nds32__ursub64 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned long long a = ursub64 (0xeull, 0xfull);
+
+  if (a != 0xffffffffffffffffull)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub8.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub8.c
new file mode 100644
index 0000000..f5e3ff6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursub8.c
@@ -0,0 +1,40 @@
+/* This is a test program for ursub8 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ursub8 (unsigned int ra, unsigned int rb)
+{
+  return __nds32__ursub8 (ra, rb);
+}
+
+static __attribute__ ((noinline))
+uint8x4_t v_ursub8 (uint8x4_t ra, uint8x4_t rb)
+{
+  return __nds32__v_ursub8 (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ursub8 (0x55667788, 0x11223344);
+  uint8x4_t va = v_ursub8 ((uint8x4_t) {0x7f, 0x80, 0x80, 0xaa},
+			   (uint8x4_t) {0x80, 0x7f, 0x40, 0xaa});
+
+  if (a != 0x22222222)
+    abort ();
+  else if (va[0] != 0xff
+	   || va[1] != 0
+	   || va[2] != 0x20
+	   || va[3] != 0)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursubw.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursubw.c
new file mode 100644
index 0000000..b12afb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-ursubw.c
@@ -0,0 +1,27 @@
+/* This is a test program for ursubw instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int ursubw (unsigned int ra,unsigned int rb)
+{
+  return __nds32__ursubw (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = ursubw (0x80000000, 0x40000000);
+
+  if (a != 0x20000000)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-wext.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-wext.c
new file mode 100644
index 0000000..d86fb8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-wext.c
@@ -0,0 +1,27 @@
+/* This is a test program for wext instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int wext (long long ra, unsigned int rb)
+{
+  return __nds32__wext (ra, rb);
+}
+
+int
+main ()
+{
+  unsigned int a = wext (0x1234ffff0000ll, 16);
+
+  if (a != 0x1234ffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-wexti.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-wexti.c
new file mode 100644
index 0000000..8f09423
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-wexti.c
@@ -0,0 +1,27 @@
+/* This is a test program for wexti instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int wexti (long long ra)
+{
+  return __nds32__wext (ra, 16);
+}
+
+int
+main ()
+{
+  unsigned int a = wexti (0x1234ffff0000ll);
+
+  if (a != 0x1234ffff)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd810.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd810.c
new file mode 100644
index 0000000..7b3aebb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd810.c
@@ -0,0 +1,43 @@
+/* This is a test program for zunpkd810 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int zunpkd810 (unsigned int a)
+{
+  return __nds32__zunpkd810 (a);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_zunpkd810 (uint8x4_t a)
+{
+  return __nds32__v_zunpkd810 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xf8, 0x56};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = zunpkd810 (0x000056f8);
+  uint16x2_t va = v_zunpkd810 ((uint8x4_t) {0xf8, 0x56, 0, 0});
+
+  if (a != 0x005600f8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd820.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd820.c
new file mode 100644
index 0000000..dc37a3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd820.c
@@ -0,0 +1,43 @@
+/* This is a test program for zunpkd820 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int zunpkd820 (unsigned int a)
+{
+  return __nds32__zunpkd820 (a);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_zunpkd820 (uint8x4_t a)
+{
+  return __nds32__v_zunpkd820 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xf8, 0x34};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = zunpkd820 (0x003400f8);
+  uint16x2_t va = v_zunpkd820 ((uint8x4_t) {0xf8, 0, 0x34, 0});
+
+  if (a != 0x003400f8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd830.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd830.c
new file mode 100644
index 0000000..8f5a224
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd830.c
@@ -0,0 +1,37 @@
+/* This is a test program for zunpkd830 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int zunpkd830 (unsigned int a)
+{
+  return __nds32__zunpkd830 (a);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_zunpkd830 (uint8x4_t a)
+{
+  return __nds32__v_zunpkd830 (a);
+}
+
+int
+main ()
+{
+  unsigned int a = zunpkd830 (0x120000f8);
+  uint16x2_t va = v_zunpkd830 ((uint8x4_t) { 0xf8, 0x00, 0, 0x12});
+
+  if (a != 0x001200f8)
+    abort ();
+  else if (va[0] != 0x00f8
+           || va[1] != 0x0012)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd831.c b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd831.c
new file mode 100644
index 0000000..6878cd3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-dsp-zunpkd831.c
@@ -0,0 +1,43 @@
+/* This is a test program for zunpkd831 instruction.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+static __attribute__ ((noinline))
+unsigned int zunpkd831 (unsigned int a)
+{
+  return __nds32__zunpkd831 (a);
+}
+
+static __attribute__ ((noinline))
+uint16x2_t v_zunpkd831 (uint8x4_t a)
+{
+  return __nds32__v_zunpkd831 (a);
+}
+
+int
+main ()
+{
+#ifdef __NDS32_EL__
+  int16x2_t va_p = {0xf8, 0x12};
+#else
+  int16x2_t va_p = {0, 0};
+#endif
+
+  unsigned int a = zunpkd831 (0x1200f800);
+  uint16x2_t va = v_zunpkd831 ((uint8x4_t) {0, 0xf8, 0, 0x12});
+
+  if (a != 0x001200f8)
+    abort ();
+  else if (va[0] != va_p[0]
+           || va[1] != va_p[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyd.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyd.c
new file mode 100644
index 0000000..4ee7e5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyd.c
@@ -0,0 +1,21 @@
+/* This is a test program for fcpysd instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu_dp } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  double da = -1.5;
+  double db = 1.3;
+  double dr = __nds32__fcpysd (da, db);
+
+  if (dr != 1.5)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpynd.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpynd.c
new file mode 100644
index 0000000..804410b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpynd.c
@@ -0,0 +1,21 @@
+/* This is a test program for fcpynsd instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu_dp } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  double da = -1.5;
+  double db = -1.3;
+  double dr =  __nds32__fcpynsd (da, db);
+
+  if (dr != 1.5)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyns.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyns.c
new file mode 100644
index 0000000..0d86734
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpyns.c
@@ -0,0 +1,21 @@
+/* This is a test program for fcpynss instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu_sp } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  float a = -1.5;
+  float b = -1.3;
+  float r = __nds32__fcpynss (a, b);
+
+  if (r != 1.5)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpys.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpys.c
new file mode 100644
index 0000000..4bccf57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fcpys.c
@@ -0,0 +1,21 @@
+/* This is a test program for fcpyss instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu_sp } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  float a = -1.5;
+  float b = 1.3;
+  float r = __nds32__fcpyss (a, b);
+
+  if (r != 1.5)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fmfcfg.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fmfcfg.c
new file mode 100644
index 0000000..83e65ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fmfcfg.c
@@ -0,0 +1,23 @@
+/* This is a test program for fmfcfg instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int intrinsic_fmfcfg = -1;
+  unsigned int inline_assemble_fmfcfg = -2;
+
+  intrinsic_fmfcfg = __nds32__fmfcfg ();
+  __asm volatile ("fmfcfg %0" : "=r" (inline_assemble_fmfcfg));
+
+  if (intrinsic_fmfcfg == inline_assemble_fmfcfg)
+    exit (0);
+  else
+    abort ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-fpu-fpcsr.c b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fpcsr.c
new file mode 100644
index 0000000..787b430
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-fpu-fpcsr.c
@@ -0,0 +1,35 @@
+/* This is a test program for fmtcsr/fmfcsr instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_fpu } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int fpcsr;
+  unsigned int real_fpcsr;
+
+  /* Save the current fpcsr so it can be written back before exiting.  */
+  real_fpcsr = __nds32__fmfcsr ();
+
+  /* Write a known value (3) to fpcsr through the fmtcsr intrinsic.  */
+  fpcsr = 3;
+  __nds32__fmtcsr (fpcsr);
+
+  /* Read fpcsr back through fmfcsr and keep only its low 13 bits.  */
+  fpcsr = 0;
+  fpcsr = __nds32__fmfcsr ();
+  fpcsr = fpcsr & 0x00001fff;
+
+  /* Restore the saved fpcsr before reporting the result.  */
+  __nds32__fmtcsr (real_fpcsr);
+
+  if (fpcsr == 3)
+    exit (0);
+  else
+   abort ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-get-lp.c b/gcc/testsuite/gcc.target/nds32/builtin-get-lp.c
new file mode 100644
index 0000000..80b4921
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-get-lp.c
@@ -0,0 +1,22 @@
+/* Verify the return address with builtin function.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int main()
+{
+  unsigned int intrinsic_lp = -1;
+  unsigned int inline_assemble_lp = -2;
+
+  intrinsic_lp = __nds32__return_address ();
+
+  __asm volatile ("mov55 %0, $lp" : "=r" (inline_assemble_lp));
+
+  if (intrinsic_lp != inline_assemble_lp)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-isb.c b/gcc/testsuite/gcc.target/nds32/builtin-isb.c
deleted file mode 100644
index e65061b..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-isb.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/* Verify that we generate isb instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tisb" } }  */
-
-void
-test (void)
-{
-  __builtin_nds32_isb ();
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-isync.c b/gcc/testsuite/gcc.target/nds32/builtin-isync.c
deleted file mode 100644
index 3160e4a..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-isync.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Verify that we generate isync instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tisync" } }  */
-
-void
-test (void)
-{
-  int *addr = (int *) 0x53000000;
-  __builtin_nds32_isync (addr);
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-mfsr-mtsr.c b/gcc/testsuite/gcc.target/nds32/builtin-mfsr-mtsr.c
deleted file mode 100644
index db4c558..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-mfsr-mtsr.c
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Verify that we generate mfsr/mtsr instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tmfsr" } }  */
-/* { dg-final { scan-assembler "\\tmtsr" } }  */
-
-#include <nds32_intrinsic.h>
-
-void
-test (void)
-{
-  int ipsw_value;
-
-  ipsw_value = __builtin_nds32_mfsr (__NDS32_REG_IPSW__);
-  __builtin_nds32_mtsr (ipsw_value, __NDS32_REG_IPSW__);
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-mfusr-mtusr.c b/gcc/testsuite/gcc.target/nds32/builtin-mfusr-mtusr.c
deleted file mode 100644
index 3cfaab9..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-mfusr-mtusr.c
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Verify that we generate mfusr/mtusr instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tmfusr" } }  */
-/* { dg-final { scan-assembler "\\tmtusr" } }  */
-
-#include <nds32_intrinsic.h>
-
-void
-test (void)
-{
-  int itype_value;
-
-  itype_value = __builtin_nds32_mfusr (__NDS32_REG_ITYPE__);
-  __builtin_nds32_mtusr (itype_value, __NDS32_REG_ITYPE__);
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-rotr.c b/gcc/testsuite/gcc.target/nds32/builtin-rotr.c
new file mode 100644
index 0000000..a295cb2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-rotr.c
@@ -0,0 +1,19 @@
+/* This is a test program for rotr instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int word = 1;
+
+  /* The test expects 4 after rotating the value 1 right by 30.  */
+  word = __nds32__rotr (word, 30);
+  if (word == 4)
+    exit (0);
+  abort ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-setgie-dis.c b/gcc/testsuite/gcc.target/nds32/builtin-setgie-dis.c
deleted file mode 100644
index 2dceed9..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-setgie-dis.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/* Verify that we generate setgie.d instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tsetgie.d" } }  */
-
-void
-test (void)
-{
-  __builtin_nds32_setgie_dis ();
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-setgie-en.c b/gcc/testsuite/gcc.target/nds32/builtin-setgie-en.c
deleted file mode 100644
index 8928870..0000000
--- a/gcc/testsuite/gcc.target/nds32/builtin-setgie-en.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/* Verify that we generate setgie.e instruction with builtin function.  */
-
-/* { dg-do compile }  */
-/* { dg-options "-O0" }  */
-/* { dg-final { scan-assembler "\\tsetgie.e" } }  */
-
-void
-test (void)
-{
-  __builtin_nds32_setgie_en ();
-}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-setgie_mtsr_mfsr.c b/gcc/testsuite/gcc.target/nds32/builtin-setgie_mtsr_mfsr.c
new file mode 100644
index 0000000..b353909
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-setgie_mtsr_mfsr.c
@@ -0,0 +1,43 @@
+/* This is a test program for checking gie with
+   mtsr/mfsr instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int psw;
+  unsigned int gie;
+  unsigned int real_psw;
+
+  /* Save the PSW so it can be restored before the test finishes.  */
+  real_psw = __nds32__mfsr (NDS32_SR_PSW);
+
+  /* Step 1: after __nds32__setgie_en, bit 0 of the PSW must read as 1.  */
+  __nds32__setgie_en ();
+  __nds32__dsb (); /* This is needed for waiting pipeline.  */
+  psw = __nds32__mfsr (NDS32_SR_PSW);
+
+  gie = psw & 0x00000001;
+  if (gie != 1)
+    abort ();
+
+  /* Step 2: clearing bit 0 through mtsr must make it read back as 0.  */
+  psw = psw & 0xFFFFFFFE;
+  __nds32__mtsr (psw, NDS32_SR_PSW);
+  __nds32__dsb (); /* This is needed for waiting pipeline.  */
+  psw = __nds32__mfsr (NDS32_SR_PSW);
+  gie = psw & 0x00000001;
+
+  /* Recover PSW value.  */
+  __nds32__mtsr (real_psw, NDS32_SR_PSW);
+
+  if (gie != 0)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-sp.c b/gcc/testsuite/gcc.target/nds32/builtin-sp.c
new file mode 100644
index 0000000..2e5499d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-sp.c
@@ -0,0 +1,33 @@
+/* This is a test program for sp intrinsic usage.
+   Because we want to use frame pointer to access local variable,
+   we need to use -fno-omit-frame-pointer to make sure the existence
+   of frame pointer.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0 -fno-omit-frame-pointer" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int old_sp, new_sp;
+
+  old_sp = __nds32__get_current_sp ();
+  new_sp = old_sp - 4;
+  __nds32__set_current_sp (new_sp);
+  new_sp = __nds32__get_current_sp ();
+
+  if (new_sp != (old_sp - 4))
+    abort ();
+
+  new_sp = new_sp + 4;
+  __nds32__set_current_sp (new_sp);
+  new_sp = __nds32__get_current_sp ();
+
+  if (new_sp != old_sp)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-string-ffb.c b/gcc/testsuite/gcc.target/nds32/builtin-string-ffb.c
new file mode 100644
index 0000000..cf02434
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-string-ffb.c
@@ -0,0 +1,28 @@
+/* This is a test program for ffb instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_string } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  /* The value expected from ffb depends on the target byte order.  */
+#ifdef __NDS32_EL__
+  const int expected = -3;
+#else
+  const int expected = -2;
+#endif
+  unsigned int word = 0x1b2a3d4c;
+  unsigned int byte = 0x0000003d;
+  int r;
+
+  r = __nds32__ffb (word, byte);
+  if (r != expected)
+    abort ();
+
+  exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-string-ffmism.c b/gcc/testsuite/gcc.target/nds32/builtin-string-ffmism.c
new file mode 100644
index 0000000..b2fb008
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-string-ffmism.c
@@ -0,0 +1,28 @@
+/* This is a test program for ffmism instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_string } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  /* The value expected from ffmism depends on the target byte order.  */
+#ifdef __NDS32_EL__
+  const int expected = -3;
+#else
+  const int expected = -4;
+#endif
+  unsigned int lhs = 0x1b2a3d4c;
+  unsigned int rhs = 0x112a334c;
+  int r;
+
+  r = __nds32__ffmism (lhs, rhs);
+  if (r != expected)
+    abort ();
+
+  exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-string-flmism.c b/gcc/testsuite/gcc.target/nds32/builtin-string-flmism.c
new file mode 100644
index 0000000..105fce5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-string-flmism.c
@@ -0,0 +1,28 @@
+/* This is a test program for flmism instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+/* { dg-require-effective-target nds32_ext_string } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  /* The value expected from flmism depends on the target byte order.  */
+#ifdef __NDS32_EL__
+  const int expected = -1;
+#else
+  const int expected = -2;
+#endif
+  unsigned int lhs = 0x1b2a3d4c;
+  unsigned int rhs = 0x112a334c;
+  int r;
+
+  r = __nds32__flmism (lhs, rhs);
+  if (r != expected)
+    abort ();
+
+  exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s16x2.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s16x2.c
new file mode 100644
index 0000000..5a2e8b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s16x2.c
@@ -0,0 +1,36 @@
+/* This is a test program for the get/put unaligned s16x2 intrinsics.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+int
+main (void)
+{
+  /* Spare 5th byte keeps the two-lane access at data[1] in bounds.  */
+  char data[] = {0x55,0x66,0x77,0x88,0x99};
+  short* short_data = (short*)& data[1];
+  int16x2_t test_short = {0x1111, 0xaaaa};
+  int16x2_t vecdata =  __nds32__get_unaligned_s16x2 (short_data);
+
+#ifdef __NDS32_EL__
+  if (vecdata[0] != 0x7766)
+    abort ();
+#else
+  if (vecdata[0] != 0x6677)
+    abort ();
+#endif
+  __nds32__put_unaligned_s16x2 (short_data, test_short);
+  vecdata =  __nds32__get_unaligned_s16x2 (short_data);
+
+  /* Compare lane with lane so the signed 0xaaaa lane can match itself.  */
+  if (vecdata[0] != test_short[0] || vecdata[1] != test_short[1])
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s8x4.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s8x4.c
new file mode 100644
index 0000000..f6cb4c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-s8x4.c
@@ -0,0 +1,36 @@
+/* This is a test program for the unaligned int8x4_t get/put builtins.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+int
+main (void)
+{
+  char data[] = {0x55,0x66,0x77,0x88,0x99}; /* 4 bytes are accessed at data[1].  */
+  char* char_data = (char*)& data[1];
+  int8x4_t test_char = {0x11, 0x22, 0xaa, 0xbb};
+  int8x4_t vecdata =  __nds32__get_unaligned_s8x4 (char_data);
+
+#ifdef __NDS32_EL__
+  if (vecdata[0] != 0x66)
+    abort ();
+#else
+  if (vecdata[0] != 0x66)
+    abort ();
+#endif
+  /* Store test_char through the unaligned pointer and read it back.  */
+  __nds32__put_unaligned_s8x4 (char_data, test_char);
+  vecdata =  __nds32__get_unaligned_s8x4 (char_data);
+  /* Fail if either lane is wrong; lane 3 (0xbb) is compared as unsigned.  */
+  if (vecdata[0] != 0x11
+      || (unsigned char) vecdata[3] != 0xbb)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u16x2.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u16x2.c
new file mode 100644
index 0000000..63ebd40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u16x2.c
@@ -0,0 +1,36 @@
+/* This is a test program for the unaligned uint16x2_t get/put builtins.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+int
+main (void)
+{
+  unsigned char data[] = {0x55,0x66,0x77,0x88,0x99}; /* 4 bytes used at data[1].  */
+  unsigned short* short_data = (unsigned short*)& data[1];
+  uint16x2_t test_short = {0x1111, 0xaaaa};
+  uint16x2_t vecdata =  __nds32__get_unaligned_u16x2 (short_data);
+
+#ifdef __NDS32_EL__
+  if (vecdata[0] != 0x7766)
+    abort ();
+#else
+  if (vecdata[0] != 0x6677)
+    abort ();
+#endif
+  /* Store test_short through the unaligned pointer and read it back.  */
+  __nds32__put_unaligned_u16x2 (short_data, test_short);
+  vecdata =  __nds32__get_unaligned_u16x2 (short_data);
+  /* Fail if either lane differs from what was stored.  */
+  if (vecdata[0] != 0x1111
+      || vecdata[1] != 0xaaaa)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u8x4.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u8x4.c
new file mode 100644
index 0000000..7b48274
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned-u8x4.c
@@ -0,0 +1,36 @@
+/* This is a test program for the unaligned uint8x4_t get/put builtins.  */
+
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+#ifdef __NDS32_EXT_DSP__
+int
+main (void)
+{
+  unsigned char data[] = {0x55,0x66,0x77,0x88,0x99}; /* 4 bytes used at data[1].  */
+  unsigned char* char_data = (unsigned char*)& data[1];
+  uint8x4_t test_char = {0x11, 0x22, 0xaa, 0xbb};
+  uint8x4_t vecdata =  __nds32__get_unaligned_u8x4 (char_data);
+
+#ifdef __NDS32_EL__
+  if (vecdata[0] != 0x66)
+    abort ();
+#else
+  if (vecdata[0] != 0x66)
+    abort ();
+#endif
+  /* Store test_char through the unaligned pointer and read it back.  */
+  __nds32__put_unaligned_u8x4 (char_data, test_char);
+  vecdata =  __nds32__get_unaligned_u8x4 (char_data);
+  /* Fail if either lane differs from what was stored (lane 3 is 0xbb).  */
+  if (vecdata[0] != 0x11
+      || vecdata[3] != 0xbb)
+    abort ();
+  else
+    exit (0);
+}
+#else
+int main(){return 0;}
+#endif
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned_dw.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_dw.c
new file mode 100644
index 0000000..42640b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_dw.c
@@ -0,0 +1,31 @@
+/* This is a test program for unaligned double word access.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0 -std=c99" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned char data[] = {0x55, 0x66, 0x77, 0x88, 0xAA,
+			  0xBB, 0xCC, 0xDD, 0xEE, 0xFF};
+  unsigned long long* long_long_data = (unsigned long long*) & data[1];
+  unsigned long long test_long_long = 0x1122334455667788LL;
+  /* Read starting at data[1]; the expected value depends on __NDS32_EL__.  */
+#ifdef __NDS32_EL__
+  if (__nds32__get_unaligned_dw (long_long_data) != 0xEEDDCCBBAA887766LL)
+    abort ();
+#else
+  if (__nds32__get_unaligned_dw (long_long_data) != 0x667788AABBCCDDEELL)
+    abort ();
+#endif
+
+  __nds32__put_unaligned_dw (long_long_data, test_long_long);
+  /* The value just stored must read back unchanged.  */
+  if (__nds32__get_unaligned_dw (long_long_data) != 0x1122334455667788LL)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned_hw.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_hw.c
new file mode 100644
index 0000000..f9e1ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_hw.c
@@ -0,0 +1,30 @@
+/* This is a test program for unaligned half word access.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned char data[] = {0x55,0x66,0x77,0x88};
+  unsigned short* short_data = (unsigned short*)& data[1];
+  unsigned short test_short = 0x5566;
+  /* Read starting at data[1]; the expected value depends on __NDS32_EL__.  */
+#ifdef __NDS32_EL__
+  if (__nds32__get_unaligned_hw (short_data) != 0x7766)
+    abort ();
+#else
+  if (__nds32__get_unaligned_hw (short_data) != 0x6677)
+    abort ();
+#endif
+
+  __nds32__put_unaligned_hw (short_data, test_short);
+  /* The value just stored must read back unchanged.  */
+  if (__nds32__get_unaligned_hw (short_data) != 0x5566)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-unaligned_w.c b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_w.c
new file mode 100644
index 0000000..40d8711
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-unaligned_w.c
@@ -0,0 +1,30 @@
+/* This is a test program for unaligned word access.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0 -std=c99" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned char data[] = {0x55,0x66,0x77,0x88,0xAA,0xBB,0xCC,0xDD};
+  unsigned int* int_data = (unsigned int*)& data[1];
+  unsigned int test_int = 0x55667788;
+  /* Read starting at data[1]; the expected value depends on __NDS32_EL__.  */
+#ifdef __NDS32_EL__
+  if (__nds32__get_unaligned_w (int_data) != 0xAA887766)
+    abort ();
+#else
+  if (__nds32__get_unaligned_w (int_data) != 0x667788AA)
+    abort ();
+#endif
+
+  __nds32__put_unaligned_w (int_data, test_int);
+  /* The value just stored must read back unchanged.  */
+  if (__nds32__get_unaligned_w (int_data) != 0x55667788)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/builtin-wsbh.c b/gcc/testsuite/gcc.target/nds32/builtin-wsbh.c
new file mode 100644
index 0000000..1cee2ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/builtin-wsbh.c
@@ -0,0 +1,21 @@
+/* This is a test program for wsbh instruction.  */
+
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+
+#include <nds32_intrinsic.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  unsigned int a = 0x03020100;
+  unsigned int b;
+  /* Expect 0x03020100 -> 0x02030001: the bytes of each halfword are swapped.  */
+  b = __nds32__wsbh (a);
+
+  if (b != 0x02030001)
+    abort ();
+  else
+    exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-all-pending.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-all-pending.c
new file mode 100644
index 0000000..0e57831
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-all-pending.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check that __nds32__get_all_pending_int () is accepted.  */
+#include <nds32_intrinsic.h>
+
+int
+main (void)
+{
+  int a = __nds32__get_all_pending_int ();
+  return a;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-cctl.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-cctl.c
new file mode 100644
index 0000000..2af55f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-cctl.c
@@ -0,0 +1,29 @@
+/* Verify that we generate cache control instruction with builtin function.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "L1D_VA_INVAL" } } */
+/* { dg-final { scan-assembler "L1D_VA_INVAL" } } */
+/* { dg-final { scan-assembler "L1D_INVALALL" } } */
+/* { dg-final { scan-assembler "L1D_IX_WWD" } } */
+/* { dg-final { scan-assembler "L1D_IX_RWD" } } */
+/* { dg-final { scan-assembler "PFM_CTL" } } */
+/* { dg-final { scan-assembler "PFM_CTL" } } */
+/* NOTE(review): L1D_VA_FILLCK and L1D_IX_WBINVAL are used but not scanned.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int va = 0;
+  /* One call per cctl builtin, then mtusr/mfusr on NDS32_USR_PFM_CTL.  */
+  __nds32__cctlva_lck (NDS32_CCTL_L1D_VA_FILLCK, &va);
+  __nds32__cctlidx_wbinval (NDS32_CCTL_L1D_IX_WBINVAL, va);
+  __nds32__cctlva_wbinval_alvl (NDS32_CCTL_L1D_VA_INVAL, &va);
+  __nds32__cctlva_wbinval_one_lvl (NDS32_CCTL_L1D_VA_INVAL, &va);
+  __nds32__cctl_l1d_invalall ();
+  __nds32__cctlidx_write (NDS32_CCTL_L1D_IX_WWD, va, 1);
+  __nds32__cctlidx_read (NDS32_CCTL_L1D_IX_RWD, 1);
+  __nds32__mtusr (0, NDS32_USR_PFM_CTL);
+  __nds32__mfusr (NDS32_USR_PFM_CTL);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending-hw.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending-hw.c
new file mode 100644
index 0000000..fce90e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending-hw.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__clr_pending_hwint for several NDS32_INT_Hn.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+ __nds32__clr_pending_hwint (NDS32_INT_H0);
+ __nds32__clr_pending_hwint (NDS32_INT_H1);
+ __nds32__clr_pending_hwint (NDS32_INT_H2);
+
+ __nds32__clr_pending_hwint (NDS32_INT_H15);
+ __nds32__clr_pending_hwint (NDS32_INT_H16);
+ __nds32__clr_pending_hwint (NDS32_INT_H31);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending.c
new file mode 100644
index 0000000..08e1dd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-clr-pending.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check that __nds32__clr_pending_swint () is accepted.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__clr_pending_swint ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-disable.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-disable.c
new file mode 100644
index 0000000..a3a1f44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-disable.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__disable_int with H15, H16, H31 and SWI.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__disable_int (NDS32_INT_H15);
+  __nds32__disable_int (NDS32_INT_H16);
+  __nds32__disable_int (NDS32_INT_H31);
+  __nds32__disable_int (NDS32_INT_SWI);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-dpref.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-dpref.c
new file mode 100644
index 0000000..38cf822
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-dpref.c
@@ -0,0 +1,24 @@
+/* Verify that we generate data prefetch instruction with builtin function.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "dpref\\tSRD" } } */
+/* { dg-final { scan-assembler "dpref\\tSRD" } } */
+/* { dg-final { scan-assembler "dpref\\tSRD" } } */
+/* { dg-final { scan-assembler "dpref\\tSRD" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned char dpref_q = 0;
+  unsigned short dpref_h = 0;
+  unsigned int dpref_w = 0;
+  unsigned long long dpref_dw = 0;
+  /* One call per dpref builtin (qw, hw, w, dw), all passing NDS32_DPREF_SRD.  */
+  __nds32__dpref_qw (&dpref_q, 0, NDS32_DPREF_SRD);
+  __nds32__dpref_hw (&dpref_h, 0, NDS32_DPREF_SRD);
+  __nds32__dpref_w (&dpref_w, 0, NDS32_DPREF_SRD);
+  __nds32__dpref_dw (&dpref_dw, 0, NDS32_DPREF_SRD);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-enable.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-enable.c
new file mode 100644
index 0000000..e18ed7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-enable.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__enable_int with H15, H16, H31 and SWI.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__enable_int (NDS32_INT_H15);
+  __nds32__enable_int (NDS32_INT_H16);
+  __nds32__enable_int (NDS32_INT_H31);
+  __nds32__enable_int (NDS32_INT_SWI);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-get-pending-int.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-get-pending-int.c
new file mode 100644
index 0000000..4ced0a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-get-pending-int.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__get_pending_int with H15, SWI and H16.  */
+#include <nds32_intrinsic.h>
+
+int
+main (void)
+{
+  int a = __nds32__get_pending_int (NDS32_INT_H15);
+  int b = __nds32__get_pending_int (NDS32_INT_SWI);
+  int c = __nds32__get_pending_int (NDS32_INT_H16);
+
+  return a + b + c;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-get-trig.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-get-trig.c
new file mode 100644
index 0000000..a394a60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-get-trig.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__get_trig_type with H0, H15, H16 and H31.  */
+#include <nds32_intrinsic.h>
+
+int
+main (void)
+{
+  int a = __nds32__get_trig_type (NDS32_INT_H0);
+  int b = __nds32__get_trig_type (NDS32_INT_H15);
+  int c = __nds32__get_trig_type (NDS32_INT_H16);
+  int d = __nds32__get_trig_type (NDS32_INT_H31);
+  return a + b + c + d;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-isb.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-isb.c
new file mode 100644
index 0000000..c699966
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-isb.c
@@ -0,0 +1,13 @@
+/* Verify that the __nds32__isb builtin generates the isb instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tisb" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  __nds32__isb ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-isync.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-isync.c
new file mode 100644
index 0000000..0c312e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-isync.c
@@ -0,0 +1,14 @@
+/* Verify that the __nds32__isync builtin generates the isync instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tisync" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int *addr = (int *) 0x53000000;  /* Fixed address; the test is only compiled.  */
+  __nds32__isync (addr);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-load-store.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-load-store.c
new file mode 100644
index 0000000..fc15716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-load-store.c
@@ -0,0 +1,25 @@
+/* Verify that the llw/lwup/scw/swup builtins generate the
+   corresponding llw/lwup/scw/swup instructions.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target nds32_no_v3m } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tllw" } } */
+/* { dg-final { scan-assembler "\\tlwup" } } */
+/* { dg-final { scan-assembler "\\tscw" } } */
+/* { dg-final { scan-assembler "\\tswup" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int a = 0;
+  int b = 0;
+  unsigned int cc = 0;
+  /* NOTE(review): results are never used; only the emitted code is checked.  */
+  __nds32__llw (&a);
+  cc = __nds32__lwup (&a);
+  __nds32__scw (&a, b);
+  __nds32__swup (&a, b);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-lto.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-lto.c
new file mode 100644
index 0000000..fbebcb6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-lto.c
@@ -0,0 +1,28 @@
+/* Verify that builtin functions still generate their instructions
+   when compiling with the -flto option.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0 -flto" } */
+/* { dg-final { scan-assembler "\\tdsb" } } */
+/* { dg-final { scan-assembler "\\tisb" } } */
+/* { dg-final { scan-assembler "\\tmsync\\tall" } } */
+/* { dg-final { scan-assembler "\\tmsync\\tstore" } } */
+/* { dg-final { scan-assembler "\\tnop" } } */
+/* { dg-final { scan-assembler "\\tstandby\\tno_wake_grant" } } */
+/* { dg-final { scan-assembler "\\tstandby\\twake_grant" } } */
+/* { dg-final { scan-assembler "\\tstandby\\twait_done" } } */
+/* NOTE(review): presumably relies on fat LTO objects being emitted -- confirm.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  __nds32__dsb ();
+  __nds32__isb ();
+  __nds32__msync_all ();
+  __nds32__msync_store ();
+  __nds32__nop ();
+  __nds32__standby_no_wake_grant ();
+  __nds32__standby_wake_grant ();
+  __nds32__standby_wait_done ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-sva.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-sva.c
new file mode 100644
index 0000000..f927c72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-sva.c
@@ -0,0 +1,16 @@
+/* Verify that the __nds32__sva builtin generates the sva instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tsva" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int a, b;
+  char c;
+  /* NOTE(review): a and b are uninitialized; only the emitted code is checked.  */
+  c = __nds32__sva (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-svs.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-svs.c
new file mode 100644
index 0000000..f998491
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-machine-svs.c
@@ -0,0 +1,16 @@
+/* Verify that the __nds32__svs builtin generates the svs instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tsvs" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int a, b;
+  char c;
+  /* NOTE(review): a and b are uninitialized; only the emitted code is checked.  */
+  c = __nds32__svs (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-mfsr-mtsr.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-mfsr-mtsr.c
new file mode 100644
index 0000000..f069507
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-mfsr-mtsr.c
@@ -0,0 +1,17 @@
+/* Verify that we generate mfsr/mtsr instruction with builtin function.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tmfsr" } } */
+/* { dg-final { scan-assembler "\\tmtsr" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int ipsw_value;
+  /* Read __NDS32_REG_IPSW__ with mfsr and write the same value back with mtsr.  */
+  ipsw_value = __nds32__mfsr (__NDS32_REG_IPSW__);
+  __nds32__mtsr (ipsw_value, __NDS32_REG_IPSW__);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-mfusr-mtusr.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-mfusr-mtusr.c
new file mode 100644
index 0000000..d6d069b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-mfusr-mtusr.c
@@ -0,0 +1,17 @@
+/* Verify that we generate mfusr/mtusr instruction with builtin function.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tmfusr" } } */
+/* { dg-final { scan-assembler "\\tmtusr" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int itype_value;
+  /* Read __NDS32_REG_ITYPE__ with mfusr and write the value back with mtusr.  */
+  itype_value = __nds32__mfusr (__NDS32_REG_ITYPE__);
+  __nds32__mtusr (itype_value, __NDS32_REG_ITYPE__);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-misc.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-misc.c
new file mode 100644
index 0000000..a11f6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-misc.c
@@ -0,0 +1,39 @@
+/* Verify that we generate other instructions with builtin function.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tbreak" } } */
+/* { dg-final { scan-assembler "\\tdsb" } } */
+/* { dg-final { scan-assembler "\\tisb" } } */
+/* { dg-final { scan-assembler "\\tisync" } } */
+/* { dg-final { scan-assembler "\\tmsync\\tall" } } */
+/* { dg-final { scan-assembler "\\tmsync\\tstore" } } */
+/* { dg-final { scan-assembler "\\tnop" } } */
+/* { dg-final { scan-assembler "\\tstandby\\tno_wake_grant" } } */
+/* { dg-final { scan-assembler "\\tstandby\\twake_grant" } } */
+/* { dg-final { scan-assembler "\\tstandby\\twait_done" } } */
+/* { dg-final { scan-assembler "\\tteqz" } } */
+/* { dg-final { scan-assembler "\\ttnez" } } */
+/* { dg-final { scan-assembler "\\ttrap" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int a = 0;
+  /* One call per builtin, in the same order as the dg-final checks above.  */
+  __nds32__break (2);
+  __nds32__dsb ();
+  __nds32__isb ();
+  __nds32__isync (&a);
+  __nds32__msync_all ();
+  __nds32__msync_store ();
+  __nds32__nop ();
+  __nds32__standby_no_wake_grant ();
+  __nds32__standby_wake_grant ();
+  __nds32__standby_wait_done ();
+  __nds32__teqz (a, 2);
+  __nds32__tnez (a, 2);
+  __nds32__trap (2);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-dsb.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-dsb.c
new file mode 100644
index 0000000..226d627
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-dsb.c
@@ -0,0 +1,14 @@
+/* Verify that __nds32__mtsr_dsb generates both an mtsr and a dsb instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tmtsr" } } */
+/* { dg-final { scan-assembler "\\tdsb" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__mtsr_dsb (1, NDS32_SR_ILMB);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-isb.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-isb.c
new file mode 100644
index 0000000..e8b1f98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-mtsr-isb.c
@@ -0,0 +1,14 @@
+/* Verify that __nds32__mtsr_isb generates both an mtsr and an isb instruction.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tmtsr" } } */
+/* { dg-final { scan-assembler "\\tisb" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__mtsr_isb (1, NDS32_SR_ILMB);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-priority.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-priority.c
new file mode 100644
index 0000000..c2ec6f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-priority.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__set_int_priority/__nds32__get_int_priority.  */
+#include <nds32_intrinsic.h>
+
+int
+main (void)
+{
+  __nds32__set_int_priority (NDS32_INT_H0,  0);
+  __nds32__set_int_priority (NDS32_INT_H15, 3);
+  __nds32__set_int_priority (NDS32_INT_H31, 3);
+
+  int a =  __nds32__get_int_priority (NDS32_INT_H0);
+  int b =  __nds32__get_int_priority (NDS32_INT_H15);
+  int c =  __nds32__get_int_priority (NDS32_INT_H31);
+
+  return a + b + c;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-set-pending.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-pending.c
new file mode 100644
index 0000000..f10b83d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-pending.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check that __nds32__set_pending_swint () is accepted.  */
+#include <nds32_intrinsic.h>
+
+int
+main (void)
+{
+  __nds32__set_pending_swint ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-edge.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-edge.c
new file mode 100644
index 0000000..bd8178c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-edge.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__set_trig_type_edge with H0, H15, H16, H31.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__set_trig_type_edge (NDS32_INT_H0);
+  __nds32__set_trig_type_edge (NDS32_INT_H15);
+  __nds32__set_trig_type_edge (NDS32_INT_H16);
+  __nds32__set_trig_type_edge (NDS32_INT_H31);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-level.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-level.c
new file mode 100644
index 0000000..1780543
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-set-trig-level.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* Compile-only check of __nds32__set_trig_type_level with H0, H15, H16, H31.  */
+#include <nds32_intrinsic.h>
+
+void
+main (void)
+{
+  __nds32__set_trig_type_level (NDS32_INT_H0);
+  __nds32__set_trig_type_level (NDS32_INT_H15);
+  __nds32__set_trig_type_level (NDS32_INT_H16);
+  __nds32__set_trig_type_level (NDS32_INT_H31);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-dis.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-dis.c
new file mode 100644
index 0000000..e143d3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-dis.c
@@ -0,0 +1,13 @@
+/* Verify that the __nds32__setgie_dis builtin generates setgie.d.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tsetgie.d" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  __nds32__setgie_dis ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-en.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-en.c
new file mode 100644
index 0000000..ed95782
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-setgie-en.c
@@ -0,0 +1,13 @@
+/* Verify that the __nds32__setgie_en builtin generates setgie.e.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+/* { dg-final { scan-assembler "\\tsetgie.e" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  __nds32__setgie_en ();
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add16.c
new file mode 100644
index 0000000..49fca46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add16.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kadd16" } } */
+/* { dg-final { scan-assembler "kadd16" } } */
+/* { dg-final { scan-assembler "ukadd16" } } */
+/* { dg-final { scan-assembler "ukadd16" } } */
+/* Compile-only check of the scalar and vector kadd16/ukadd16 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va, vb;
+  uint16x2_t v_ur, v_ua, v_ub;
+
+  r = __nds32__kadd16 (a, b);
+  vr = __nds32__v_kadd16 (va, vb);
+
+  r = __nds32__ukadd16 (a, b);
+  v_ur = __nds32__v_ukadd16 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add64.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add64.c
new file mode 100644
index 0000000..1f33a42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add64.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kadd64" } } */
+/* { dg-final { scan-assembler "ukadd64" } } */
+/* Compile-only check of the __nds32__kadd64 and __nds32__ukadd64 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  long long r, a, b;
+  unsigned long long ur, ua, ub;
+
+  r = __nds32__kadd64 (a, b);
+  ur = __nds32__ukadd64 (ua, ub);
+
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add8.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add8.c
new file mode 100644
index 0000000..1f2d226
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-add8.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kadd8" } } */
+/* { dg-final { scan-assembler "kadd8" } } */
+/* { dg-final { scan-assembler "ukadd8" } } */
+/* { dg-final { scan-assembler "ukadd8" } } */
+/* Compile-only check of the scalar and vector kadd8/ukadd8 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int8x4_t vr, va, vb;
+  uint8x4_t v_ur, v_ua, v_ub;
+
+  r = __nds32__kadd8 (a, b);
+  vr = __nds32__v_kadd8 (va, vb);
+
+  r = __nds32__ukadd8 (a, b);
+  v_ur = __nds32__v_ukadd8 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-cras16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-cras16.c
new file mode 100644
index 0000000..89c7e6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-cras16.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kcras16" } } */
+/* { dg-final { scan-assembler "kcras16" } } */
+/* { dg-final { scan-assembler "ukcras16" } } */
+/* { dg-final { scan-assembler "ukcras16" } } */
+/* Compile-only check of the scalar and vector kcras16/ukcras16 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va, vb;
+  uint16x2_t v_ur, v_ua, v_ub;
+
+  r = __nds32__kcras16 (a, b);
+  vr = __nds32__v_kcras16 (va, vb);
+
+  r = __nds32__ukcras16 (a, b);
+  v_ur = __nds32__v_ukcras16 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-crsa16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-crsa16.c
new file mode 100644
index 0000000..beaa69a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-crsa16.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kcrsa16" } } */
+/* { dg-final { scan-assembler "kcrsa16" } } */
+/* { dg-final { scan-assembler "ukcrsa16" } } */
+/* { dg-final { scan-assembler "ukcrsa16" } } */
+/* Compile-only check of the scalar and vector kcrsa16/ukcrsa16 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va, vb;
+  uint16x2_t v_ur, v_ua, v_ub;
+
+  r = __nds32__kcrsa16 (a, b);
+  vr = __nds32__v_kcrsa16 (va, vb);
+
+  r = __nds32__ukcrsa16 (a, b);
+  v_ur = __nds32__v_ukcrsa16 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kabs8.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kabs8.c
new file mode 100644
index 0000000..de2e3c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kabs8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kabs8" } } */
+/* { dg-final { scan-assembler "kabs8" } } */
+/* Compile-only check of the scalar and vector kabs8 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a;
+  int8x4_t vr, va;
+
+  r = __nds32__kabs8 (a);
+  vr = __nds32__v_kabs8 (va);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll.c
new file mode 100644
index 0000000..316b10c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "ksll" } } */
+/* { dg-final { scan-assembler "kslli" } } */
+/* Compile-only check of __nds32__ksll with a variable and a constant shift.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int r, a;
+  unsigned int b;
+
+  r = __nds32__ksll (a, b);
+  r = __nds32__ksll (a, 0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll16.c
new file mode 100644
index 0000000..be9a08e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-ksll16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "ksll16" } } */
+/* { dg-final { scan-assembler "ksll16" } } */
+/* { dg-final { scan-assembler "kslli16" } } */
+/* { dg-final { scan-assembler "kslli16" } } */
+/* Compile-only check of ksll16 builtins with variable and constant shifts.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va;
+
+  r = __nds32__ksll16 (a, b);
+  vr = __nds32__v_ksll16 (va, b);
+
+  r = __nds32__ksll16 (a, 0);
+  vr = __nds32__v_ksll16 (va, 0);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kslrawu.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kslrawu.c
new file mode 100644
index 0000000..4eb03e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-kslrawu.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kslraw.u" } } */
+/* Compile-only check of the __nds32__kslraw_u builtin.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int r, a;
+  unsigned int b;
+
+  r = __nds32__kslraw_u (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-mar64.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-mar64.c
new file mode 100644
index 0000000..79a3eb3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-mar64.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kmar64" } } */
+/* { dg-final { scan-assembler "ukmar64" } } */
+/* Compile-only check of the __nds32__kmar64 and __nds32__ukmar64 builtins.  */
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  long long r, a, b;
+  unsigned long long ur, ua, ub;
+
+  r = __nds32__kmar64 (r, a, b);
+  ur = __nds32__ukmar64 (ur, ua, ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-misc16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-misc16.c
new file mode 100644
index 0000000..272e922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-misc16.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "sclip16" } } */
+/* { dg-final { scan-assembler "sclip16" } } */
+/* { dg-final { scan-assembler "uclip16" } } */
+/* { dg-final { scan-assembler "uclip16" } } */
+/* { dg-final { scan-assembler "khm16" } } */
+/* { dg-final { scan-assembler "khm16" } } */
+/* { dg-final { scan-assembler "khmx16" } } */
+/* { dg-final { scan-assembler "khmx16" } } */
+/* { dg-final { scan-assembler "kabs16" } } */
+/* { dg-final { scan-assembler "kabs16" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va, vb;
+
+  r = __nds32__sclip16 (a, 0);
+  vr = __nds32__v_sclip16 (va, 0);
+
+  r = __nds32__uclip16 (a, 0);
+  vr = __nds32__v_uclip16 (va, 0);
+
+  r = __nds32__khm16 (a, b);
+  vr = __nds32__v_khm16 (va, vb);
+
+  r = __nds32__khmx16 (a, b);
+  vr = __nds32__v_khmx16 (va, vb);
+
+  r = __nds32__kabs16 (a);
+  vr = __nds32__v_kabs16 (va);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msr64.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msr64.c
new file mode 100644
index 0000000..2ad64fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msr64.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kmsr64" } } */
+/* { dg-final { scan-assembler "ukmsr64" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  long long r, a, b;
+  unsigned long long ur, ua, ub;
+
+  r = __nds32__kmsr64 (r, a, b);
+  ur = __nds32__ukmsr64 (ur, ua, ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw16.c
new file mode 100644
index 0000000..d7ccecb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kmmawb" } } */
+/* { dg-final { scan-assembler "kmmawb" } } */
+/* { dg-final { scan-assembler "kmmawb.u" } } */
+/* { dg-final { scan-assembler "kmmawb.u" } } */
+/* { dg-final { scan-assembler "kmmawt" } } */
+/* { dg-final { scan-assembler "kmmawt" } } */
+/* { dg-final { scan-assembler "kmmawt.u" } } */
+/* { dg-final { scan-assembler "kmmawt.u" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int r, a;
+  unsigned int b;
+  int16x2_t vb;
+
+  r = __nds32__kmmawb (r, a, b);
+  r = __nds32__v_kmmawb (r, a, vb);
+
+  r = __nds32__kmmawb_u (r, a, b);
+  r = __nds32__v_kmmawb_u (r, a, vb);
+
+  r = __nds32__kmmawt (r, a, b);
+  r = __nds32__v_kmmawt (r, a, vb);
+
+  r = __nds32__kmmawt_u (r, a, b);
+  r = __nds32__v_kmmawt_u (r, a, vb);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw32.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw32.c
new file mode 100644
index 0000000..64d8d4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-msw32.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kmmac" } } */
+/* { dg-final { scan-assembler "kmmac.u" } } */
+/* { dg-final { scan-assembler "kmmsb" } } */
+/* { dg-final { scan-assembler "kmmsb.u" } } */
+/* { dg-final { scan-assembler "kwmmul" } } */
+/* { dg-final { scan-assembler "kwmmul.u" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int r, a, b;
+  r = __nds32__kmmac (r, a, b);
+  r = __nds32__kmmac_u (r, a, b);
+
+  r = __nds32__kmmsb (r, a, b);
+  r = __nds32__kmmsb_u (r, a, b);
+
+  r = __nds32__kwmmul (a, b);
+  r = __nds32__kwmmul_u (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-smul16x32.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-smul16x32.c
new file mode 100644
index 0000000..0d2b87f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-smul16x32.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "kmda" } } */
+/* { dg-final { scan-assembler "kmda" } } */
+/* { dg-final { scan-assembler "kmxda" } } */
+/* { dg-final { scan-assembler "kmxda" } } */
+/* { dg-final { scan-assembler "kmabb" } } */
+/* { dg-final { scan-assembler "kmabb" } } */
+/* { dg-final { scan-assembler "kmabt" } } */
+/* { dg-final { scan-assembler "kmabt" } } */
+/* { dg-final { scan-assembler "kmatt" } } */
+/* { dg-final { scan-assembler "kmatt" } } */
+/* { dg-final { scan-assembler "kmada" } } */
+/* { dg-final { scan-assembler "kmada" } } */
+/* { dg-final { scan-assembler "kmaxda" } } */
+/* { dg-final { scan-assembler "kmaxda" } } */
+/* { dg-final { scan-assembler "kmads" } } */
+/* { dg-final { scan-assembler "kmads" } } */
+/* { dg-final { scan-assembler "kmadrs" } } */
+/* { dg-final { scan-assembler "kmadrs" } } */
+/* { dg-final { scan-assembler "kmaxds" } } */
+/* { dg-final { scan-assembler "kmaxds" } } */
+/* { dg-final { scan-assembler "kmsda" } } */
+/* { dg-final { scan-assembler "kmsda" } } */
+/* { dg-final { scan-assembler "kmsxda" } } */
+/* { dg-final { scan-assembler "kmsxda" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  int r;
+  unsigned int a, b;
+  int16x2_t va, vb;
+
+  r = __nds32__kmda (a, b);
+  r = __nds32__v_kmda (va, vb);
+
+  r = __nds32__kmxda (a, b);
+  r = __nds32__v_kmxda (va, vb);
+
+  r = __nds32__kmabb (r, a, b);
+  r = __nds32__v_kmabb (r, va, vb);
+
+  r = __nds32__kmabt (r, a, b);
+  r = __nds32__v_kmabt (r, va, vb);
+
+  r = __nds32__kmatt (r, a, b);
+  r = __nds32__v_kmatt (r, va, vb);
+
+  r = __nds32__kmada (r, a, b);
+  r = __nds32__v_kmada (r, va, vb);
+
+  r = __nds32__kmaxda (r, a, b);
+  r = __nds32__v_kmaxda (r, va, vb);
+
+  r = __nds32__kmads (r, a, b);
+  r = __nds32__v_kmads (r, va, vb);
+
+  r = __nds32__kmadrs (r, a, b);
+  r = __nds32__v_kmadrs (r, va, vb);
+
+  r = __nds32__kmaxds (r, a, b);
+  r = __nds32__v_kmaxds (r, va, vb);
+
+  r = __nds32__kmsda (r, a, b);
+  r = __nds32__v_kmsda (r, va, vb);
+
+  r = __nds32__kmsxda (r, a, b);
+  r = __nds32__v_kmsxda (r, va, vb);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub16.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub16.c
new file mode 100644
index 0000000..ecea7bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub16.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "ksub16" } } */
+/* { dg-final { scan-assembler "ksub16" } } */
+/* { dg-final { scan-assembler "uksub16" } } */
+/* { dg-final { scan-assembler "uksub16" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int16x2_t vr, va, vb;
+  uint16x2_t v_ur, v_ua, v_ub;
+
+  r = __nds32__ksub16 (a, b);
+  vr = __nds32__v_ksub16 (va, vb);
+
+  r = __nds32__uksub16 (a, b);
+  v_ur = __nds32__v_uksub16 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub64.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub64.c
new file mode 100644
index 0000000..fae30e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub64.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "ksub64" } } */
+/* { dg-final { scan-assembler "uksub64" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  long long r, a, b;
+  unsigned long long ur, ua, ub;
+
+  r = __nds32__ksub64 (a, b);
+  ur = __nds32__uksub64 (ua, ub);
+
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub8.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub8.c
new file mode 100644
index 0000000..5e343e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-stura-sub8.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mext-dsp" } */
+/* { dg-final { scan-assembler "ksub8" } } */
+/* { dg-final { scan-assembler "ksub8" } } */
+/* { dg-final { scan-assembler "uksub8" } } */
+/* { dg-final { scan-assembler "uksub8" } } */
+
+#include <nds32_intrinsic.h>
+
+void
+test (void)
+{
+  unsigned int r, a, b;
+  int8x4_t vr, va, vb;
+  uint8x4_t v_ur, v_ua, v_ub;
+
+  r = __nds32__ksub8 (a, b);
+  vr = __nds32__v_ksub8 (va, vb);
+
+  r = __nds32__uksub8 (a, b);
+  v_ur = __nds32__v_uksub8 (v_ua, v_ub);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/builtin-unaligned-feature.c b/gcc/testsuite/gcc.target/nds32/compile/builtin-unaligned-feature.c
new file mode 100644
index 0000000..6199109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/builtin-unaligned-feature.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <nds32_intrinsic.h>
+
+int
+main ()
+{
+  unsigned unalign = __nds32__unaligned_feature ();
+   __nds32__enable_unaligned ();
+   __nds32__disable_unaligned ();
+  return unalign;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-add-sub.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-add-sub.c
new file mode 100644
index 0000000..704610e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-add-sub.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "add8" } } */
+/* { dg-final { scan-assembler "add16" } } */
+/* { dg-final { scan-assembler "add64" } } */
+/* { dg-final { scan-assembler "sub8" } } */
+/* { dg-final { scan-assembler "sub16" } } */
+/* { dg-final { scan-assembler "sub64" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+
+v4qi  __attribute__ ((noinline))
+add8 (v4qi a, v4qi b)
+{
+  return a + b;
+}
+
+v4qi  __attribute__ ((noinline))
+sub8 (v4qi a, v4qi b)
+{
+  return a - b;
+}
+
+v2hi  __attribute__ ((noinline))
+add16 (v2hi a, v2hi b)
+{
+  return a + b;
+}
+
+v2hi  __attribute__ ((noinline))
+sub16 (v2hi a, v2hi b)
+{
+  return a - b;
+}
+
+long long
+add64 (long long a, long long b)
+{
+  return a + b;
+}
+
+long long
+sub64 (long long a, long long b)
+{
+  return a - b;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-bpick.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-bpick.c
new file mode 100644
index 0000000..5f9d7de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-bpick.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "bpick" } } */
+
+int bpick(int a, int b, int mask)
+{
+  return (a & mask) | (b & ~mask);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-mmul.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-mmul.c
new file mode 100644
index 0000000..5c9cdeb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-mmul.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "smmul" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+
+int smmul(int a, int b)
+{
+  long long tmp = (long long)a * b;
+  return (int)((tmp >> 32) & 0xffffffffll);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-mulhisi.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-mulhisi.c
new file mode 100644
index 0000000..856530b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-mulhisi.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "smbb" } } */
+/* { dg-final { scan-assembler "smbt" } } */
+/* { dg-final { scan-assembler "smtt" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+
+int smbb(v2hi a, v2hi b)
+{
+  return a[0] * b[0];
+}
+
+int smbt(v2hi a, v2hi b)
+{
+  return a[0] * b[1];
+}
+
+int smtt(v2hi a, v2hi b)
+{
+  return a[1] * b[1];
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-raddsub.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-raddsub.c
new file mode 100644
index 0000000..4817637
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-raddsub.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "raddw" } } */
+/* { dg-final { scan-assembler "rsubw" } } */
+/* { dg-final { scan-assembler "uraddw" } } */
+/* { dg-final { scan-assembler "ursubw" } } */
+
+int raddw(int a, int b)
+{
+  return (a + b) >> 1;
+}
+
+int rsubw(int a, int b)
+{
+  return (a - b) >> 1;
+}
+
+unsigned uraddw(unsigned a, unsigned b)
+{
+  return (a + b) >> 1;
+}
+
+unsigned ursubw(unsigned a, unsigned b)
+{
+  return (a - b) >> 1;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-smals.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-smals.c
new file mode 100644
index 0000000..f1dc684
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-smals.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "smalbb" } } */
+/* { dg-final { scan-assembler "smalbt" } } */
+/* { dg-final { scan-assembler "smaltt" } } */
+/* { dg-final { scan-assembler "smal" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+
+
+long long smalbb(long long acc, v2hi a, v2hi b)
+{
+  return acc + a[0] * b[0];
+}
+
+long long smalbt(long long acc, v2hi a, v2hi b)
+{
+  return acc + a[1] * b[0];
+}
+
+long long smaltt(long long acc, v2hi a, v2hi b)
+{
+  return acc + a[1] * b[1];
+}
+
+long long smal(v2hi a, long long b)
+{
+  return b + (long long)(a[0] * a[1]);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-smalxda.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-smalxda.c
new file mode 100644
index 0000000..2fe606b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-smalxda.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "smalxda" } } */
+/* { dg-final { scan-assembler "smalxds" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+
+long long smalxda(long long acc, v2hi a, v2hi b)
+{
+  return acc + (a[0] * b[1] + a[1] * b[0]);
+}
+
+long long smalxds(long long acc, v2hi a, v2hi b)
+{
+  return acc + (a[1] * b[0] - a[0] * b[1]);
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/dsp-unpkd.c b/gcc/testsuite/gcc.target/nds32/compile/dsp-unpkd.c
new file mode 100644
index 0000000..2de7107
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/dsp-unpkd.c
@@ -0,0 +1,79 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mext-dsp" } */
+/* { dg-final { scan-assembler "sunpkd810" } } */
+/* { dg-final { scan-assembler "sunpkd820" } } */
+/* { dg-final { scan-assembler "sunpkd830" } } */
+/* { dg-final { scan-assembler "sunpkd831" } } */
+/* { dg-final { scan-assembler "zunpkd810" } } */
+/* { dg-final { scan-assembler "zunpkd820" } } */
+/* { dg-final { scan-assembler "zunpkd830" } } */
+/* { dg-final { scan-assembler "zunpkd831" } } */
+
+typedef signed char v4qi __attribute__ ((vector_size (4)));
+typedef short v2hi __attribute__ ((vector_size (4)));
+typedef unsigned char uv4qi __attribute__ ((vector_size (4)));
+typedef unsigned short uv2hi __attribute__ ((vector_size (4)));
+
+v2hi sunpkd810(v4qi v)
+{
+  v2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[1];
+  return ret;
+}
+
+v2hi sunpkd820(v4qi v)
+{
+  v2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[2];
+  return ret;
+}
+
+v2hi sunpkd830(v4qi v)
+{
+  v2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[3];
+  return ret;
+}
+
+v2hi sunpkd831(v4qi v)
+{
+  v2hi ret;
+  ret[0] = v[1];
+  ret[1] = v[3];
+  return ret;
+}
+
+uv2hi zunpkd810(uv4qi v)
+{
+  uv2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[1];
+  return ret;
+}
+
+uv2hi zunpkd820(uv4qi v)
+{
+  uv2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[2];
+  return ret;
+}
+
+uv2hi zunpkd830(uv4qi v)
+{
+  uv2hi ret;
+  ret[0] = v[0];
+  ret[1] = v[3];
+  return ret;
+}
+
+uv2hi zunpkd831(uv4qi v)
+{
+  uv2hi ret;
+  ret[0] = v[1];
+  ret[1] = v[3];
+  return ret;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-1.c b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-1.c
new file mode 100644
index 0000000..d456fa5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-1.c
@@ -0,0 +1,21 @@
+/* Verify scalbn transform pass for normal case.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-all -lm" } */
+/* { dg-require-effective-target nds32_soft_fp } */
+
+float test_scalbnf (float x)
+{
+  return x * 128;
+}
+
+double test_scalbn (double x)
+{
+  return x * 256;
+}
+
+/* { dg-final { scan-tree-dump "(_\[0-9\]+) = __builtin_scalbnf \\(x_\[0-9\]+\\(D\\), 7\\);\\s*_\[0-9\]+ = \\(float\\) \\1;" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump "(_\[0-9\]+) = __builtin_scalbn \\(x_\[0-9\]+\\(D\\), 8\\);\\s*_\[0-9\]+ = \\(double\\) \\1;" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not " \\* 1.28e\\+2" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not " \\* 2.56e\\+2" "scalbn_transform" } } */
+/* { dg-final { cleanup-tree-dump "*" } } */
diff --git a/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-2.c b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-2.c
new file mode 100644
index 0000000..480cf23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-2.c
@@ -0,0 +1,14 @@
+/* Verify scalbn transform pass for negative number case.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-all" } */
+/* { dg-require-effective-target nds32_soft_fp } */
+
+double test_neg_scalbn (double x)
+{
+  return x * -8;
+}
+
+/* { dg-final { scan-tree-dump "(_\[0-9\]+) = __builtin_scalbn \\(x_\[0-9\]+\\(D\\), 3\\);\\s*_\[0-9\]+ = -\\1;" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not " \\* -8.0e\\+0" "scalbn_transform" } } */
+/* { dg-final { cleanup-tree-dump "*" } } */
diff --git a/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-3.c b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-3.c
new file mode 100644
index 0000000..256f31a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-3.c
@@ -0,0 +1,14 @@
+/* Verify scalbn transform pass for negative-exponent case.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-all" } */
+/* { dg-require-effective-target nds32_soft_fp } */
+
+double test_neg_exp_scalbnf (double x)
+{
+  return x * 0.0625;
+}
+
+/* { dg-final { scan-tree-dump "(_\[0-9\]+) = __builtin_scalbn \\(x_\[0-9\]+\\(D\\), -4\\);\\s*_\[0-9\]+ = \\(double\\) \\1;" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not " \\* 6.25e\\-2" "scalbn_transform" } } */
+/* { dg-final { cleanup-tree-dump "*" } } */
diff --git a/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-4.c b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-4.c
new file mode 100644
index 0000000..b6ba596
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-4.c
@@ -0,0 +1,52 @@
+/* Verify scalbn transform pass for cases that can't be optimized.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-all" } */
+/* { dg-require-effective-target nds32_soft_fp } */
+
+#include "math.h"
+
+double test_filter_condition_1 (double x)
+{
+  return x * 0;
+}
+
+double test_filter_condition_2 (double x)
+{
+  return x * -0;
+}
+
+double test_filter_condition_3 (double x)
+{
+  return x * 485;
+}
+
+double test_filter_condition_4 (double x)
+{
+  return x * -85;
+}
+
+double test_filter_condition_5 (double x)
+{
+  return x * 0.12;
+}
+
+double test_filter_condition_6 (double x)
+{
+  return x * -INFINITY;
+}
+
+double test_filter_condition_7 (double x)
+{
+  return x * NAN;
+}
+
+/* { dg-final { scan-tree-dump-times "x_\[0-9\]+\\(D\\) \\* 0.0" 2 "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump " \\* 4.85e\\+2" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump " \\* -8.5e\\+1" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump " \\* 1.19999" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump " \\*  -Inf" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump " \\*  Nan" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not "__builtin_scalbn" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-times "No multiplication stmt is transformed" 7  "scalbn_transform" } } */
+/* { dg-final { cleanup-tree-dump "*" } } */
diff --git a/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-5.c b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-5.c
new file mode 100644
index 0000000..874170e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/compile/scalbn-transform-5.c
@@ -0,0 +1,20 @@
+/* Verify scalbn transform pass for bug 11424 case.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-all" } */
+/* { dg-require-effective-target nds32_soft_fp } */
+
+typedef float float32_t;
+float32_t test_case (float32_t *pIn)
+{
+  float32_t in;
+  in = *pIn++;
+  in = (in * 128);
+  in += in > 0.0f ? 0.5f : -0.5f;
+
+  return in;
+}
+
+/* { dg-final { scan-tree-dump "(_\[0-9\]+) = __builtin_scalbnf \\(in_\[0-9\]+, 7\\);\\s*in_\[0-9\]+ = \\(float32_t\\) \\1;" "scalbn_transform" } } */
+/* { dg-final { scan-tree-dump-not "in_\[0-9\]+ = in_\[0-9\]+ \\* 1.28e\\+2" "scalbn_transform" } } */
+/* { dg-final { cleanup-tree-dump "*" } } */
diff --git a/gcc/testsuite/gcc.target/nds32/dsp-v2hi-packing00.c b/gcc/testsuite/gcc.target/nds32/dsp-v2hi-packing00.c
new file mode 100644
index 0000000..d1c61b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nds32/dsp-v2hi-packing00.c
@@ -0,0 +1,127 @@
+/* { dg-do run } */
+
+#include <nds32_intrinsic.h>
+
+int16x2_t packing01(int16x2_t x, int16x2_t y) __attribute__ ((noinline));
+int16x2_t packing01(int16x2_t x, int16x2_t y)
+{
+  int16x2_t ret;
+  ret[0] = x[0];
+  ret[1] = y[1];
+  return ret;
+}
+
+int16x2_t packing10(int16x2_t x, int16x2_t y) __attribute__ ((noinline));
+int16x2_t packing10(int16x2_t x, int16x2_t y)
+{
+  int16x2_t ret;
+  ret[0] = x[1];
+  ret[1] = y[0];
+  return ret;
+}
+
+int16x2_t packing00(int16x2_t x, int16x2_t y) __attribute__ ((noinline));
+int16x2_t packing00(int16x2_t x, int16x2_t y)
+{
+  int16x2_t ret;
+  ret[0] = x[0];
+  ret[1] = y[0];
+  return ret;
+}
+
+int16x2_t packing0cv0(int16x2_t x) __attribute__ ((noinline));
+int16x2_t packing0cv0(int16x2_t x)
+{
+  int16x2_t ret = {0, 0};
+  ret[0] = x[0];
+  return ret;
+}
+
+int16x2_t packingcv00(int16x2_t x) __attribute__ ((noinline));
+int16x2_t packingcv00(int16x2_t x)
+{
+  int16x2_t ret = {0, 0};
+  ret[1] = x[0];
+  return ret;
+}
+
+int16x2_t packing11(int16x2_t x, int16x2_t y) __attribute__ ((noinline));
+int16x2_t packing11(int16x2_t x, int16x2_t y)
+{
+  int16x2_t ret;
+  ret[0] = x[1];
+  ret[1] = y[1];
+  return ret;
+}
+int16x2_t packing1cv0(int16x2_t x) __attribute__ ((noinline));
+int16x2_t packing1cv0(int16x2_t x)
+{
+  int16x2_t ret = {0, 0};
+  ret[0] = x[1];
+  return ret;
+}
+
+int16x2_t packingcv01(int16x2_t x) __attribute__ ((noinline));
+int16x2_t packingcv01(int16x2_t x)
+{
+  int16x2_t ret = {0, 0};
+  ret[1] = x[1];
+  return ret;
+}
+
+int main() {
+  int16x2_t a = {0x11, 0x22};
+  int16x2_t b = {0x33, 0x44};
+
+  int16x2_t ret00, ret01, ret10, ret11;
+  int16x2_t ret0cv0, retcv00, ret1cv0, retcv01;
+  ret00 = packing00 (a, b);
+
+  if (ret00[0] != 0x11
+      || ret00[1] != 0x33)
+    return 1;
+
+  ret0cv0 = packing0cv0 (a);
+
+  if (ret0cv0[0] != 0x11
+      || ret0cv0[1] != 0)
+    return 1;
+
+  retcv00 = packingcv00 (a);
+
+  if (retcv00[0] != 0
+      || retcv00[1] != 0x11)
+    return 1;
+
+  ret11 = packing11 (a, b);
+
+  if (ret11[0] != 0x22
+      || ret11[1] != 0x44)
+    return 1;
+
+  ret1cv0 = packing1cv0 (a);
+
+  if (ret1cv0[0] != 0x22
+      || ret1cv0[1] != 0)
+    return 1;
+
+  retcv01 = packingcv01 (a);
+
+  if (retcv01[0] != 0
+      || retcv01[1] != 0x22)
+    return 1;
+
+  ret01 = packing01 (a, b);
+
+  if (ret01[0] != 0x11
+      || ret01[1] != 0x44)
+    return 1;
+
+  ret10 = packing10 (a, b);
+
+  if (ret10[0] != 0x22
+      || ret10[1] != 0x33)
+    return 1;
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/nds32/nds32.exp b/gcc/testsuite/gcc.target/nds32/nds32.exp
index 1c245f6..2f5a150 100644
--- a/gcc/testsuite/gcc.target/nds32/nds32.exp
+++ b/gcc/testsuite/gcc.target/nds32/nds32.exp
@@ -38,8 +38,10 @@ if ![info exists DEFAULT_CFLAGS] then {
 dg-init

 # Main loop.
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/compile/*.\[cS\]]] \
 	"" $DEFAULT_CFLAGS
+gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] \
+	"" ""

 # All done.
 dg-finish
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index f0f5ac4..5a9b57d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -487,6 +487,10 @@ proc check_effective_target_trampolines { } {
 	 || [istarget hppa64-hp-hpux11.23] } {
 	return 0;
     }
+    if { [istarget nds32*-*-*]
+	 && [check_effective_target_nds32_reduced_regs] } {
+	return 0;
+    }
     return 1
 }

@@ -500,7 +504,7 @@ proc check_effective_target_keeps_null_pointer_checks { } {
     if [target_info exists keeps_null_pointer_checks] {
       return 1
     }
-    if { [istarget avr-*-*] } {
+    if { [istarget avr-*-*] || [istarget nds32*-*-elf] } {
 	return 1;
     }
     return 0
@@ -3597,6 +3601,125 @@ proc check_effective_target_arm_prefer_ldrd_strd { } {
     }  "-O2 -mthumb" ]
 }

+# If board info says it only has 16M addressing space, return 0.
+# Otherwise, return 1.
+proc check_effective_target_nds32_full_addr_space { } {
+    if [board_info target exists addr16m] {
+	return 0
+    }
+    return 1;
+}
+
+# Return 1 if gp direct is enabled by default.
+proc check_effective_target_nds32_gp_direct { } {
+    return [check_no_compiler_messages gp_direct object {
+	#ifdef __NDS32_GP_DIRECT__
+	int dummy;
+	#else
+	#error no GP_DIRECT
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-perf.
+proc check_effective_target_nds32_ext_perf { } {
+    return [check_no_compiler_messages ext_perf object {
+	#ifdef __NDS32_EXT_PERF__
+	int dummy;
+	#else
+	#error no EXT_PERF
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-perf2.
+proc check_effective_target_nds32_ext_perf2 { } {
+    return [check_no_compiler_messages ext_perf2 object {
+	#ifdef __NDS32_EXT_PERF2__
+	int dummy;
+	#else
+	#error no EXT_PERF2
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-string.
+proc check_effective_target_nds32_ext_string { } {
+    return [check_no_compiler_messages ext_string object {
+	#ifdef __NDS32_EXT_STRING__
+	int dummy;
+	#else
+	#error no EXT_STRING
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-fpu-sp or -mext-fpu-dp.
+proc check_effective_target_nds32_ext_fpu { } {
+    return [check_no_compiler_messages ext_fpu object {
+	#if defined(__NDS32_EXT_FPU_SP__) || defined(__NDS32_EXT_FPU_DP__)
+	int dummy;
+	#else
+	#error no support FPU
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target not supporting -mext-fpu-sp or -mext-fpu-dp.
+proc check_effective_target_nds32_soft_fp { } {
+    return [check_no_compiler_messages soft_fp object {
+	#if defined(__NDS32_EXT_FPU_SP__) || defined(__NDS32_EXT_FPU_DP__)
+	#error Hard FP
+	#else
+	int dummy;
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-fpu-sp.
+proc check_effective_target_nds32_ext_fpu_sp { } {
+    return [check_no_compiler_messages ext_fpu_sp object {
+	#ifdef __NDS32_EXT_FPU_SP__
+	int dummy;
+	#else
+	#error no EXT_FPU_SP
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mext-fpu-dp.
+proc check_effective_target_nds32_ext_fpu_dp { } {
+    return [check_no_compiler_messages ext_fpu_dp object {
+	#ifdef __NDS32_EXT_FPU_DP__
+	int dummy;
+	#else
+	#error no EXT_FPU_DP
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target supporting -mreduced-regs.
+proc check_effective_target_nds32_reduced_regs { } {
+    return [check_no_compiler_messages reduced_regs object {
+	#ifdef __NDS32_REDUCED_REGS__
+	int dummy;
+	#else
+	#error no REDUCED_REGS
+	#endif
+    }]
+}
+
+# Return 1 if this is a nds32 target not supporting v3m ISA.
+proc check_effective_target_nds32_no_v3m { } {
+    return [check_no_compiler_messages no_v3m object {
+	#if !defined(__NDS32_BASELINE_V3M__)
+	int dummy;
+	#else
+	#error Support V3M ISA
+	#endif
+    }]
+}
+
 # Return 1 if this is a PowerPC target supporting -meabi.

 proc check_effective_target_powerpc_eabi_ok { } {
@@ -6897,6 +7020,7 @@ proc check_effective_target_logical_op_short_circuit {} {
 	 || [istarget avr*-*-*]
 	 || [istarget crisv32-*-*] || [istarget cris-*-*]
 	 || [istarget mmix-*-*]
+	 || [istarget nds32*-*-*]
 	 || [istarget s390*-*-*]
 	 || [istarget powerpc*-*-*]
 	 || [istarget nios2*-*-*]
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index 154df21..acd1a52 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -9518,6 +9518,7 @@ simplify_cond_using_ranges (gcond *stmt)
      used for the comparison directly if we just massage the constant in the
      comparison.  */
   if (TREE_CODE (op0) == SSA_NAME
+      && has_single_use (op0)
       && TREE_CODE (op1) == INTEGER_CST)
     {
       gimple *def_stmt = SSA_NAME_DEF_STMT (op0);
diff --git a/libgcc/config.host b/libgcc/config.host
index 124f2ce..107ccb1 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -946,6 +946,23 @@ msp430*-*-elf)
 	tmake_file="$tm_file t-crtstuff t-fdpbit msp430/t-msp430"
         extra_parts="$extra_parts libmul_none.a libmul_16.a libmul_32.a libmul_f5.a"
 	;;
+nds32*-linux*)
+	# Basic makefile fragment and extra_parts for crt stuff.
+	# We also append c-isr library implementation.
+	tmake_file="${tmake_file} t-slibgcc-libgcc"
+	tmake_file="${tmake_file} nds32/t-nds32-glibc nds32/t-crtstuff t-softfp-sfdf t-softfp"
+	# The header file of defining MD_FALLBACK_FRAME_STATE_FOR.
+	md_unwind_header=nds32/linux-unwind.h
+	# Validate the --with-nds32-lib=X setting; glibc and uclibc need no extra makefile fragment.
+	case "${with_nds32_lib}" in
+	"" | glibc | uclibc )
+		;;
+	*)
+		echo "Cannot accept --with-nds32-lib=$with_nds32_lib, available values are: glibc uclibc" 1>&2
+		exit 1
+		;;
+	esac
+	;;
 nds32*-elf*)
 	# Basic makefile fragment and extra_parts for crt stuff.
 	# We also append c-isr library implementation.
@@ -959,9 +976,19 @@ nds32*-elf*)
 		tmake_file="${tmake_file} nds32/t-nds32-newlib t-softfp-sfdf t-softfp"
 		;;
 	mculib)
-		# Append library definition makefile fragment t-nds32-mculib.
+		case "${with_arch}" in
+		"" | v2 | v2j | v3 | v3j | v3m)
+		# Append library definition makefile fragment t-nds32-mculib-generic.
 		# The software floating point library is included in mculib.
-		tmake_file="${tmake_file} nds32/t-nds32-mculib"
+			tmake_file="${tmake_file} nds32/t-nds32-mculib-generic"
+			;;
+		v3f | v3s)
+		# Append library definition makefile fragment t-nds32-mculib-softfp.
+		# mculib does not support ABI2FP_PLUS,
+		# so use the 'soft-fp' software floating point make rule fragments provided by gcc.
+			tmake_file="${tmake_file} nds32/t-nds32-mculib-softfp t-softfp-sfdf t-softfp"
+			;;
+		esac
 		;;
 	*)
 		echo "Cannot accept --with-nds32-lib=$with_nds32_lib, available values are: newlib mculib" 1>&2
diff --git a/libgcc/config/nds32/crtzero.S b/libgcc/config/nds32/crtzero.S
deleted file mode 100644
index 9898525..0000000
--- a/libgcc/config/nds32/crtzero.S
+++ /dev/null
@@ -1,103 +0,0 @@
-/* The startup code sample of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-!!==============================================================================
-!!
-!!      crtzero.S
-!!
-!!      This is JUST A SAMPLE of nds32 startup code !!
-!!      You can refer this content and implement
-!!      the actual one in newlib/mculib.
-!!
-!!==============================================================================
-
-!!------------------------------------------------------------------------------
-!! Jump to start up code
-!!------------------------------------------------------------------------------
-	.section	.nds32_init, "ax"
-	j	_start
-
-!!------------------------------------------------------------------------------
-!! Startup code implementation
-!!------------------------------------------------------------------------------
-	.section	.text
-	.global	_start
-	.weak	_SDA_BASE_
-	.weak	_FP_BASE_
-	.align	2
-	.func	_start
-	.type	_start, @function
-_start:
-.L_fp_gp_lp_init:
-	la	$fp, _FP_BASE_		! init $fp
-	la	$gp, _SDA_BASE_		! init $gp for small data access
-	movi	$lp, 0			! init $lp
-
-.L_stack_init:
-	la	$sp, _stack		! init $sp
-	movi	$r0, -8			! align $sp to 8-byte (use 0xfffffff8)
-	and	$sp, $sp, $r0		! align $sp to 8-byte (filter out lower 3-bit)
-
-.L_bss_init:
-	! clear BSS, this process can be 4 time faster if data is 4 byte aligned
-	! if so, use swi.p instead of sbi.p
-	! the related stuff are defined in linker script
-	la	$r0, _edata		! get the starting addr of bss
-	la	$r2, _end		! get ending addr of bss
-	beq	$r0, $r2, .L_call_main	! if no bss just do nothing
-	movi	$r1, 0			! should be cleared to 0
-.L_clear_bss:
-	sbi.p	$r1, [$r0], 1		! Set 0 to bss
-	bne	$r0, $r2, .L_clear_bss	! Still bytes left to set
-
-!.L_stack_heap_check:
-!	la	$r0, _end		! init heap_end
-!	s.w	$r0, heap_end		! save it
-
-
-!.L_init_argc_argv:
-!	! argc/argv initialization if necessary; default implementation is in crt1.o
-!	la	$r9, _arg_init		! load address of _arg_init?
-!	beqz	$r9, .L4		! has _arg_init? no, go check main()
-!	addi	$sp, $sp, -512		! allocate space for command line + arguments
-!	move	$r6, $sp		! r6 = buffer addr of cmd line
-!	move	$r0, $r6		! r0 = buffer addr of cmd line
-!	syscall	6002			! get cmd line
-!	move	$r0, $r6		! r0 = buffer addr of cmd line
-!	addi	$r1, $r6, 256		! r1 = argv
-!	jral	$r9			! init argc/argv
-!	addi	$r1, $r6, 256		! r1 = argv
-
-.L_call_main:
-	! call main() if main() is provided
-	la	$r15, main		! load address of main
-	jral	$r15			! call main
-
-.L_terminate_program:
-	syscall	0x1			! use syscall 0x1 to terminate program
-	.size	_start, .-_start
-	.end
-
-!! ------------------------------------------------------------------------
diff --git a/libgcc/config/nds32/initfini.c b/libgcc/config/nds32/initfini.c
index 0aa33f5..34406f0 100644
--- a/libgcc/config/nds32/initfini.c
+++ b/libgcc/config/nds32/initfini.c
@@ -25,6 +25,10 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

+#include <stddef.h>
+/* Need header file for `struct object' type.  */
+#include "../libgcc/unwind-dw2-fde.h"
+
 /*  Declare a pointer to void function type.  */
 typedef void (*func_ptr) (void);

@@ -42,11 +46,59 @@ typedef void (*func_ptr) (void);
    refer to only the __CTOR_END__ symbol in crtfini.o and the __DTOR_LIST__
    symbol in crtinit.o, where they are defined.  */

-static func_ptr __CTOR_LIST__[1] __attribute__ ((section (".ctors")))
-     = { (func_ptr) (-1) };
+static func_ptr __CTOR_LIST__[1] __attribute__ ((section (".ctors"), used))
+     = { (func_ptr) 0 };
+
+static func_ptr __DTOR_LIST__[1] __attribute__ ((section (".dtors"), used))
+     = { (func_ptr) 0 };
+
+
+#ifdef SUPPORT_UNWINDING_DWARF2
+/* Preparation of exception handling with dwarf2 mechanism registration.  */

-static func_ptr __DTOR_LIST__[1] __attribute__ ((section (".dtors")))
-     = { (func_ptr) (-1) };
+asm ("\n\
+	.section .eh_frame,\"aw\",@progbits\n\
+	.global __EH_FRAME_BEGIN__\n\
+	.type	__EH_FRAME_BEGIN__, @object\n\
+	.align 2\n\
+__EH_FRAME_BEGIN__:\n\
+	! Beginning location of eh_frame section\n\
+	.previous\n\
+");
+
+extern func_ptr __EH_FRAME_BEGIN__[];
+
+
+/* Note that the following two functions are going to be chained into
+   constructor and destructor list, respectively.  So these two declarations
+   must be placed after __CTOR_LIST__ and __DTOR_LIST__.  */
+extern void __nds32_register_eh(void) __attribute__((constructor, used));
+extern void __nds32_deregister_eh(void) __attribute__((destructor, used));
+
+/* Register the exception handling table as the first constructor.  */
+void
+__nds32_register_eh (void)
+{
+  static struct object object;
+  if (__register_frame_info)
+    __register_frame_info (__EH_FRAME_BEGIN__, &object);
+}
+
+/* Unregister the exception handling table as a destructor.  */
+void
+__nds32_deregister_eh (void)
+{
+  static int completed = 0;
+
+  if (completed)
+    return;
+
+  if (__deregister_frame_info)
+    __deregister_frame_info (__EH_FRAME_BEGIN__);
+
+  completed = 1;
+}
+#endif

 /* Run all the global destructors on exit from the program.  */

@@ -63,7 +115,7 @@ static func_ptr __DTOR_LIST__[1] __attribute__ ((section (".dtors")))
    same particular root executable or shared library file.  */

 static void __do_global_dtors (void)
-asm ("__do_global_dtors") __attribute__ ((section (".text")));
+asm ("__do_global_dtors") __attribute__ ((section (".text"), used));

 static void
 __do_global_dtors (void)
@@ -116,23 +168,37 @@ void *__dso_handle = 0;
    last, these words naturally end up at the very ends of the two lists
    contained in these two sections.  */

-static func_ptr __CTOR_END__[1] __attribute__ ((section (".ctors")))
+static func_ptr __CTOR_END__[1] __attribute__ ((section (".ctors"), used))
      = { (func_ptr) 0 };

-static func_ptr __DTOR_END__[1] __attribute__ ((section (".dtors")))
+static func_ptr __DTOR_END__[1] __attribute__ ((section (".dtors"), used))
      = { (func_ptr) 0 };

+#ifdef SUPPORT_UNWINDING_DWARF2
+/* ZERO terminator in .eh_frame section.  */
+asm ("\n\
+	.section .eh_frame,\"aw\",@progbits\n\
+	.global __EH_FRAME_END__\n\
+	.type	__EH_FRAME_END__, @object\n\
+	.align 2\n\
+__EH_FRAME_END__:\n\
+	! End location of eh_frame section with ZERO terminator\n\
+	.word 0\n\
+	.previous\n\
+");
+#endif
+
 /* Run all global constructors for the program.
    Note that they are run in reverse order.  */

 static void __do_global_ctors (void)
-asm ("__do_global_ctors") __attribute__ ((section (".text")));
+asm ("__do_global_ctors") __attribute__ ((section (".text"), used));

 static void
 __do_global_ctors (void)
 {
   func_ptr *p;
-  for (p = __CTOR_END__ - 1; *p != (func_ptr) -1; p--)
+  for (p = __CTOR_END__ - 1; *p; p--)
     (*p) ();
 }

diff --git a/libgcc/config/nds32/isr-library/adj_intr_lvl.inc b/libgcc/config/nds32/isr-library/adj_intr_lvl.inc
index 3e978b4..a519df8 100644
--- a/libgcc/config/nds32/isr-library/adj_intr_lvl.inc
+++ b/libgcc/config/nds32/isr-library/adj_intr_lvl.inc
@@ -26,13 +26,26 @@
 .macro ADJ_INTR_LVL
 #if defined(NDS32_NESTED) /* Nested handler.  */
 	mfsr	$r3, $PSW
+	/* By subtracting 1 from $PSW, we can lower PSW.INTL
+	   and enable GIE simultaneously.  */
 	addi	$r3, $r3, #-0x1
+  #if __NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__
+    ori   $r3, $r3, 0x2000  /* Set PSW.AEN(b'13) */
+  #endif
 	mtsr	$r3, $PSW
 #elif defined(NDS32_NESTED_READY) /* Nested ready handler.  */
 	/* Save ipc and ipsw and lower INT level.  */
 	mfsr	$r3, $PSW
 	addi	$r3, $r3, #-0x2
+  #if __NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__
+    ori   $r3, $r3, 0x2000  /* Set PSW.AEN(b'13) */
+  #endif
 	mtsr	$r3, $PSW
 #else /* Not nested handler.  */
+  #if __NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__
+    mfsr	$r3, $PSW
+    ori   $r3, $r3, 0x2000  /* Set PSW.AEN(b'13) */
+    mtsr	$r3, $PSW
+  #endif
 #endif
 .endm
diff --git a/libgcc/config/nds32/isr-library/excp_isr.S b/libgcc/config/nds32/isr-library/excp_isr.S
index 6179a98..f1a3b59 100644
--- a/libgcc/config/nds32/isr-library/excp_isr.S
+++ b/libgcc/config/nds32/isr-library/excp_isr.S
@@ -23,6 +23,7 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

+#include "save_usr_regs.inc"
 #include "save_mac_regs.inc"
 #include "save_fpu_regs.inc"
 #include "save_fpu_regs_00.inc"
@@ -32,35 +33,33 @@
 #include "save_all.inc"
 #include "save_partial.inc"
 #include "adj_intr_lvl.inc"
-#include "restore_mac_regs.inc"
 #include "restore_fpu_regs_00.inc"
 #include "restore_fpu_regs_01.inc"
 #include "restore_fpu_regs_02.inc"
 #include "restore_fpu_regs_03.inc"
 #include "restore_fpu_regs.inc"
+#include "restore_mac_regs.inc"
+#include "restore_usr_regs.inc"
 #include "restore_all.inc"
 #include "restore_partial.inc"
+
 	.section .nds32_isr, "ax"       /* Put it in the section of 1st level handler. */
 	.align	1
-/*
-  First Level Handlers
-  1. First Level Handlers are invokded in vector section via jump instruction
-  with specific names for different configurations.
-  2. Naming Format: _nds32_e_SR_NT for exception handlers.
-		    _nds32_i_SR_NT for interrupt handlers.
-  2.1 All upper case letters are replaced with specific lower case letters encodings.
-  2.2 SR: Saved Registers
-      sa: Save All regs (context)
-      ps: Partial Save (all caller-saved regs)
-  2.3 NT: Nested Type
-      ns: nested
-      nn: not nested
-      nr: nested ready
-*/
-
-/*
-  This is original 16-byte vector size version.
-*/
+
+/* First Level Handlers
+   1. First Level Handlers are invoked in vector section via jump instruction
+      with specific names for different configurations.
+   2. Naming Format: _nds32_e_SR_NT for exception handlers.
+                     _nds32_i_SR_NT for interrupt handlers.
+     2.1 All upper case letters are replaced with specific lower case letters encodings.
+     2.2 SR -- Saved Registers
+         sa: Save All regs (context)
+         ps: Partial Save (all caller-saved regs)
+     2.3 NT -- Nested Type
+         ns: nested
+         nn: not nested
+         nr: nested ready */
+
 #ifdef NDS32_SAVE_ALL_REGS
 #if defined(NDS32_NESTED)
 	.globl	_nds32_e_sa_ns
@@ -91,21 +90,26 @@ _nds32_e_ps_nn:
 #endif /* endif for Nest Type */
 #endif /* not NDS32_SAVE_ALL_REGS */

-/*
-  This is 16-byte vector size version.
-  The vector id was restored into $r0 in vector by compiler.
-*/
+
+/* For 4-byte vector size version, the vector id is
+   extracted from $ITYPE and is set into $r0 by library.
+   For 16-byte vector size version, the vector id
+   is set into $r0 in vector section by compiler.  */
+
+/* Save used registers.  */
 #ifdef NDS32_SAVE_ALL_REGS
         SAVE_ALL
 #else
         SAVE_PARTIAL
 #endif
+
 	/* Prepare to call 2nd level handler. */
 	la	$r2, _nds32_jmptbl_00
 	lw	$r2, [$r2 + $r0 << #2]
 	ADJ_INTR_LVL	/* Adjust INTR level. $r3 is clobbered.  */
 	jral    $r2
-	/* Restore used registers. */
+
+/* Restore used registers.  */
 #ifdef NDS32_SAVE_ALL_REGS
 	RESTORE_ALL
 #else
@@ -113,6 +117,7 @@ _nds32_e_ps_nn:
 #endif
 	iret

+
 #ifdef NDS32_SAVE_ALL_REGS
 #if defined(NDS32_NESTED)
 	.size	_nds32_e_sa_ns, .-_nds32_e_sa_ns
diff --git a/libgcc/config/nds32/isr-library/excp_isr_4b.S b/libgcc/config/nds32/isr-library/excp_isr_4b.S
deleted file mode 100644
index af70c7a..0000000
--- a/libgcc/config/nds32/isr-library/excp_isr_4b.S
+++ /dev/null
@@ -1,133 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "save_mac_regs.inc"
-#include "save_fpu_regs.inc"
-#include "save_fpu_regs_00.inc"
-#include "save_fpu_regs_01.inc"
-#include "save_fpu_regs_02.inc"
-#include "save_fpu_regs_03.inc"
-#include "save_all.inc"
-#include "save_partial.inc"
-#include "adj_intr_lvl.inc"
-#include "restore_mac_regs.inc"
-#include "restore_fpu_regs_00.inc"
-#include "restore_fpu_regs_01.inc"
-#include "restore_fpu_regs_02.inc"
-#include "restore_fpu_regs_03.inc"
-#include "restore_fpu_regs.inc"
-#include "restore_all.inc"
-#include "restore_partial.inc"
-	.section .nds32_isr, "ax"       /* Put it in the section of 1st level handler. */
-	.align	1
-/*
-  First Level Handlers
-  1. First Level Handlers are invokded in vector section via jump instruction
-  with specific names for different configurations.
-  2. Naming Format: _nds32_e_SR_NT for exception handlers.
-		    _nds32_i_SR_NT for interrupt handlers.
-  2.1 All upper case letters are replaced with specific lower case letters encodings.
-  2.2 SR: Saved Registers
-      sa: Save All regs (context)
-      ps: Partial Save (all caller-saved regs)
-  2.3 NT: Nested Type
-      ns: nested
-      nn: not nested
-      nr: nested ready
-*/
-
-/*
-  This is 4-byte vector size version.
-  The "_4b" postfix was added for 4-byte version symbol.
-*/
-#ifdef NDS32_SAVE_ALL_REGS
-#if defined(NDS32_NESTED)
-	.globl	_nds32_e_sa_ns_4b
-	.type	_nds32_e_sa_ns_4b, @function
-_nds32_e_sa_ns_4b:
-#elif defined(NDS32_NESTED_READY)
-	.globl	_nds32_e_sa_nr_4b
-	.type	_nds32_e_sa_nr_4b, @function
-_nds32_e_sa_nr_4b:
-#else /* Not nested handler. */
-	.globl	_nds32_e_sa_nn_4b
-	.type	_nds32_e_sa_nn_4b, @function
-_nds32_e_sa_nn_4b:
-#endif /* endif for Nest Type */
-#else /* not NDS32_SAVE_ALL_REGS */
-#if defined(NDS32_NESTED)
-	.globl	_nds32_e_ps_ns_4b
-	.type	_nds32_e_ps_ns_4b, @function
-_nds32_e_ps_ns_4b:
-#elif defined(NDS32_NESTED_READY)
-	.globl	_nds32_e_ps_nr_4b
-	.type	_nds32_e_ps_nr_4b, @function
-_nds32_e_ps_nr_4b:
-#else /* Not nested handler. */
-	.globl	_nds32_e_ps_nn_4b
-	.type	_nds32_e_ps_nn_4b, @function
-_nds32_e_ps_nn_4b:
-#endif /* endif for Nest Type */
-#endif /* not NDS32_SAVE_ALL_REGS */
-
-/*
-  This is 4-byte vector size version.
-  The vector id was restored into $lp in vector by compiler.
-*/
-#ifdef NDS32_SAVE_ALL_REGS
-	SAVE_ALL_4B
-#else
-	SAVE_PARTIAL_4B
-#endif
-	/* Prepare to call 2nd level handler. */
-	la	$r2, _nds32_jmptbl_00
-	lw	$r2, [$r2 + $r0 << #2]
-	ADJ_INTR_LVL	/* Adjust INTR level. $r3 is clobbered.  */
-	jral    $r2
-	/* Restore used registers. */
-#ifdef NDS32_SAVE_ALL_REGS
-	RESTORE_ALL
-#else
-	RESTORE_PARTIAL
-#endif
-	iret
-
-#ifdef NDS32_SAVE_ALL_REGS
-#if defined(NDS32_NESTED)
-	.size	_nds32_e_sa_ns_4b, .-_nds32_e_sa_ns_4b
-#elif defined(NDS32_NESTED_READY)
-	.size	_nds32_e_sa_nr_4b, .-_nds32_e_sa_nr_4b
-#else /* Not nested handler. */
-	.size	_nds32_e_sa_nn_4b, .-_nds32_e_sa_nn_4b
-#endif /* endif for Nest Type */
-#else /* not NDS32_SAVE_ALL_REGS */
-#if defined(NDS32_NESTED)
-	.size	_nds32_e_ps_ns_4b, .-_nds32_e_ps_ns_4b
-#elif defined(NDS32_NESTED_READY)
-	.size	_nds32_e_ps_nr_4b, .-_nds32_e_ps_nr_4b
-#else /* Not nested handler. */
-	.size	_nds32_e_ps_nn_4b, .-_nds32_e_ps_nn_4b
-#endif /* endif for Nest Type */
-#endif /* not NDS32_SAVE_ALL_REGS */
diff --git a/libgcc/config/nds32/isr-library/intr_isr.S b/libgcc/config/nds32/isr-library/intr_isr.S
index c55da1c..90c5c25 100644
--- a/libgcc/config/nds32/isr-library/intr_isr.S
+++ b/libgcc/config/nds32/isr-library/intr_isr.S
@@ -23,6 +23,7 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

+#include "save_usr_regs.inc"
 #include "save_mac_regs.inc"
 #include "save_fpu_regs.inc"
 #include "save_fpu_regs_00.inc"
@@ -32,35 +33,33 @@
 #include "save_all.inc"
 #include "save_partial.inc"
 #include "adj_intr_lvl.inc"
-#include "restore_mac_regs.inc"
 #include "restore_fpu_regs_00.inc"
 #include "restore_fpu_regs_01.inc"
 #include "restore_fpu_regs_02.inc"
 #include "restore_fpu_regs_03.inc"
 #include "restore_fpu_regs.inc"
+#include "restore_mac_regs.inc"
+#include "restore_usr_regs.inc"
 #include "restore_all.inc"
 #include "restore_partial.inc"
+
 	.section .nds32_isr, "ax"       /* Put it in the section of 1st level handler. */
 	.align	1
-/*
-  First Level Handlers
-  1. First Level Handlers are invokded in vector section via jump instruction
-  with specific names for different configurations.
-  2. Naming Format: _nds32_e_SR_NT for exception handlers.
-		    _nds32_i_SR_NT for interrupt handlers.
-  2.1 All upper case letters are replaced with specific lower case letters encodings.
-  2.2 SR: Saved Registers
-      sa: Save All regs (context)
-      ps: Partial Save (all caller-saved regs)
-  2.3 NT: Nested Type
-      ns: nested
-      nn: not nested
-      nr: nested ready
-*/
-
-/*
-  This is original 16-byte vector size version.
-*/
+
+/* First Level Handlers
+   1. First Level Handlers are invoked in vector section via jump instruction
+      with specific names for different configurations.
+   2. Naming Format: _nds32_e_SR_NT for exception handlers.
+                     _nds32_i_SR_NT for interrupt handlers.
+     2.1 All upper case letters are replaced with specific lower case letters encodings.
+     2.2 SR -- Saved Registers
+         sa: Save All regs (context)
+         ps: Partial Save (all caller-saved regs)
+     2.3 NT -- Nested Type
+         ns: nested
+         nn: not nested
+         nr: nested ready */
+
 #ifdef NDS32_SAVE_ALL_REGS
 #if defined(NDS32_NESTED)
 	.globl	_nds32_i_sa_ns
@@ -91,21 +90,36 @@ _nds32_i_ps_nn:
 #endif /* endif for Nest Type */
 #endif /* not NDS32_SAVE_ALL_REGS */

-/*
-  This is 16-byte vector size version.
-  The vector id was restored into $r0 in vector by compiler.
-*/
+
+/* For 4-byte vector size version, the vector id is
+   extracted from $ITYPE and is set into $r0 by library.
+   For 16-byte vector size version, the vector id
+   is set into $r0 in vector section by compiler.  */
+
+/* Save used registers first.  */
 #ifdef NDS32_SAVE_ALL_REGS
         SAVE_ALL
 #else
         SAVE_PARTIAL
 #endif
-	/* Prepare to call 2nd level handler. */
+
+/* According to vector size, we need to have different implementation.  */
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* Prepare to call 2nd level handler.  */
+	la	$r2, _nds32_jmptbl_00
+	lw	$r2, [$r2 + $r0 << #2]
+	addi    $r0, $r0, #-9	/* Make interrupt vector id zero-based.  */
+	ADJ_INTR_LVL	/* Adjust INTR level.  $r3 is clobbered.  */
+	jral    $r2
+#else /* not __NDS32_ISR_VECTOR_SIZE_4__ */
+	/* Prepare to call 2nd level handler.  */
 	la	$r2, _nds32_jmptbl_09	/* For zero-based vcetor id.  */
 	lw	$r2, [$r2 + $r0 << #2]
 	ADJ_INTR_LVL	/* Adjust INTR level. $r3 is clobbered.  */
 	jral    $r2
-	/* Restore used registers. */
+#endif /* not __NDS32_ISR_VECTOR_SIZE_4__ */
+
+/* Restore used registers.  */
 #ifdef NDS32_SAVE_ALL_REGS
 	RESTORE_ALL
 #else
@@ -113,6 +127,7 @@ _nds32_i_ps_nn:
 #endif
 	iret

+
 #ifdef NDS32_SAVE_ALL_REGS
 #if defined(NDS32_NESTED)
 	.size	_nds32_i_sa_ns, .-_nds32_i_sa_ns
diff --git a/libgcc/config/nds32/isr-library/intr_isr_4b.S b/libgcc/config/nds32/isr-library/intr_isr_4b.S
deleted file mode 100644
index d82c007..0000000
--- a/libgcc/config/nds32/isr-library/intr_isr_4b.S
+++ /dev/null
@@ -1,134 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "save_mac_regs.inc"
-#include "save_fpu_regs.inc"
-#include "save_fpu_regs_00.inc"
-#include "save_fpu_regs_01.inc"
-#include "save_fpu_regs_02.inc"
-#include "save_fpu_regs_03.inc"
-#include "save_all.inc"
-#include "save_partial.inc"
-#include "adj_intr_lvl.inc"
-#include "restore_mac_regs.inc"
-#include "restore_fpu_regs_00.inc"
-#include "restore_fpu_regs_01.inc"
-#include "restore_fpu_regs_02.inc"
-#include "restore_fpu_regs_03.inc"
-#include "restore_fpu_regs.inc"
-#include "restore_all.inc"
-#include "restore_partial.inc"
-	.section .nds32_isr, "ax"       /* Put it in the section of 1st level handler. */
-	.align	1
-/*
-  First Level Handlers
-  1. First Level Handlers are invokded in vector section via jump instruction
-  with specific names for different configurations.
-  2. Naming Format: _nds32_e_SR_NT for exception handlers.
-		    _nds32_i_SR_NT for interrupt handlers.
-  2.1 All upper case letters are replaced with specific lower case letters encodings.
-  2.2 SR: Saved Registers
-      sa: Save All regs (context)
-      ps: Partial Save (all caller-saved regs)
-  2.3 NT: Nested Type
-      ns: nested
-      nn: not nested
-      nr: nested ready
-*/
-
-/*
-  This is 4-byte vector size version.
-  The "_4b" postfix was added for 4-byte version symbol.
-*/
-#ifdef NDS32_SAVE_ALL_REGS
-#if defined(NDS32_NESTED)
-	.globl	_nds32_i_sa_ns_4b
-	.type	_nds32_i_sa_ns_4b, @function
-_nds32_i_sa_ns_4b:
-#elif defined(NDS32_NESTED_READY)
-	.globl	_nds32_i_sa_nr_4b
-	.type	_nds32_i_sa_nr_4b, @function
-_nds32_i_sa_nr_4b:
-#else /* Not nested handler. */
-	.globl	_nds32_i_sa_nn_4b
-	.type	_nds32_i_sa_nn_4b, @function
-_nds32_i_sa_nn_4b:
-#endif /* endif for Nest Type */
-#else /* not NDS32_SAVE_ALL_REGS */
-#if defined(NDS32_NESTED)
-	.globl	_nds32_i_ps_ns_4b
-	.type	_nds32_i_ps_ns_4b, @function
-_nds32_i_ps_ns_4b:
-#elif defined(NDS32_NESTED_READY)
-	.globl	_nds32_i_ps_nr_4b
-	.type	_nds32_i_ps_nr_4b, @function
-_nds32_i_ps_nr_4b:
-#else /* Not nested handler. */
-	.globl	_nds32_i_ps_nn_4b
-	.type	_nds32_i_ps_nn_4b, @function
-_nds32_i_ps_nn_4b:
-#endif /* endif for Nest Type */
-#endif /* not NDS32_SAVE_ALL_REGS */
-
-/*
-  This is 4-byte vector size version.
-  The vector id was restored into $lp in vector by compiler.
-*/
-#ifdef NDS32_SAVE_ALL_REGS
-	SAVE_ALL_4B
-#else
-        SAVE_PARTIAL_4B
-#endif
-	/* Prepare to call 2nd level handler. */
-	la	$r2, _nds32_jmptbl_00
-	lw	$r2, [$r2 + $r0 << #2]
-	addi    $r0, $r0, #-9	/* Make interrput vector id zero-based.  */
-	ADJ_INTR_LVL	/* Adjust INTR level. $r3 is clobbered.  */
-	jral    $r2
-	/* Restore used registers. */
-#ifdef NDS32_SAVE_ALL_REGS
-	RESTORE_ALL
-#else
-	RESTORE_PARTIAL
-#endif
-	iret
-
-#ifdef NDS32_SAVE_ALL_REGS
-#if defined(NDS32_NESTED)
-	.size	_nds32_i_sa_ns_4b, .-_nds32_i_sa_ns_4b
-#elif defined(NDS32_NESTED_READY)
-	.size	_nds32_i_sa_nr_4b, .-_nds32_i_sa_nr_4b
-#else /* Not nested handler. */
-	.size	_nds32_i_sa_nn_4b, .-_nds32_i_sa_nn_4b
-#endif /* endif for Nest Type */
-#else /* not NDS32_SAVE_ALL_REGS */
-#if defined(NDS32_NESTED)
-	.size	_nds32_i_ps_ns_4b, .-_nds32_i_ps_ns_4b
-#elif defined(NDS32_NESTED_READY)
-	.size	_nds32_i_ps_nr_4b, .-_nds32_i_ps_nr_4b
-#else /* Not nested handler. */
-	.size	_nds32_i_ps_nn_4b, .-_nds32_i_ps_nn_4b
-#endif /* endif for Nest Type */
-#endif /* not NDS32_SAVE_ALL_REGS */
diff --git a/libgcc/config/nds32/isr-library/reset.S b/libgcc/config/nds32/isr-library/reset.S
index 961d731..8b9ccf5 100644
--- a/libgcc/config/nds32/isr-library/reset.S
+++ b/libgcc/config/nds32/isr-library/reset.S
@@ -26,22 +26,18 @@
 	.section .nds32_isr, "ax"	/* Put it in the section of 1st level handler.  */
 	.align	1
 	.weak	_SDA_BASE_	/* For reset handler only.  */
-	.weak	_FP_BASE_	/* For reset handler only.  */
 	.weak	_nds32_init_mem	/* User defined memory initialization function.  */
 	.globl	_start
 	.globl	_nds32_reset
 	.type	_nds32_reset, @function
 _nds32_reset:
 _start:
-#ifdef  NDS32_EXT_EX9
-	.no_ex9_begin
-#endif
 	/* Handle NMI and warm boot if any of them exists.  */
 	beqz	$sp, 1f		/* Reset, NMI or warm boot?  */
 	/* Either NMI or warm boot; save all regs.  */

 	/* Preserve registers for context-switching.  */
-#ifdef __NDS32_REDUCED_REGS__
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	/* For 16-reg mode.  */
 	smw.adm $r0, [$sp], $r10, #0x0
 	smw.adm $r15, [$sp], $r15, #0xf
@@ -49,10 +45,9 @@ _start:
 	/* For 32-reg mode.  */
 	smw.adm $r0, [$sp], $r27, #0xf
 #endif
-#ifdef NDS32_EXT_IFC
+#if __NDS32_EXT_IFC__
 	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
+	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep stack 8-byte alignment.  */
 #endif

 	la	$gp, _SDA_BASE_	/* Init GP for small data access.  */
@@ -71,12 +66,11 @@ _start:
 	bnez    $r0, 1f		/* If fail to resume, do cold boot.  */

 	/* Restore registers for context-switching.  */
-#ifdef NDS32_EXT_IFC
-	lmw.bim	$r1, [$sp], $r2, #0x0	/* Restore extra $r2 to keep
-					   stack 8-byte alignment.  */
+#if __NDS32_EXT_IFC__
+	lmw.bim	$r1, [$sp], $r2, #0x0	/* Restore extra $r2 to keep stack 8-byte alignment.  */
 	mtusr   $r1, $IFC_LP
 #endif
-#ifdef __NDS32_REDUCED_REGS__
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	/* For 16-reg mode.  */
 	lmw.bim	$r15, [$sp], $r15, #0xf
 	lmw.bim	$r0, [$sp], $r10, #0x0
@@ -88,6 +82,17 @@ _start:


 1:	/* Cold boot.  */
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* With vector ID feature for v3 architecture, default vector size is 4-byte.  */
+	/* Set IVB.ESZ = 0 (vector table entry size = 4 bytes)  */
+	mfsr    $r0, $IVB
+	li      $r1, #0xc000
+	or      $r0, $r0, $r1
+	xor     $r0, $r0, $r1
+	mtsr    $r0, $IVB
+	dsb
+#else
+	/* There is no vector ID feature, so the vector size must be 16-byte.  */
 	/* Set IVB.ESZ = 1 (vector table entry size = 16 bytes)  */
 	mfsr    $r0, $IVB
 	li	$r1, #0xffff3fff
@@ -95,36 +100,54 @@ _start:
 	ori	$r0, $r0, #0x4000
 	mtsr    $r0, $IVB
 	dsb
+#endif

 	la	$gp, _SDA_BASE_		/* Init $gp.  */
-	la	$fp, _FP_BASE_		/* Init $fp.  */
 	la	$sp, _stack		/* Init $sp.  */
-#ifdef  NDS32_EXT_EX9
-/*
- *	Initialize the table base of EX9 instruction
- *	ex9 generation needs to disable before the ITB is set
- */
-	mfsr    $r0, $MSC_CFG	/* Check if HW support of EX9.  */
+
+#if __NDS32_EXT_EX9__
+.L_init_itb:
+	/* Initialization for Instruction Table Base (ITB).
+	   The symbol _ITB_BASE_ is determined by Linker.
+	   Set $ITB only if MSC_CFG.EIT (cr4.b'24) is set.  */
+	mfsr    $r0, $MSC_CFG
 	srli	$r0, $r0, 24
 	andi	$r0, $r0, 0x1
-	beqz	$r0, 4f		/* Zero means HW does not support EX9.  */
-	la      $r0, _ITB_BASE_	/* Init $ITB.  */
+	beqz	$r0, 4f		/* Skip $ITB setup when EIT is not set.  */
+	la      $r0, _ITB_BASE_
 	mtusr   $r0, $ITB
-	.no_ex9_end
 4:
 #endif
-	la	$r15, _nds32_init_mem	/* Call DRAM init. _nds32_init_mem
-					  may written by C language.  */
+
+#if __NDS32_EXT_FPU_SP__ || __NDS32_EXT_FPU_DP__
+.L_init_fpu:
+	/* Initialize FPU
+	   Set FUCOP_CTL.CP0EN (fucpr.b'0).  */
+	mfsr    $r0, $FUCOP_CTL
+	ori     $r0, $r0, 0x1
+	mtsr    $r0, $FUCOP_CTL
+	dsb
+	/* According to [bugzilla #9425], set flush-to-zero mode.
+	   That is, set $FPCSR.DNZ(b'12) = 1.  */
+	FMFCSR	$r0
+	ori	$r0, $r0, 0x1000
+	FMTCSR	$r0
+	dsb
+#endif
+
+	/* Call DRAM init. _nds32_init_mem may be written in C language.  */
+	la	$r15, _nds32_init_mem
 	beqz	$r15, 6f
 	jral	$r15
 6:
 	l.w	$r15, _nds32_jmptbl_00	/* Load reset handler.  */
 	jral	$r15
-/* Reset handler() should never return in a RTOS or non-OS system.
-   In case it does return, an exception will be generated.
-   This exception will be caught either by default break handler or by EDM.
-   Default break handle may just do an infinite loop.
-   EDM will notify GDB and GDB will regain control when the ID is 0x7fff. */
+
+	/* Reset handler() should never return in a RTOS or non-OS system.
+	   In case it does return, an exception will be generated.
+	   This exception will be caught either by default break handler or by EDM.
+	   Default break handler may just do an infinite loop.
+	   EDM will notify GDB and GDB will regain control when the ID is 0x7fff.  */
 5:
 	break    #0x7fff
 	.size	_nds32_reset, .-_nds32_reset
diff --git a/libgcc/config/nds32/isr-library/reset_4b.S b/libgcc/config/nds32/isr-library/reset_4b.S
deleted file mode 100644
index 792e655..0000000
--- a/libgcc/config/nds32/isr-library/reset_4b.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section .nds32_isr, "ax"	/* Put it in the section of 1st level handler.  */
-	.align	1
-	.weak	_SDA_BASE_	/* For reset handler only.  */
-	.weak	_FP_BASE_	/* For reset handler only.  */
-	.weak	_nds32_init_mem	/* User defined memory initialization function.  */
-	.globl	_start
-	.globl	_nds32_reset_4b
-	.type	_nds32_reset_4b, @function
-_nds32_reset_4b:
-_start:
-#ifdef  NDS32_EXT_EX9
-	.no_ex9_begin
-#endif
-	/* Handle NMI and warm boot if any of them exists.  */
-	beqz	$sp, 1f		/* Reset, NMI or warm boot?  */
-	/* Either NMI or warm boot; save all regs.  */
-
-	/* Preserve registers for context-switching.  */
-#ifdef __NDS32_REDUCED_REGS__
-	/* For 16-reg mode.  */
-	smw.adm $r0, [$sp], $r10, #0x0
-	smw.adm $r15, [$sp], $r15, #0xf
-#else
-	/* For 32-reg mode.  */
-	smw.adm $r0, [$sp], $r27, #0xf
-#endif
-#ifdef NDS32_EXT_IFC
-	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
-#endif
-
-	la	$gp, _SDA_BASE_	/* Init GP for small data access.  */
-	move	$r0, $sp	/* Init parameter.  */
-	mfsr	$r1, $ITYPE	/* Check ITYPE for NMI or warm boot.  */
-	andi	$r1, $r1, #0xf
-	addi	$r1, $r1, #-1
-	beqz	$r1, 2f		/* Warm boot if true.  */
-	l.w	$r15, _nds32_nmih	/* Load NMI handler.  */
-	j	3f
-2:
-	l.w	$r15, _nds32_wrh	/* Load warm boot handler.  */
-3:
-	beqz    $r15, 1f	/* If no handler, do cold boot.  */
-	jral    $r15		/* Call handler.  */
-	bnez    $r0, 1f		/* If fail to resume, do cold boot.  */
-
-	/* Restore registers for context-switching.  */
-#ifdef NDS32_EXT_IFC
-	lmw.bim	$r1, [$sp], $r2, #0x0	/* Restore extra $r2 to keep
-					   stack 8-byte alignment.  */
-	mtusr   $r1, $IFC_LP
-#endif
-#ifdef __NDS32_REDUCED_REGS__
-	/* For 16-reg mode.  */
-	lmw.bim	$r15, [$sp], $r15, #0xf
-	lmw.bim	$r0, [$sp], $r10, #0x0
-#else
-	/* For 32-reg mode.  */
-	lmw.bim $r0, [$sp], $r27, #0xf
-#endif
-	iret	/* Resume operation.  */
-
-
-1:	/* Cold boot.  */
-	/* With vector ID feature, set default vector size to 4B.  */
-	/* Set IVB.ESZ = 0 (vector table entry size = 4 bytes)  */
-	mfsr    $r0, $IVB
-	li      $r1, #0xc000
-	or      $r0, $r0, $r1
-	xor     $r0, $r0, $r1
-	mtsr    $r0, $IVB
-	dsb
-
-	la	$gp, _SDA_BASE_		/* Init $gp.  */
-	la	$fp, _FP_BASE_		/* Init $fp.  */
-	la	$sp, _stack		/* Init $sp.  */
-#ifdef  NDS32_EXT_EX9
-/*
- *	Initialize the table base of EX9 instruction
- *	ex9 generation needs to disable before the ITB is set
- */
-	mfsr    $r0, $MSC_CFG	/* Check if HW support of EX9.  */
-	srli	$r0, $r0, 24
-	andi	$r0, $r0, 0x1
-	beqz	$r0, 4f		/* Zero means HW does not support EX9.  */
-	la      $r0, _ITB_BASE_	/* Init $ITB.  */
-	mtusr   $r0, $ITB
-	.no_ex9_end
-4:
-#endif
-	la	$r15, _nds32_init_mem	/* Call DRAM init. _nds32_init_mem
-					  may written by C language.  */
-	beqz	$r15, 6f
-	jral	$r15
-6:
-	l.w	$r15, _nds32_jmptbl_00	/* Load reset handler.  */
-	jral	$r15
-/* Reset handler() should never return in a RTOS or non-OS system.
-   In case it does return, an exception will be generated.
-   This exception will be caught either by default break handler or by EDM.
-   Default break handle may just do an infinite loop.
-   EDM will notify GDB and GDB will regain control when the ID is 0x7fff. */
-5:
-	break    #0x7fff
-	.size	_nds32_reset_4b, .-_nds32_reset_4b
diff --git a/libgcc/config/nds32/isr-library/restore_all.inc b/libgcc/config/nds32/isr-library/restore_all.inc
index c25b46e..96f87ec 100644
--- a/libgcc/config/nds32/isr-library/restore_all.inc
+++ b/libgcc/config/nds32/isr-library/restore_all.inc
@@ -31,15 +31,11 @@
 	mtsr	$r2, $IPSW
 	RESTORE_FPU_REGS
 	RESTORE_MAC_REGS
-#ifdef NDS32_EXT_IFC
-	lmw.bim	$r1, [$sp], $r2, #0x0	/* Restore extra $r2 to keep
-					   stack 8-byte alignment.  */
-	mtusr   $r1, $IFC_LP
-#endif
-#ifdef __NDS32_REDUCED_REGS__
+  RESTORE_USR_REGS
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	lmw.bim $r0, [$sp], $r10, #0x0	/* Restore all regs.  */
 	lmw.bim $r15, [$sp], $r15, #0xf
-#else /* not __NDS32_REDUCED_REGS__ */
+#else
 	lmw.bim $r0, [$sp], $r27, #0xf	/* Restore all regs.  */
 #endif
 .endm
diff --git a/libgcc/config/nds32/isr-library/restore_mac_regs.inc b/libgcc/config/nds32/isr-library/restore_mac_regs.inc
index 0ffc980..a15024c 100644
--- a/libgcc/config/nds32/isr-library/restore_mac_regs.inc
+++ b/libgcc/config/nds32/isr-library/restore_mac_regs.inc
@@ -24,7 +24,7 @@
    <http://www.gnu.org/licenses/>.  */

 .macro RESTORE_MAC_REGS
-#ifdef NDS32_DX_REGS
+#if __NDS32_DX_REGS__
 	lmw.bim	$r1, [$sp], $r4, #0x0
 	mtusr	$r1, $d0.lo
 	mtusr	$r2, $d0.hi
diff --git a/libgcc/config/nds32/isr-library/restore_partial.inc b/libgcc/config/nds32/isr-library/restore_partial.inc
index 70d5421..c07d30e 100644
--- a/libgcc/config/nds32/isr-library/restore_partial.inc
+++ b/libgcc/config/nds32/isr-library/restore_partial.inc
@@ -31,15 +31,11 @@
 	mtsr $r1, $IPC	/* Set IPC.  */
 	mtsr $r2, $IPSW	/* Set IPSW.  */
 #endif
-	RESTORE_FPU_REGS
-	RESTORE_MAC_REGS
-#ifdef NDS32_EXT_IFC
-	lmw.bim	$r1, [$sp], $r2, #0x0	/* Restore extra $r2 to keep
-					   stack 8-byte alignment.  */
-	mtusr   $r1, $IFC_LP
-#endif
+  RESTORE_FPU_REGS
+  RESTORE_MAC_REGS
+  RESTORE_USR_REGS
 	lmw.bim $r0, [$sp], $r5, #0x0	/* Restore all regs.  */
-#ifdef __NDS32_REDUCED_REGS__
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	lmw.bim $r15, [$sp], $r15, #0x2
 #else
 	lmw.bim $r15, [$sp], $r27, #0x2	/* Restore all regs.  */
diff --git a/libgcc/config/nds32/isr-library/vec_vid03_4b.S b/libgcc/config/nds32/isr-library/restore_usr_regs.inc
similarity index 72%
rename from libgcc/config/nds32/isr-library/vec_vid03_4b.S
rename to libgcc/config/nds32/isr-library/restore_usr_regs.inc
index cd30906..c8f6e4a 100644
--- a/libgcc/config/nds32/isr-library/vec_vid03_4b.S
+++ b/libgcc/config/nds32/isr-library/restore_usr_regs.inc
@@ -23,12 +23,20 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

-	.section	.nds32_vector.03, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_03_4b
-	.type	_nds32_vector_03_4b, @function
-_nds32_vector_03_4b:
-1:
-	j	1b
-	.size	_nds32_vector_03_4b, .-_nds32_vector_03_4b
+.macro RESTORE_USR_REGS	/* Pop what SAVE_USR_REGS pushed; uses $r1-$r4 as scratch.  */
+#if __NDS32_EXT_IFC__ && (__NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__)
+  lmw.bim $r1, [$sp], $r4, #0x0	/* Reload the four saved words.  */
+  mtusr   $r1, $IFC_LP
+  mtusr   $r2, $LB
+  mtusr   $r3, $LE
+  mtusr   $r4, $LC
+#elif __NDS32_EXT_IFC__
+  lmw.bim	$r1, [$sp], $r2, #0x0	/* $r2 is only the 8-byte alignment filler.  */
+  mtusr   $r1, $IFC_LP
+#elif __NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__
+  lmw.bim $r1, [$sp], $r4, #0x0	/* $r4 is only the 8-byte alignment filler.  */
+  mtusr   $r1, $LB
+  mtusr   $r2, $LE
+  mtusr   $r3, $LC
+#endif
+.endm
diff --git a/libgcc/config/nds32/isr-library/save_all.inc b/libgcc/config/nds32/isr-library/save_all.inc
index 20eb29d..c926664 100644
--- a/libgcc/config/nds32/isr-library/save_all.inc
+++ b/libgcc/config/nds32/isr-library/save_all.inc
@@ -23,45 +23,42 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

-.macro SAVE_ALL_4B
-#ifdef __NDS32_REDUCED_REGS__
+#if __NDS32_ISR_VECTOR_SIZE_4__
+
+/* If vector size is 4-byte, we have to save registers
+   in the macro implementation.  */
+.macro SAVE_ALL
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	smw.adm $r15, [$sp], $r15, #0xf
 	smw.adm $r0, [$sp], $r10, #0x0
-#else /* not __NDS32_REDUCED_REGS__ */
+#else
 	smw.adm $r0, [$sp], $r27, #0xf
-#endif /* not __NDS32_REDUCED_REGS__ */
-#ifdef NDS32_EXT_IFC
-	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
 #endif
-	SAVE_MAC_REGS
-	SAVE_FPU_REGS
+  SAVE_USR_REGS
+  SAVE_MAC_REGS
+  SAVE_FPU_REGS
 	mfsr	$r1, $IPC	/* Get IPC.  */
 	mfsr	$r2, $IPSW	/* Get IPSW.  */
 	smw.adm	$r1, [$sp], $r2, #0x0	/* Push IPC, IPSW.  */
 	move	$r1, $sp	/* $r1 is ptr to NDS32_CONTEXT.  */
 	mfsr	$r0, $ITYPE	/* Get VID to $r0.  */
 	srli	$r0, $r0, #5
-#ifdef __NDS32_ISA_V2__
 	andi	$r0, $r0, #127
-#else
-	fexti33	$r0, #6
-#endif
 .endm

+#else /* not __NDS32_ISR_VECTOR_SIZE_4__ */
+
+/* If the vector size is 16 bytes, some of the work can be done in
+   the vector section generated by the compiler, so that we
+   can implement less in the macro.  */
 .macro SAVE_ALL
-/* SAVE_REG_TBL code has been moved to
-   vector table generated by compiler.  */
-#ifdef NDS32_EXT_IFC
-	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
-#endif
-	SAVE_MAC_REGS
-	SAVE_FPU_REGS
+  SAVE_USR_REGS
+  SAVE_MAC_REGS
+  SAVE_FPU_REGS
 	mfsr	$r1, $IPC	/* Get IPC.  */
 	mfsr	$r2, $IPSW	/* Get IPSW.  */
 	smw.adm	$r1, [$sp], $r2, #0x0	/* Push IPC, IPSW.  */
 	move	$r1, $sp	/* $r1 is ptr to NDS32_CONTEXT.  */
 .endm
+
+#endif /* not __NDS32_ISR_VECTOR_SIZE_4__ */
diff --git a/libgcc/config/nds32/isr-library/save_mac_regs.inc b/libgcc/config/nds32/isr-library/save_mac_regs.inc
index ddb5e77..2d79d70 100644
--- a/libgcc/config/nds32/isr-library/save_mac_regs.inc
+++ b/libgcc/config/nds32/isr-library/save_mac_regs.inc
@@ -24,7 +24,7 @@
    <http://www.gnu.org/licenses/>.  */

 .macro SAVE_MAC_REGS
-#ifdef NDS32_DX_REGS
+#if __NDS32_DX_REGS__
 	mfusr	$r1, $d0.lo
 	mfusr	$r2, $d0.hi
 	mfusr	$r3, $d1.lo
diff --git a/libgcc/config/nds32/isr-library/save_partial.inc b/libgcc/config/nds32/isr-library/save_partial.inc
index ee514c4..0c6d481 100644
--- a/libgcc/config/nds32/isr-library/save_partial.inc
+++ b/libgcc/config/nds32/isr-library/save_partial.inc
@@ -23,20 +23,20 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

-.macro SAVE_PARTIAL_4B
-#ifdef __NDS32_REDUCED_REGS__
+#if __NDS32_ISR_VECTOR_SIZE_4__
+
+/* If vector size is 4-byte, we have to save registers
+   in the macro implementation.  */
+.macro SAVE_PARTIAL
+#if __NDS32_REDUCED_REGS__ || __NDS32_REDUCE_REGS
 	smw.adm $r15, [$sp], $r15, #0x2
-#else /* not __NDS32_REDUCED_REGS__ */
+#else
 	smw.adm $r15, [$sp], $r27, #0x2
-#endif /* not __NDS32_REDUCED_REGS__ */
-	smw.adm $r0, [$sp], $r5, #0x0
-#ifdef NDS32_EXT_IFC
-	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
 #endif
-	SAVE_MAC_REGS
-	SAVE_FPU_REGS
+	smw.adm $r0, [$sp], $r5, #0x0
+  SAVE_USR_REGS
+  SAVE_MAC_REGS
+  SAVE_FPU_REGS
 #if defined(NDS32_NESTED) || defined(NDS32_NESTED_READY)
        mfsr    $r1, $IPC       /* Get IPC.  */
        mfsr    $r2, $IPSW      /* Get IPSW.  */
@@ -44,26 +44,24 @@
 #endif
 	mfsr	$r0, $ITYPE	/* Get VID to $r0.  */
 	srli	$r0, $r0, #5
-#ifdef __NDS32_ISA_V2__
 	andi	$r0, $r0, #127
-#else
-	fexti33	$r0, #6
-#endif
 .endm

+#else /* not __NDS32_ISR_VECTOR_SIZE_4__ */
+
+/* If the vector size is 16 bytes, some of the work can be done in
+   the vector section generated by the compiler, so that we
+   can implement less in the macro.  */
+
 .macro SAVE_PARTIAL
-/* SAVE_CALLER_REGS code has been moved to
-   vector table generated by compiler.  */
-#ifdef NDS32_EXT_IFC
-	mfusr   $r1, $IFC_LP
-	smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep
-					   stack 8-byte alignment.  */
-#endif
-	SAVE_MAC_REGS
-	SAVE_FPU_REGS
+  SAVE_USR_REGS
+  SAVE_MAC_REGS
+  SAVE_FPU_REGS
 #if defined(NDS32_NESTED) || defined(NDS32_NESTED_READY)
        mfsr    $r1, $IPC       /* Get IPC.  */
        mfsr    $r2, $IPSW      /* Get IPSW.  */
        smw.adm $r1, [$sp], $r2, #0x0   /* Push IPC, IPSW.  */
 #endif
 .endm
+
+#endif /* not __NDS32_ISR_VECTOR_SIZE_4__ */
diff --git a/libgcc/config/nds32/isr-library/vec_vid00_4b.S b/libgcc/config/nds32/isr-library/save_usr_regs.inc
similarity index 61%
rename from libgcc/config/nds32/isr-library/vec_vid00_4b.S
rename to libgcc/config/nds32/isr-library/save_usr_regs.inc
index e1a37b4..b6807d7 100644
--- a/libgcc/config/nds32/isr-library/vec_vid00_4b.S
+++ b/libgcc/config/nds32/isr-library/save_usr_regs.inc
@@ -23,12 +23,22 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */

-	.section	.nds32_vector.00, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_00_4b
-	.type	_nds32_vector_00_4b, @function
-_nds32_vector_00_4b:
-1:
-	j	1b
-	.size	_nds32_vector_00_4b, .-_nds32_vector_00_4b
+.macro SAVE_USR_REGS
+/* Push the user special registers selected by the enabled ISA extensions,
+   using $r1-$r4 as scratch.  Always push an even word count (8-byte alignment).  */
+#if __NDS32_EXT_IFC__ && (__NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__)
+  mfusr   $r1, $IFC_LP
+  mfusr   $r2, $LB
+  mfusr   $r3, $LE
+  mfusr   $r4, $LC
+  smw.adm $r1, [$sp], $r4, #0x0 /* Four words: already an even count.  */
+#elif __NDS32_EXT_IFC__
+  mfusr   $r1, $IFC_LP
+  smw.adm $r1, [$sp], $r2, #0x0	/* Save extra $r2 to keep stack 8-byte aligned.  */
+#elif (__NDS32_EXT_ZOL__ || __NDS32_EXT_DSP__)
+  mfusr   $r1, $LB
+  mfusr   $r2, $LE
+  mfusr   $r3, $LC
+  smw.adm $r1, [$sp], $r4, #0x0	/* Save extra $r4 to keep stack 8-byte aligned.  */
+#endif
+.endm
diff --git a/libgcc/config/nds32/isr-library/vec_vid00.S b/libgcc/config/nds32/isr-library/vec_vid00.S
index ccdbd19..f02e92c 100644
--- a/libgcc/config/nds32/isr-library/vec_vid00.S
+++ b/libgcc/config/nds32/isr-library/vec_vid00.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.00, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_00
 	.type	_nds32_vector_00, @function
 _nds32_vector_00:
diff --git a/libgcc/config/nds32/isr-library/vec_vid01.S b/libgcc/config/nds32/isr-library/vec_vid01.S
index ed5a88e..542fcf8 100644
--- a/libgcc/config/nds32/isr-library/vec_vid01.S
+++ b/libgcc/config/nds32/isr-library/vec_vid01.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.01, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_01
 	.type	_nds32_vector_01, @function
 _nds32_vector_01:
diff --git a/libgcc/config/nds32/isr-library/vec_vid01_4b.S b/libgcc/config/nds32/isr-library/vec_vid01_4b.S
deleted file mode 100644
index 239bd75..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid01_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.01, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_01_4b
-	.type	_nds32_vector_01_4b, @function
-_nds32_vector_01_4b:
-1:
-	j	1b
-	.size	_nds32_vector_01_4b, .-_nds32_vector_01_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid02.S b/libgcc/config/nds32/isr-library/vec_vid02.S
index 1a95a57..72b8b56 100644
--- a/libgcc/config/nds32/isr-library/vec_vid02.S
+++ b/libgcc/config/nds32/isr-library/vec_vid02.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.02, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_02
 	.type	_nds32_vector_02, @function
 _nds32_vector_02:
diff --git a/libgcc/config/nds32/isr-library/vec_vid02_4b.S b/libgcc/config/nds32/isr-library/vec_vid02_4b.S
deleted file mode 100644
index c532e62..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid02_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.02, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_02_4b
-	.type	_nds32_vector_02_4b, @function
-_nds32_vector_02_4b:
-1:
-	j	1b
-	.size	_nds32_vector_02_4b, .-_nds32_vector_02_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid03.S b/libgcc/config/nds32/isr-library/vec_vid03.S
index 9bc572a..b0f8a60 100644
--- a/libgcc/config/nds32/isr-library/vec_vid03.S
+++ b/libgcc/config/nds32/isr-library/vec_vid03.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.03, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_03
 	.type	_nds32_vector_03, @function
 _nds32_vector_03:
diff --git a/libgcc/config/nds32/isr-library/vec_vid04.S b/libgcc/config/nds32/isr-library/vec_vid04.S
index e8d4e10..d76ef73 100644
--- a/libgcc/config/nds32/isr-library/vec_vid04.S
+++ b/libgcc/config/nds32/isr-library/vec_vid04.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.04, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_04
 	.type	_nds32_vector_04, @function
 _nds32_vector_04:
diff --git a/libgcc/config/nds32/isr-library/vec_vid04_4b.S b/libgcc/config/nds32/isr-library/vec_vid04_4b.S
deleted file mode 100644
index 21fc77e..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid04_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.04, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_04_4b
-	.type	_nds32_vector_04_4b, @function
-_nds32_vector_04_4b:
-1:
-	j	1b
-	.size	_nds32_vector_04_4b, .-_nds32_vector_04_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid05.S b/libgcc/config/nds32/isr-library/vec_vid05.S
index 1621a9d..ed5a5bb 100644
--- a/libgcc/config/nds32/isr-library/vec_vid05.S
+++ b/libgcc/config/nds32/isr-library/vec_vid05.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.05, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_05
 	.type	_nds32_vector_05, @function
 _nds32_vector_05:
diff --git a/libgcc/config/nds32/isr-library/vec_vid05_4b.S b/libgcc/config/nds32/isr-library/vec_vid05_4b.S
deleted file mode 100644
index b86fe19..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid05_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.05, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_05_4b
-	.type	_nds32_vector_05_4b, @function
-_nds32_vector_05_4b:
-1:
-	j	1b
-	.size	_nds32_vector_05_4b, .-_nds32_vector_05_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid06.S b/libgcc/config/nds32/isr-library/vec_vid06.S
index 934f0b1..834c7de 100644
--- a/libgcc/config/nds32/isr-library/vec_vid06.S
+++ b/libgcc/config/nds32/isr-library/vec_vid06.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.06, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_06
 	.type	_nds32_vector_06, @function
 _nds32_vector_06:
diff --git a/libgcc/config/nds32/isr-library/vec_vid06_4b.S b/libgcc/config/nds32/isr-library/vec_vid06_4b.S
deleted file mode 100644
index 3624cfd..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid06_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.06, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_06_4b
-	.type	_nds32_vector_06_4b, @function
-_nds32_vector_06_4b:
-1:
-	j	1b
-	.size	_nds32_vector_06_4b, .-_nds32_vector_06_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid07.S b/libgcc/config/nds32/isr-library/vec_vid07.S
index 0b0484d..cb3b33a 100644
--- a/libgcc/config/nds32/isr-library/vec_vid07.S
+++ b/libgcc/config/nds32/isr-library/vec_vid07.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.07, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_07
 	.type	_nds32_vector_07, @function
 _nds32_vector_07:
diff --git a/libgcc/config/nds32/isr-library/vec_vid07_4b.S b/libgcc/config/nds32/isr-library/vec_vid07_4b.S
deleted file mode 100644
index 997ca75..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid07_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.07, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_07_4b
-	.type	_nds32_vector_07_4b, @function
-_nds32_vector_07_4b:
-1:
-	j	1b
-	.size	_nds32_vector_07_4b, .-_nds32_vector_07_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid08.S b/libgcc/config/nds32/isr-library/vec_vid08.S
index 2a30375..b4ae947 100644
--- a/libgcc/config/nds32/isr-library/vec_vid08.S
+++ b/libgcc/config/nds32/isr-library/vec_vid08.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.08, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_08
 	.type	_nds32_vector_08, @function
 _nds32_vector_08:
diff --git a/libgcc/config/nds32/isr-library/vec_vid08_4b.S b/libgcc/config/nds32/isr-library/vec_vid08_4b.S
deleted file mode 100644
index 83546d1..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid08_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.08, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_08_4b
-	.type	_nds32_vector_08_4b, @function
-_nds32_vector_08_4b:
-1:
-	j	1b
-	.size	_nds32_vector_08_4b, .-_nds32_vector_08_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid09.S b/libgcc/config/nds32/isr-library/vec_vid09.S
index 9aeaf78..47fa5c1 100644
--- a/libgcc/config/nds32/isr-library/vec_vid09.S
+++ b/libgcc/config/nds32/isr-library/vec_vid09.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.09, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_09
 	.type	_nds32_vector_09, @function
 _nds32_vector_09:
diff --git a/libgcc/config/nds32/isr-library/vec_vid09_4b.S b/libgcc/config/nds32/isr-library/vec_vid09_4b.S
deleted file mode 100644
index 2d1944f..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid09_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.09, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_09_4b
-	.type	_nds32_vector_09_4b, @function
-_nds32_vector_09_4b:
-1:
-	j	1b
-	.size	_nds32_vector_09_4b, .-_nds32_vector_09_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid10.S b/libgcc/config/nds32/isr-library/vec_vid10.S
index 411edd7..6bf2c7c 100644
--- a/libgcc/config/nds32/isr-library/vec_vid10.S
+++ b/libgcc/config/nds32/isr-library/vec_vid10.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.10, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_10
 	.type	_nds32_vector_10, @function
 _nds32_vector_10:
diff --git a/libgcc/config/nds32/isr-library/vec_vid10_4b.S b/libgcc/config/nds32/isr-library/vec_vid10_4b.S
deleted file mode 100644
index 04761ab..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid10_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.10, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_10_4b
-	.type	_nds32_vector_10_4b, @function
-_nds32_vector_10_4b:
-1:
-	j	1b
-	.size	_nds32_vector_10_4b, .-_nds32_vector_10_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid11.S b/libgcc/config/nds32/isr-library/vec_vid11.S
index 8de45a4..86975ea 100644
--- a/libgcc/config/nds32/isr-library/vec_vid11.S
+++ b/libgcc/config/nds32/isr-library/vec_vid11.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.11, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_11
 	.type	_nds32_vector_11, @function
 _nds32_vector_11:
diff --git a/libgcc/config/nds32/isr-library/vec_vid11_4b.S b/libgcc/config/nds32/isr-library/vec_vid11_4b.S
deleted file mode 100644
index 328c1e6..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid11_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.11, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_11_4b
-	.type	_nds32_vector_11_4b, @function
-_nds32_vector_11_4b:
-1:
-	j	1b
-	.size	_nds32_vector_11_4b, .-_nds32_vector_11_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid12.S b/libgcc/config/nds32/isr-library/vec_vid12.S
index ff5c6df..07cb7de 100644
--- a/libgcc/config/nds32/isr-library/vec_vid12.S
+++ b/libgcc/config/nds32/isr-library/vec_vid12.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.12, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_12
 	.type	_nds32_vector_12, @function
 _nds32_vector_12:
diff --git a/libgcc/config/nds32/isr-library/vec_vid12_4b.S b/libgcc/config/nds32/isr-library/vec_vid12_4b.S
deleted file mode 100644
index 52b7d23..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid12_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.12, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_12_4b
-	.type	_nds32_vector_12_4b, @function
-_nds32_vector_12_4b:
-1:
-	j	1b
-	.size	_nds32_vector_12_4b, .-_nds32_vector_12_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid13.S b/libgcc/config/nds32/isr-library/vec_vid13.S
index 66014c3..5ac1a83 100644
--- a/libgcc/config/nds32/isr-library/vec_vid13.S
+++ b/libgcc/config/nds32/isr-library/vec_vid13.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.13, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_13
 	.type	_nds32_vector_13, @function
 _nds32_vector_13:
diff --git a/libgcc/config/nds32/isr-library/vec_vid13_4b.S b/libgcc/config/nds32/isr-library/vec_vid13_4b.S
deleted file mode 100644
index 59029ad..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid13_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.13, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_13_4b
-	.type	_nds32_vector_13_4b, @function
-_nds32_vector_13_4b:
-1:
-	j	1b
-	.size	_nds32_vector_13_4b, .-_nds32_vector_13_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid14.S b/libgcc/config/nds32/isr-library/vec_vid14.S
index ca6f66f..5116f2f 100644
--- a/libgcc/config/nds32/isr-library/vec_vid14.S
+++ b/libgcc/config/nds32/isr-library/vec_vid14.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.14, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_14
 	.type	_nds32_vector_14, @function
 _nds32_vector_14:
diff --git a/libgcc/config/nds32/isr-library/vec_vid14_4b.S b/libgcc/config/nds32/isr-library/vec_vid14_4b.S
deleted file mode 100644
index 0d2afe4..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid14_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.14, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_14_4b
-	.type	_nds32_vector_14_4b, @function
-_nds32_vector_14_4b:
-1:
-	j	1b
-	.size	_nds32_vector_14_4b, .-_nds32_vector_14_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid15.S b/libgcc/config/nds32/isr-library/vec_vid15.S
index c94b42a..03449c0 100644
--- a/libgcc/config/nds32/isr-library/vec_vid15.S
+++ b/libgcc/config/nds32/isr-library/vec_vid15.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.15, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_15
 	.type	_nds32_vector_15, @function
 _nds32_vector_15:
diff --git a/libgcc/config/nds32/isr-library/vec_vid15_4b.S b/libgcc/config/nds32/isr-library/vec_vid15_4b.S
deleted file mode 100644
index 60799d7..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid15_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.15, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_15_4b
-	.type	_nds32_vector_15_4b, @function
-_nds32_vector_15_4b:
-1:
-	j	1b
-	.size	_nds32_vector_15_4b, .-_nds32_vector_15_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid16.S b/libgcc/config/nds32/isr-library/vec_vid16.S
index f19454d..b01d673 100644
--- a/libgcc/config/nds32/isr-library/vec_vid16.S
+++ b/libgcc/config/nds32/isr-library/vec_vid16.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.16, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_16
 	.type	_nds32_vector_16, @function
 _nds32_vector_16:
diff --git a/libgcc/config/nds32/isr-library/vec_vid16_4b.S b/libgcc/config/nds32/isr-library/vec_vid16_4b.S
deleted file mode 100644
index 6791204..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid16_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.16, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_16_4b
-	.type	_nds32_vector_16_4b, @function
-_nds32_vector_16_4b:
-1:
-	j	1b
-	.size	_nds32_vector_16_4b, .-_nds32_vector_16_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid17.S b/libgcc/config/nds32/isr-library/vec_vid17.S
index 486a0aa..c6ed785 100644
--- a/libgcc/config/nds32/isr-library/vec_vid17.S
+++ b/libgcc/config/nds32/isr-library/vec_vid17.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.17, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_17
 	.type	_nds32_vector_17, @function
 _nds32_vector_17:
diff --git a/libgcc/config/nds32/isr-library/vec_vid17_4b.S b/libgcc/config/nds32/isr-library/vec_vid17_4b.S
deleted file mode 100644
index 04f4285..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid17_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.17, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_17_4b
-	.type	_nds32_vector_17_4b, @function
-_nds32_vector_17_4b:
-1:
-	j	1b
-	.size	_nds32_vector_17_4b, .-_nds32_vector_17_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid18.S b/libgcc/config/nds32/isr-library/vec_vid18.S
index 137511f..e0e7b7e 100644
--- a/libgcc/config/nds32/isr-library/vec_vid18.S
+++ b/libgcc/config/nds32/isr-library/vec_vid18.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.18, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_18
 	.type	_nds32_vector_18, @function
 _nds32_vector_18:
diff --git a/libgcc/config/nds32/isr-library/vec_vid18_4b.S b/libgcc/config/nds32/isr-library/vec_vid18_4b.S
deleted file mode 100644
index 4d80192..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid18_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.18, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_18_4b
-	.type	_nds32_vector_18_4b, @function
-_nds32_vector_18_4b:
-1:
-	j	1b
-	.size	_nds32_vector_18_4b, .-_nds32_vector_18_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid19.S b/libgcc/config/nds32/isr-library/vec_vid19.S
index 791e135..ef7075f 100644
--- a/libgcc/config/nds32/isr-library/vec_vid19.S
+++ b/libgcc/config/nds32/isr-library/vec_vid19.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.19, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_19
 	.type	_nds32_vector_19, @function
 _nds32_vector_19:
diff --git a/libgcc/config/nds32/isr-library/vec_vid19_4b.S b/libgcc/config/nds32/isr-library/vec_vid19_4b.S
deleted file mode 100644
index 87d4c7c..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid19_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.19, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_19_4b
-	.type	_nds32_vector_19_4b, @function
-_nds32_vector_19_4b:
-1:
-	j	1b
-	.size	_nds32_vector_19_4b, .-_nds32_vector_19_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid20.S b/libgcc/config/nds32/isr-library/vec_vid20.S
index e7ab0e3..99bcf01 100644
--- a/libgcc/config/nds32/isr-library/vec_vid20.S
+++ b/libgcc/config/nds32/isr-library/vec_vid20.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.20, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_20
 	.type	_nds32_vector_20, @function
 _nds32_vector_20:
diff --git a/libgcc/config/nds32/isr-library/vec_vid20_4b.S b/libgcc/config/nds32/isr-library/vec_vid20_4b.S
deleted file mode 100644
index 308385a..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid20_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.20, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_20_4b
-	.type	_nds32_vector_20_4b, @function
-_nds32_vector_20_4b:
-1:
-	j	1b
-	.size	_nds32_vector_20_4b, .-_nds32_vector_20_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid21.S b/libgcc/config/nds32/isr-library/vec_vid21.S
index 315ae56..8c66bef 100644
--- a/libgcc/config/nds32/isr-library/vec_vid21.S
+++ b/libgcc/config/nds32/isr-library/vec_vid21.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.21, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_21
 	.type	_nds32_vector_21, @function
 _nds32_vector_21:
diff --git a/libgcc/config/nds32/isr-library/vec_vid21_4b.S b/libgcc/config/nds32/isr-library/vec_vid21_4b.S
deleted file mode 100644
index 16cf02a..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid21_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.21, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_21_4b
-	.type	_nds32_vector_21_4b, @function
-_nds32_vector_21_4b:
-1:
-	j	1b
-	.size	_nds32_vector_21_4b, .-_nds32_vector_21_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid22.S b/libgcc/config/nds32/isr-library/vec_vid22.S
index 6f9de85..5c442ce 100644
--- a/libgcc/config/nds32/isr-library/vec_vid22.S
+++ b/libgcc/config/nds32/isr-library/vec_vid22.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.22, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_22
 	.type	_nds32_vector_22, @function
 _nds32_vector_22:
diff --git a/libgcc/config/nds32/isr-library/vec_vid22_4b.S b/libgcc/config/nds32/isr-library/vec_vid22_4b.S
deleted file mode 100644
index 587ee7f..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid22_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.22, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_22_4b
-	.type	_nds32_vector_22_4b, @function
-_nds32_vector_22_4b:
-1:
-	j	1b
-	.size	_nds32_vector_22_4b, .-_nds32_vector_22_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid23.S b/libgcc/config/nds32/isr-library/vec_vid23.S
index 956b585..c5d73df 100644
--- a/libgcc/config/nds32/isr-library/vec_vid23.S
+++ b/libgcc/config/nds32/isr-library/vec_vid23.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.23, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_23
 	.type	_nds32_vector_23, @function
 _nds32_vector_23:
diff --git a/libgcc/config/nds32/isr-library/vec_vid23_4b.S b/libgcc/config/nds32/isr-library/vec_vid23_4b.S
deleted file mode 100644
index 5e4b643..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid23_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.23, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_23_4b
-	.type	_nds32_vector_23_4b, @function
-_nds32_vector_23_4b:
-1:
-	j	1b
-	.size	_nds32_vector_23_4b, .-_nds32_vector_23_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid24.S b/libgcc/config/nds32/isr-library/vec_vid24.S
index 57086e9..fe7dada 100644
--- a/libgcc/config/nds32/isr-library/vec_vid24.S
+++ b/libgcc/config/nds32/isr-library/vec_vid24.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.24, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_24
 	.type	_nds32_vector_24, @function
 _nds32_vector_24:
diff --git a/libgcc/config/nds32/isr-library/vec_vid24_4b.S b/libgcc/config/nds32/isr-library/vec_vid24_4b.S
deleted file mode 100644
index 43495f9..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid24_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.24, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_24_4b
-	.type	_nds32_vector_24_4b, @function
-_nds32_vector_24_4b:
-1:
-	j	1b
-	.size	_nds32_vector_24_4b, .-_nds32_vector_24_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid25.S b/libgcc/config/nds32/isr-library/vec_vid25.S
index 61fa526..ada24e4 100644
--- a/libgcc/config/nds32/isr-library/vec_vid25.S
+++ b/libgcc/config/nds32/isr-library/vec_vid25.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.25, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_25
 	.type	_nds32_vector_25, @function
 _nds32_vector_25:
diff --git a/libgcc/config/nds32/isr-library/vec_vid25_4b.S b/libgcc/config/nds32/isr-library/vec_vid25_4b.S
deleted file mode 100644
index 1ce6cf3..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid25_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.25, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_25_4b
-	.type	_nds32_vector_25_4b, @function
-_nds32_vector_25_4b:
-1:
-	j	1b
-	.size	_nds32_vector_25_4b, .-_nds32_vector_25_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid26.S b/libgcc/config/nds32/isr-library/vec_vid26.S
index 3d9191d..1f97945 100644
--- a/libgcc/config/nds32/isr-library/vec_vid26.S
+++ b/libgcc/config/nds32/isr-library/vec_vid26.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.26, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_26
 	.type	_nds32_vector_26, @function
 _nds32_vector_26:
diff --git a/libgcc/config/nds32/isr-library/vec_vid26_4b.S b/libgcc/config/nds32/isr-library/vec_vid26_4b.S
deleted file mode 100644
index 5803247..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid26_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.26, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_26_4b
-	.type	_nds32_vector_26_4b, @function
-_nds32_vector_26_4b:
-1:
-	j	1b
-	.size	_nds32_vector_26_4b, .-_nds32_vector_26_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid27.S b/libgcc/config/nds32/isr-library/vec_vid27.S
index ff12cfb..f440a8b 100644
--- a/libgcc/config/nds32/isr-library/vec_vid27.S
+++ b/libgcc/config/nds32/isr-library/vec_vid27.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.27, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_27
 	.type	_nds32_vector_27, @function
 _nds32_vector_27:
diff --git a/libgcc/config/nds32/isr-library/vec_vid27_4b.S b/libgcc/config/nds32/isr-library/vec_vid27_4b.S
deleted file mode 100644
index d61e3f9..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid27_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.27, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_27_4b
-	.type	_nds32_vector_27_4b, @function
-_nds32_vector_27_4b:
-1:
-	j	1b
-	.size	_nds32_vector_27_4b, .-_nds32_vector_27_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid28.S b/libgcc/config/nds32/isr-library/vec_vid28.S
index 6b7610e..e1621c7 100644
--- a/libgcc/config/nds32/isr-library/vec_vid28.S
+++ b/libgcc/config/nds32/isr-library/vec_vid28.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.28, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_28
 	.type	_nds32_vector_28, @function
 _nds32_vector_28:
diff --git a/libgcc/config/nds32/isr-library/vec_vid28_4b.S b/libgcc/config/nds32/isr-library/vec_vid28_4b.S
deleted file mode 100644
index a39d015..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid28_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.28, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_28_4b
-	.type	_nds32_vector_28_4b, @function
-_nds32_vector_28_4b:
-1:
-	j	1b
-	.size	_nds32_vector_28_4b, .-_nds32_vector_28_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid29.S b/libgcc/config/nds32/isr-library/vec_vid29.S
index b995841..4fa29c1 100644
--- a/libgcc/config/nds32/isr-library/vec_vid29.S
+++ b/libgcc/config/nds32/isr-library/vec_vid29.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.29, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_29
 	.type	_nds32_vector_29, @function
 _nds32_vector_29:
diff --git a/libgcc/config/nds32/isr-library/vec_vid29_4b.S b/libgcc/config/nds32/isr-library/vec_vid29_4b.S
deleted file mode 100644
index 803f323..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid29_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.29, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_29_4b
-	.type	_nds32_vector_29_4b, @function
-_nds32_vector_29_4b:
-1:
-	j	1b
-	.size	_nds32_vector_29_4b, .-_nds32_vector_29_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid30.S b/libgcc/config/nds32/isr-library/vec_vid30.S
index 57d1507..214e67b 100644
--- a/libgcc/config/nds32/isr-library/vec_vid30.S
+++ b/libgcc/config/nds32/isr-library/vec_vid30.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.30, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_30
 	.type	_nds32_vector_30, @function
 _nds32_vector_30:
diff --git a/libgcc/config/nds32/isr-library/vec_vid30_4b.S b/libgcc/config/nds32/isr-library/vec_vid30_4b.S
deleted file mode 100644
index a2a1e3e..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid30_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.30, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_30_4b
-	.type	_nds32_vector_30_4b, @function
-_nds32_vector_30_4b:
-1:
-	j	1b
-	.size	_nds32_vector_30_4b, .-_nds32_vector_30_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid31.S b/libgcc/config/nds32/isr-library/vec_vid31.S
index f9aee4e..b758b8c 100644
--- a/libgcc/config/nds32/isr-library/vec_vid31.S
+++ b/libgcc/config/nds32/isr-library/vec_vid31.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.31, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_31
 	.type	_nds32_vector_31, @function
 _nds32_vector_31:
diff --git a/libgcc/config/nds32/isr-library/vec_vid31_4b.S b/libgcc/config/nds32/isr-library/vec_vid31_4b.S
deleted file mode 100644
index 989645f..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid31_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.31, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_31_4b
-	.type	_nds32_vector_31_4b, @function
-_nds32_vector_31_4b:
-1:
-	j	1b
-	.size	_nds32_vector_31_4b, .-_nds32_vector_31_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid32.S b/libgcc/config/nds32/isr-library/vec_vid32.S
index fc26cad..58234d5 100644
--- a/libgcc/config/nds32/isr-library/vec_vid32.S
+++ b/libgcc/config/nds32/isr-library/vec_vid32.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.32, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_32
 	.type	_nds32_vector_32, @function
 _nds32_vector_32:
diff --git a/libgcc/config/nds32/isr-library/vec_vid32_4b.S b/libgcc/config/nds32/isr-library/vec_vid32_4b.S
deleted file mode 100644
index 1ac7e31..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid32_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.32, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_32_4b
-	.type	_nds32_vector_32_4b, @function
-_nds32_vector_32_4b:
-1:
-	j	1b
-	.size	_nds32_vector_32_4b, .-_nds32_vector_32_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid33.S b/libgcc/config/nds32/isr-library/vec_vid33.S
index dd655e6..d920352 100644
--- a/libgcc/config/nds32/isr-library/vec_vid33.S
+++ b/libgcc/config/nds32/isr-library/vec_vid33.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.33, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_33
 	.type	_nds32_vector_33, @function
 _nds32_vector_33:
diff --git a/libgcc/config/nds32/isr-library/vec_vid33_4b.S b/libgcc/config/nds32/isr-library/vec_vid33_4b.S
deleted file mode 100644
index 3c99412..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid33_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.33, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_33_4b
-	.type	_nds32_vector_33_4b, @function
-_nds32_vector_33_4b:
-1:
-	j	1b
-	.size	_nds32_vector_33_4b, .-_nds32_vector_33_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid34.S b/libgcc/config/nds32/isr-library/vec_vid34.S
index a6b8517..01999b4 100644
--- a/libgcc/config/nds32/isr-library/vec_vid34.S
+++ b/libgcc/config/nds32/isr-library/vec_vid34.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.34, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_34
 	.type	_nds32_vector_34, @function
 _nds32_vector_34:
diff --git a/libgcc/config/nds32/isr-library/vec_vid34_4b.S b/libgcc/config/nds32/isr-library/vec_vid34_4b.S
deleted file mode 100644
index 77c07b9..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid34_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.34, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_34_4b
-	.type	_nds32_vector_34_4b, @function
-_nds32_vector_34_4b:
-1:
-	j	1b
-	.size	_nds32_vector_34_4b, .-_nds32_vector_34_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid35.S b/libgcc/config/nds32/isr-library/vec_vid35.S
index 65ceeab..7ab0536 100644
--- a/libgcc/config/nds32/isr-library/vec_vid35.S
+++ b/libgcc/config/nds32/isr-library/vec_vid35.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.35, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_35
 	.type	_nds32_vector_35, @function
 _nds32_vector_35:
diff --git a/libgcc/config/nds32/isr-library/vec_vid35_4b.S b/libgcc/config/nds32/isr-library/vec_vid35_4b.S
deleted file mode 100644
index 432873a..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid35_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.35, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_35_4b
-	.type	_nds32_vector_35_4b, @function
-_nds32_vector_35_4b:
-1:
-	j	1b
-	.size	_nds32_vector_35_4b, .-_nds32_vector_35_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid36.S b/libgcc/config/nds32/isr-library/vec_vid36.S
index 688dbb9..5da079d 100644
--- a/libgcc/config/nds32/isr-library/vec_vid36.S
+++ b/libgcc/config/nds32/isr-library/vec_vid36.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.36, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_36
 	.type	_nds32_vector_36, @function
 _nds32_vector_36:
diff --git a/libgcc/config/nds32/isr-library/vec_vid36_4b.S b/libgcc/config/nds32/isr-library/vec_vid36_4b.S
deleted file mode 100644
index dadd381..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid36_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.36, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_36_4b
-	.type	_nds32_vector_36_4b, @function
-_nds32_vector_36_4b:
-1:
-	j	1b
-	.size	_nds32_vector_36_4b, .-_nds32_vector_36_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid37.S b/libgcc/config/nds32/isr-library/vec_vid37.S
index 712bbe8..704d6b8 100644
--- a/libgcc/config/nds32/isr-library/vec_vid37.S
+++ b/libgcc/config/nds32/isr-library/vec_vid37.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.37, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_37
 	.type	_nds32_vector_37, @function
 _nds32_vector_37:
diff --git a/libgcc/config/nds32/isr-library/vec_vid37_4b.S b/libgcc/config/nds32/isr-library/vec_vid37_4b.S
deleted file mode 100644
index ec845e1..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid37_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.37, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_37_4b
-	.type	_nds32_vector_37_4b, @function
-_nds32_vector_37_4b:
-1:
-	j	1b
-	.size	_nds32_vector_37_4b, .-_nds32_vector_37_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid38.S b/libgcc/config/nds32/isr-library/vec_vid38.S
index b6e4979..fdfc4a9 100644
--- a/libgcc/config/nds32/isr-library/vec_vid38.S
+++ b/libgcc/config/nds32/isr-library/vec_vid38.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.38, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* 4-byte vector size, the default for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* 16-byte vector size, the default for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_38
 	.type	_nds32_vector_38, @function
 _nds32_vector_38:
diff --git a/libgcc/config/nds32/isr-library/vec_vid38_4b.S b/libgcc/config/nds32/isr-library/vec_vid38_4b.S
deleted file mode 100644
index 84919ed..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid38_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.38, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_38_4b
-	.type	_nds32_vector_38_4b, @function
-_nds32_vector_38_4b:
-1:
-	j	1b
-	.size	_nds32_vector_38_4b, .-_nds32_vector_38_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid39.S b/libgcc/config/nds32/isr-library/vec_vid39.S
index 2dee269..00dd245 100644
--- a/libgcc/config/nds32/isr-library/vec_vid39.S
+++ b/libgcc/config/nds32/isr-library/vec_vid39.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.39, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_39
 	.type	_nds32_vector_39, @function
 _nds32_vector_39:
diff --git a/libgcc/config/nds32/isr-library/vec_vid39_4b.S b/libgcc/config/nds32/isr-library/vec_vid39_4b.S
deleted file mode 100644
index 8f2f634..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid39_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.39, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_39_4b
-	.type	_nds32_vector_39_4b, @function
-_nds32_vector_39_4b:
-1:
-	j	1b
-	.size	_nds32_vector_39_4b, .-_nds32_vector_39_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid40.S b/libgcc/config/nds32/isr-library/vec_vid40.S
index fe7508c..82b579f 100644
--- a/libgcc/config/nds32/isr-library/vec_vid40.S
+++ b/libgcc/config/nds32/isr-library/vec_vid40.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.40, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_40
 	.type	_nds32_vector_40, @function
 _nds32_vector_40:
diff --git a/libgcc/config/nds32/isr-library/vec_vid40_4b.S b/libgcc/config/nds32/isr-library/vec_vid40_4b.S
deleted file mode 100644
index 0aab8f4..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid40_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.40, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_40_4b
-	.type	_nds32_vector_40_4b, @function
-_nds32_vector_40_4b:
-1:
-	j	1b
-	.size	_nds32_vector_40_4b, .-_nds32_vector_40_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid41.S b/libgcc/config/nds32/isr-library/vec_vid41.S
index 711fcd5..721c735 100644
--- a/libgcc/config/nds32/isr-library/vec_vid41.S
+++ b/libgcc/config/nds32/isr-library/vec_vid41.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.41, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_41
 	.type	_nds32_vector_41, @function
 _nds32_vector_41:
diff --git a/libgcc/config/nds32/isr-library/vec_vid41_4b.S b/libgcc/config/nds32/isr-library/vec_vid41_4b.S
deleted file mode 100644
index e8a8527..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid41_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.41, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_41_4b
-	.type	_nds32_vector_41_4b, @function
-_nds32_vector_41_4b:
-1:
-	j	1b
-	.size	_nds32_vector_41_4b, .-_nds32_vector_41_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid42.S b/libgcc/config/nds32/isr-library/vec_vid42.S
index 0c6a849..307b51d 100644
--- a/libgcc/config/nds32/isr-library/vec_vid42.S
+++ b/libgcc/config/nds32/isr-library/vec_vid42.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.42, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_42
 	.type	_nds32_vector_42, @function
 _nds32_vector_42:
diff --git a/libgcc/config/nds32/isr-library/vec_vid42_4b.S b/libgcc/config/nds32/isr-library/vec_vid42_4b.S
deleted file mode 100644
index cfe184c..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid42_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.42, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_42_4b
-	.type	_nds32_vector_42_4b, @function
-_nds32_vector_42_4b:
-1:
-	j	1b
-	.size	_nds32_vector_42_4b, .-_nds32_vector_42_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid43.S b/libgcc/config/nds32/isr-library/vec_vid43.S
index 2b4681a..c0ce02d 100644
--- a/libgcc/config/nds32/isr-library/vec_vid43.S
+++ b/libgcc/config/nds32/isr-library/vec_vid43.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.43, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_43
 	.type	_nds32_vector_43, @function
 _nds32_vector_43:
diff --git a/libgcc/config/nds32/isr-library/vec_vid43_4b.S b/libgcc/config/nds32/isr-library/vec_vid43_4b.S
deleted file mode 100644
index 3edd606..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid43_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.43, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_43_4b
-	.type	_nds32_vector_43_4b, @function
-_nds32_vector_43_4b:
-1:
-	j	1b
-	.size	_nds32_vector_43_4b, .-_nds32_vector_43_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid44.S b/libgcc/config/nds32/isr-library/vec_vid44.S
index 232ef41..c2a384c 100644
--- a/libgcc/config/nds32/isr-library/vec_vid44.S
+++ b/libgcc/config/nds32/isr-library/vec_vid44.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.44, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_44
 	.type	_nds32_vector_44, @function
 _nds32_vector_44:
diff --git a/libgcc/config/nds32/isr-library/vec_vid44_4b.S b/libgcc/config/nds32/isr-library/vec_vid44_4b.S
deleted file mode 100644
index 0f2b8a3..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid44_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.44, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_44_4b
-	.type	_nds32_vector_44_4b, @function
-_nds32_vector_44_4b:
-1:
-	j	1b
-	.size	_nds32_vector_44_4b, .-_nds32_vector_44_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid45.S b/libgcc/config/nds32/isr-library/vec_vid45.S
index e2f9863..e13c52b 100644
--- a/libgcc/config/nds32/isr-library/vec_vid45.S
+++ b/libgcc/config/nds32/isr-library/vec_vid45.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.45, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_45
 	.type	_nds32_vector_45, @function
 _nds32_vector_45:
diff --git a/libgcc/config/nds32/isr-library/vec_vid45_4b.S b/libgcc/config/nds32/isr-library/vec_vid45_4b.S
deleted file mode 100644
index 7358ec1..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid45_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.45, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_45_4b
-	.type	_nds32_vector_45_4b, @function
-_nds32_vector_45_4b:
-1:
-	j	1b
-	.size	_nds32_vector_45_4b, .-_nds32_vector_45_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid46.S b/libgcc/config/nds32/isr-library/vec_vid46.S
index f3b93aa..71bfb53 100644
--- a/libgcc/config/nds32/isr-library/vec_vid46.S
+++ b/libgcc/config/nds32/isr-library/vec_vid46.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.46, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_46
 	.type	_nds32_vector_46, @function
 _nds32_vector_46:
diff --git a/libgcc/config/nds32/isr-library/vec_vid46_4b.S b/libgcc/config/nds32/isr-library/vec_vid46_4b.S
deleted file mode 100644
index 2782e86..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid46_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.46, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_46_4b
-	.type	_nds32_vector_46_4b, @function
-_nds32_vector_46_4b:
-1:
-	j	1b
-	.size	_nds32_vector_46_4b, .-_nds32_vector_46_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid47.S b/libgcc/config/nds32/isr-library/vec_vid47.S
index 130c8d7..d1f2131 100644
--- a/libgcc/config/nds32/isr-library/vec_vid47.S
+++ b/libgcc/config/nds32/isr-library/vec_vid47.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.47, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_47
 	.type	_nds32_vector_47, @function
 _nds32_vector_47:
diff --git a/libgcc/config/nds32/isr-library/vec_vid47_4b.S b/libgcc/config/nds32/isr-library/vec_vid47_4b.S
deleted file mode 100644
index f237577..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid47_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.47, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_47_4b
-	.type	_nds32_vector_47_4b, @function
-_nds32_vector_47_4b:
-1:
-	j	1b
-	.size	_nds32_vector_47_4b, .-_nds32_vector_47_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid48.S b/libgcc/config/nds32/isr-library/vec_vid48.S
index f3bca05..4ba5eb9 100644
--- a/libgcc/config/nds32/isr-library/vec_vid48.S
+++ b/libgcc/config/nds32/isr-library/vec_vid48.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.48, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_48
 	.type	_nds32_vector_48, @function
 _nds32_vector_48:
diff --git a/libgcc/config/nds32/isr-library/vec_vid48_4b.S b/libgcc/config/nds32/isr-library/vec_vid48_4b.S
deleted file mode 100644
index 3e35f68..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid48_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.48, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_48_4b
-	.type	_nds32_vector_48_4b, @function
-_nds32_vector_48_4b:
-1:
-	j	1b
-	.size	_nds32_vector_48_4b, .-_nds32_vector_48_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid49.S b/libgcc/config/nds32/isr-library/vec_vid49.S
index 0b32691..dd3d35e 100644
--- a/libgcc/config/nds32/isr-library/vec_vid49.S
+++ b/libgcc/config/nds32/isr-library/vec_vid49.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.49, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_49
 	.type	_nds32_vector_49, @function
 _nds32_vector_49:
diff --git a/libgcc/config/nds32/isr-library/vec_vid49_4b.S b/libgcc/config/nds32/isr-library/vec_vid49_4b.S
deleted file mode 100644
index a510bbb..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid49_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.49, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_49_4b
-	.type	_nds32_vector_49_4b, @function
-_nds32_vector_49_4b:
-1:
-	j	1b
-	.size	_nds32_vector_49_4b, .-_nds32_vector_49_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid50.S b/libgcc/config/nds32/isr-library/vec_vid50.S
index 48334feb..8f801ec 100644
--- a/libgcc/config/nds32/isr-library/vec_vid50.S
+++ b/libgcc/config/nds32/isr-library/vec_vid50.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.50, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_50
 	.type	_nds32_vector_50, @function
 _nds32_vector_50:
diff --git a/libgcc/config/nds32/isr-library/vec_vid50_4b.S b/libgcc/config/nds32/isr-library/vec_vid50_4b.S
deleted file mode 100644
index 1f42b73..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid50_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.50, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_50_4b
-	.type	_nds32_vector_50_4b, @function
-_nds32_vector_50_4b:
-1:
-	j	1b
-	.size	_nds32_vector_50_4b, .-_nds32_vector_50_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid51.S b/libgcc/config/nds32/isr-library/vec_vid51.S
index 4c27f27..445abf9 100644
--- a/libgcc/config/nds32/isr-library/vec_vid51.S
+++ b/libgcc/config/nds32/isr-library/vec_vid51.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.51, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_51
 	.type	_nds32_vector_51, @function
 _nds32_vector_51:
diff --git a/libgcc/config/nds32/isr-library/vec_vid51_4b.S b/libgcc/config/nds32/isr-library/vec_vid51_4b.S
deleted file mode 100644
index 7bb8abe..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid51_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.51, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_51_4b
-	.type	_nds32_vector_51_4b, @function
-_nds32_vector_51_4b:
-1:
-	j	1b
-	.size	_nds32_vector_51_4b, .-_nds32_vector_51_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid52.S b/libgcc/config/nds32/isr-library/vec_vid52.S
index 4c44811..7283975 100644
--- a/libgcc/config/nds32/isr-library/vec_vid52.S
+++ b/libgcc/config/nds32/isr-library/vec_vid52.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.52, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size defaults to 4 bytes for the v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size defaults to 16 bytes for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_52
 	.type	_nds32_vector_52, @function
 _nds32_vector_52:
diff --git a/libgcc/config/nds32/isr-library/vec_vid52_4b.S b/libgcc/config/nds32/isr-library/vec_vid52_4b.S
deleted file mode 100644
index 4cb89f6..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid52_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.52, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_52_4b
-	.type	_nds32_vector_52_4b, @function
-_nds32_vector_52_4b:
-1:
-	j	1b
-	.size	_nds32_vector_52_4b, .-_nds32_vector_52_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid53.S b/libgcc/config/nds32/isr-library/vec_vid53.S
index 2882583..299c645 100644
--- a/libgcc/config/nds32/isr-library/vec_vid53.S
+++ b/libgcc/config/nds32/isr-library/vec_vid53.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.53, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_53
 	.type	_nds32_vector_53, @function
 _nds32_vector_53:
diff --git a/libgcc/config/nds32/isr-library/vec_vid53_4b.S b/libgcc/config/nds32/isr-library/vec_vid53_4b.S
deleted file mode 100644
index 9abc839..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid53_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.53, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_53_4b
-	.type	_nds32_vector_53_4b, @function
-_nds32_vector_53_4b:
-1:
-	j	1b
-	.size	_nds32_vector_53_4b, .-_nds32_vector_53_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid54.S b/libgcc/config/nds32/isr-library/vec_vid54.S
index a014c72..ae99390 100644
--- a/libgcc/config/nds32/isr-library/vec_vid54.S
+++ b/libgcc/config/nds32/isr-library/vec_vid54.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.54, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_54
 	.type	_nds32_vector_54, @function
 _nds32_vector_54:
diff --git a/libgcc/config/nds32/isr-library/vec_vid54_4b.S b/libgcc/config/nds32/isr-library/vec_vid54_4b.S
deleted file mode 100644
index f736ba8..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid54_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.54, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_54_4b
-	.type	_nds32_vector_54_4b, @function
-_nds32_vector_54_4b:
-1:
-	j	1b
-	.size	_nds32_vector_54_4b, .-_nds32_vector_54_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid55.S b/libgcc/config/nds32/isr-library/vec_vid55.S
index 44d820c..e75d24a 100644
--- a/libgcc/config/nds32/isr-library/vec_vid55.S
+++ b/libgcc/config/nds32/isr-library/vec_vid55.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.55, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_55
 	.type	_nds32_vector_55, @function
 _nds32_vector_55:
diff --git a/libgcc/config/nds32/isr-library/vec_vid55_4b.S b/libgcc/config/nds32/isr-library/vec_vid55_4b.S
deleted file mode 100644
index d09c665..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid55_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.55, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_55_4b
-	.type	_nds32_vector_55_4b, @function
-_nds32_vector_55_4b:
-1:
-	j	1b
-	.size	_nds32_vector_55_4b, .-_nds32_vector_55_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid56.S b/libgcc/config/nds32/isr-library/vec_vid56.S
index d5cb362..cc4904e 100644
--- a/libgcc/config/nds32/isr-library/vec_vid56.S
+++ b/libgcc/config/nds32/isr-library/vec_vid56.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.56, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_56
 	.type	_nds32_vector_56, @function
 _nds32_vector_56:
diff --git a/libgcc/config/nds32/isr-library/vec_vid56_4b.S b/libgcc/config/nds32/isr-library/vec_vid56_4b.S
deleted file mode 100644
index 86b4103..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid56_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.56, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_56_4b
-	.type	_nds32_vector_56_4b, @function
-_nds32_vector_56_4b:
-1:
-	j	1b
-	.size	_nds32_vector_56_4b, .-_nds32_vector_56_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid57.S b/libgcc/config/nds32/isr-library/vec_vid57.S
index 5fb3ce9..a17ed45 100644
--- a/libgcc/config/nds32/isr-library/vec_vid57.S
+++ b/libgcc/config/nds32/isr-library/vec_vid57.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.57, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_57
 	.type	_nds32_vector_57, @function
 _nds32_vector_57:
diff --git a/libgcc/config/nds32/isr-library/vec_vid57_4b.S b/libgcc/config/nds32/isr-library/vec_vid57_4b.S
deleted file mode 100644
index 45c5d29..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid57_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.57, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_57_4b
-	.type	_nds32_vector_57_4b, @function
-_nds32_vector_57_4b:
-1:
-	j	1b
-	.size	_nds32_vector_57_4b, .-_nds32_vector_57_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid58.S b/libgcc/config/nds32/isr-library/vec_vid58.S
index d420d68..629bf1a 100644
--- a/libgcc/config/nds32/isr-library/vec_vid58.S
+++ b/libgcc/config/nds32/isr-library/vec_vid58.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.58, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_58
 	.type	_nds32_vector_58, @function
 _nds32_vector_58:
diff --git a/libgcc/config/nds32/isr-library/vec_vid58_4b.S b/libgcc/config/nds32/isr-library/vec_vid58_4b.S
deleted file mode 100644
index 812470c..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid58_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.58, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_58_4b
-	.type	_nds32_vector_58_4b, @function
-_nds32_vector_58_4b:
-1:
-	j	1b
-	.size	_nds32_vector_58_4b, .-_nds32_vector_58_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid59.S b/libgcc/config/nds32/isr-library/vec_vid59.S
index 78a1885..540e02e 100644
--- a/libgcc/config/nds32/isr-library/vec_vid59.S
+++ b/libgcc/config/nds32/isr-library/vec_vid59.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.59, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_59
 	.type	_nds32_vector_59, @function
 _nds32_vector_59:
diff --git a/libgcc/config/nds32/isr-library/vec_vid59_4b.S b/libgcc/config/nds32/isr-library/vec_vid59_4b.S
deleted file mode 100644
index fa3a467..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid59_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.59, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_59_4b
-	.type	_nds32_vector_59_4b, @function
-_nds32_vector_59_4b:
-1:
-	j	1b
-	.size	_nds32_vector_59_4b, .-_nds32_vector_59_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid60.S b/libgcc/config/nds32/isr-library/vec_vid60.S
index a6f704d..8658249 100644
--- a/libgcc/config/nds32/isr-library/vec_vid60.S
+++ b/libgcc/config/nds32/isr-library/vec_vid60.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.60, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_60
 	.type	_nds32_vector_60, @function
 _nds32_vector_60:
diff --git a/libgcc/config/nds32/isr-library/vec_vid60_4b.S b/libgcc/config/nds32/isr-library/vec_vid60_4b.S
deleted file mode 100644
index 505da2a..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid60_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.60, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_60_4b
-	.type	_nds32_vector_60_4b, @function
-_nds32_vector_60_4b:
-1:
-	j	1b
-	.size	_nds32_vector_60_4b, .-_nds32_vector_60_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid61.S b/libgcc/config/nds32/isr-library/vec_vid61.S
index 4e79bde..376acb9 100644
--- a/libgcc/config/nds32/isr-library/vec_vid61.S
+++ b/libgcc/config/nds32/isr-library/vec_vid61.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.61, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_61
 	.type	_nds32_vector_61, @function
 _nds32_vector_61:
diff --git a/libgcc/config/nds32/isr-library/vec_vid61_4b.S b/libgcc/config/nds32/isr-library/vec_vid61_4b.S
deleted file mode 100644
index 9a0cce5..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid61_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.61, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_61_4b
-	.type	_nds32_vector_61_4b, @function
-_nds32_vector_61_4b:
-1:
-	j	1b
-	.size	_nds32_vector_61_4b, .-_nds32_vector_61_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid62.S b/libgcc/config/nds32/isr-library/vec_vid62.S
index 5eef0a6..5ab06a8 100644
--- a/libgcc/config/nds32/isr-library/vec_vid62.S
+++ b/libgcc/config/nds32/isr-library/vec_vid62.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.62, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_62
 	.type	_nds32_vector_62, @function
 _nds32_vector_62:
diff --git a/libgcc/config/nds32/isr-library/vec_vid62_4b.S b/libgcc/config/nds32/isr-library/vec_vid62_4b.S
deleted file mode 100644
index da8ba28..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid62_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.62, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_62_4b
-	.type	_nds32_vector_62_4b, @function
-_nds32_vector_62_4b:
-1:
-	j	1b
-	.size	_nds32_vector_62_4b, .-_nds32_vector_62_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid63.S b/libgcc/config/nds32/isr-library/vec_vid63.S
index 0a8c0ad..6646bcc 100644
--- a/libgcc/config/nds32/isr-library/vec_vid63.S
+++ b/libgcc/config/nds32/isr-library/vec_vid63.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.63, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_63
 	.type	_nds32_vector_63, @function
 _nds32_vector_63:
diff --git a/libgcc/config/nds32/isr-library/vec_vid63_4b.S b/libgcc/config/nds32/isr-library/vec_vid63_4b.S
deleted file mode 100644
index 8f1045e..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid63_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.63, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_63_4b
-	.type	_nds32_vector_63_4b, @function
-_nds32_vector_63_4b:
-1:
-	j	1b
-	.size	_nds32_vector_63_4b, .-_nds32_vector_63_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid64.S b/libgcc/config/nds32/isr-library/vec_vid64.S
index b3f034b..f892aec 100644
--- a/libgcc/config/nds32/isr-library/vec_vid64.S
+++ b/libgcc/config/nds32/isr-library/vec_vid64.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.64, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_64
 	.type	_nds32_vector_64, @function
 _nds32_vector_64:
diff --git a/libgcc/config/nds32/isr-library/vec_vid64_4b.S b/libgcc/config/nds32/isr-library/vec_vid64_4b.S
deleted file mode 100644
index 81d9679..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid64_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.64, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_64_4b
-	.type	_nds32_vector_64_4b, @function
-_nds32_vector_64_4b:
-1:
-	j	1b
-	.size	_nds32_vector_64_4b, .-_nds32_vector_64_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid65.S b/libgcc/config/nds32/isr-library/vec_vid65.S
index 72db454..03f79a5 100644
--- a/libgcc/config/nds32/isr-library/vec_vid65.S
+++ b/libgcc/config/nds32/isr-library/vec_vid65.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.65, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_65
 	.type	_nds32_vector_65, @function
 _nds32_vector_65:
diff --git a/libgcc/config/nds32/isr-library/vec_vid65_4b.S b/libgcc/config/nds32/isr-library/vec_vid65_4b.S
deleted file mode 100644
index aa9ad2b..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid65_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.65, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_65_4b
-	.type	_nds32_vector_65_4b, @function
-_nds32_vector_65_4b:
-1:
-	j	1b
-	.size	_nds32_vector_65_4b, .-_nds32_vector_65_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid66.S b/libgcc/config/nds32/isr-library/vec_vid66.S
index 75469e7..ff805bd 100644
--- a/libgcc/config/nds32/isr-library/vec_vid66.S
+++ b/libgcc/config/nds32/isr-library/vec_vid66.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.66, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_66
 	.type	_nds32_vector_66, @function
 _nds32_vector_66:
diff --git a/libgcc/config/nds32/isr-library/vec_vid66_4b.S b/libgcc/config/nds32/isr-library/vec_vid66_4b.S
deleted file mode 100644
index 9830fe2..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid66_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.66, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_66_4b
-	.type	_nds32_vector_66_4b, @function
-_nds32_vector_66_4b:
-1:
-	j	1b
-	.size	_nds32_vector_66_4b, .-_nds32_vector_66_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid67.S b/libgcc/config/nds32/isr-library/vec_vid67.S
index 4b076cd..f592aba 100644
--- a/libgcc/config/nds32/isr-library/vec_vid67.S
+++ b/libgcc/config/nds32/isr-library/vec_vid67.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.67, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_67
 	.type	_nds32_vector_67, @function
 _nds32_vector_67:
diff --git a/libgcc/config/nds32/isr-library/vec_vid67_4b.S b/libgcc/config/nds32/isr-library/vec_vid67_4b.S
deleted file mode 100644
index c7e31dd..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid67_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.67, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_67_4b
-	.type	_nds32_vector_67_4b, @function
-_nds32_vector_67_4b:
-1:
-	j	1b
-	.size	_nds32_vector_67_4b, .-_nds32_vector_67_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid68.S b/libgcc/config/nds32/isr-library/vec_vid68.S
index 7df1cdd..ee2702a 100644
--- a/libgcc/config/nds32/isr-library/vec_vid68.S
+++ b/libgcc/config/nds32/isr-library/vec_vid68.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.68, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_68
 	.type	_nds32_vector_68, @function
 _nds32_vector_68:
diff --git a/libgcc/config/nds32/isr-library/vec_vid68_4b.S b/libgcc/config/nds32/isr-library/vec_vid68_4b.S
deleted file mode 100644
index 0d6fcb5..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid68_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.68, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_68_4b
-	.type	_nds32_vector_68_4b, @function
-_nds32_vector_68_4b:
-1:
-	j	1b
-	.size	_nds32_vector_68_4b, .-_nds32_vector_68_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid69.S b/libgcc/config/nds32/isr-library/vec_vid69.S
index e30e5bf..c152015 100644
--- a/libgcc/config/nds32/isr-library/vec_vid69.S
+++ b/libgcc/config/nds32/isr-library/vec_vid69.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.69, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_69
 	.type	_nds32_vector_69, @function
 _nds32_vector_69:
diff --git a/libgcc/config/nds32/isr-library/vec_vid69_4b.S b/libgcc/config/nds32/isr-library/vec_vid69_4b.S
deleted file mode 100644
index 3508162..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid69_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.69, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_69_4b
-	.type	_nds32_vector_69_4b, @function
-_nds32_vector_69_4b:
-1:
-	j	1b
-	.size	_nds32_vector_69_4b, .-_nds32_vector_69_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid70.S b/libgcc/config/nds32/isr-library/vec_vid70.S
index d436ac5..a3578d6 100644
--- a/libgcc/config/nds32/isr-library/vec_vid70.S
+++ b/libgcc/config/nds32/isr-library/vec_vid70.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.70, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_70
 	.type	_nds32_vector_70, @function
 _nds32_vector_70:
diff --git a/libgcc/config/nds32/isr-library/vec_vid70_4b.S b/libgcc/config/nds32/isr-library/vec_vid70_4b.S
deleted file mode 100644
index f3f0dd6..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid70_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.70, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_70_4b
-	.type	_nds32_vector_70_4b, @function
-_nds32_vector_70_4b:
-1:
-	j	1b
-	.size	_nds32_vector_70_4b, .-_nds32_vector_70_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid71.S b/libgcc/config/nds32/isr-library/vec_vid71.S
index d7d7ab3..6790888 100644
--- a/libgcc/config/nds32/isr-library/vec_vid71.S
+++ b/libgcc/config/nds32/isr-library/vec_vid71.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.71, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_71
 	.type	_nds32_vector_71, @function
 _nds32_vector_71:
diff --git a/libgcc/config/nds32/isr-library/vec_vid71_4b.S b/libgcc/config/nds32/isr-library/vec_vid71_4b.S
deleted file mode 100644
index 505c79e..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid71_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.71, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_71_4b
-	.type	_nds32_vector_71_4b, @function
-_nds32_vector_71_4b:
-1:
-	j	1b
-	.size	_nds32_vector_71_4b, .-_nds32_vector_71_4b
diff --git a/libgcc/config/nds32/isr-library/vec_vid72.S b/libgcc/config/nds32/isr-library/vec_vid72.S
index 08652d2..32984a0 100644
--- a/libgcc/config/nds32/isr-library/vec_vid72.S
+++ b/libgcc/config/nds32/isr-library/vec_vid72.S
@@ -24,8 +24,15 @@
    <http://www.gnu.org/licenses/>.  */

 	.section	.nds32_vector.72, "ax"
+#if __NDS32_ISR_VECTOR_SIZE_4__
+	/* The vector size is default 4-byte for v3 architecture.  */
+	.vec_size	4
+	.align	2
+#else
+	/* The vector size is default 16-byte for other architectures.  */
 	.vec_size	16
 	.align	4
+#endif
 	.weak	_nds32_vector_72
 	.type	_nds32_vector_72, @function
 _nds32_vector_72:
diff --git a/libgcc/config/nds32/isr-library/vec_vid72_4b.S b/libgcc/config/nds32/isr-library/vec_vid72_4b.S
deleted file mode 100644
index 1083c03..0000000
--- a/libgcc/config/nds32/isr-library/vec_vid72_4b.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* c-isr library stuff of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.nds32_vector.72, "ax"
-	.vec_size	4
-	.align	2
-	.weak	_nds32_vector_72_4b
-	.type	_nds32_vector_72_4b, @function
-_nds32_vector_72_4b:
-1:
-	j	1b
-	.size	_nds32_vector_72_4b, .-_nds32_vector_72_4b
diff --git a/libgcc/config/nds32/lib1asmsrc-mculib.S b/libgcc/config/nds32/lib1asmsrc-mculib.S
deleted file mode 100644
index bdbcd74..0000000
--- a/libgcc/config/nds32/lib1asmsrc-mculib.S
+++ /dev/null
@@ -1,5213 +0,0 @@
-/* mculib libgcc routines of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-	.section	.mdebug.abi_nds32
-	.previous
-
-
-/* ------------------------------------------- */
-/* FPBIT floating point operations for libgcc  */
-/* ------------------------------------------- */
-
-#ifdef L_addsub_sf
-
-	.text
-	.align	2
-	.global	__subsf3
-	.type	__subsf3, @function
-__subsf3:
-	push    $lp
-	pushm   $r6, $r9
-
-	move    $r2, #0x80000000
-	xor     $r1, $r1, $r2
-
-	j       .Lsfpadd
-
-	.global	__addsf3
-	.type	__addsf3, @function
-__addsf3:
-	push    $lp
-	pushm   $r6, $r9
-.Lsfpadd:
-	srli    $r5, $r0, #23
-	andi    $r5, $r5, #0xff
-	srli    $r7, $r1, #23
-	andi    $r7, $r7, #0xff
-	move    $r3, #0x80000000
-	slli    $r4, $r0, #8
-	or      $r4, $r4, $r3
-	slli    $r6, $r1, #8
-	or      $r6, $r6, $r3
-
-	addi    $r9, $r5, #-1
-	slti    $r15, $r9, #0xfe
-	beqzs8  .LEspecA
-
-.LElab1:
-	addi    $r9, $r7, #-1
-	slti    $r15, $r9, #0xfe
-	beqzs8  .LEspecB
-
-.LElab2:
-	sub     $r8, $r5, $r7
-	sltsi   $r15, $r8, #0
-	bnezs8  .Li1
-	sltsi   $r15, $r8, #0x20
-	bnezs8  .Li2
-	move    $r6, #2
-	j       .Le1
-.Li2:
-	move    $r2, $r6
-	srl     $r6, $r6, $r8
-	sll     $r9, $r6, $r8
-	beq     $r9, $r2, .Le1
-	ori     $r6, $r6, #2
-	j       .Le1
-.Li1:
-	move    $r5, $r7
-	subri   $r8, $r8, #0
-	sltsi   $r15, $r8, #0x20
-	bnezs8  .Li4
-	move    $r4, #2
-	j       .Le1
-.Li4:
-	move    $r2, $r4
-	srl     $r4, $r4, $r8
-	sll     $r9, $r4, $r8
-	beq     $r9, $r2, .Le1
-	ori     $r4, $r4, #2
-
-.Le1:
-	and     $r8, $r0, $r3
-	xor     $r9, $r8, $r1
-	sltsi   $r15, $r9, #0
-	bnezs8  .LEsub1
-
-	#ADD($r4, $r6)
-	add     $r4, $r4, $r6
-	slt     $r15, $r4, $r6
-	beqzs8  .LEres
-	andi    $r9, $r4, #1
-	beqz    $r9, .Li7
-	ori     $r4, $r4, #2
-.Li7:
-	srli    $r4, $r4, #1
-	addi    $r5, $r5, #1
-	subri   $r15, $r5, #0xff
-	bnezs8  .LEres
-	move    $r4, #0
-	j       .LEres
-
-.LEsub1:
-	#SUB($r4, $r6)
-	move    $r15, $r4
-	sub     $r4, $r4, $r6
-	slt     $r15, $r15, $r4
-	beqzs8  .Li9
-	subri   $r4, $r4, #0
-	xor     $r8, $r8, $r3
-	j       .Le9
-.Li9:
-	beqz    $r4, .LEzer
-.Le9:
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, $r4
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-	sub     $r5, $r5, $r2
-	sll     $r4, $r4, $r2
-
-.LEres:
-	blez    $r5, .LEund
-
-.LElab12:
-	#ADD($r4, $0x80)
-	move    $r15, #0x80
-	add     $r4, $r4, $r15
-	slt     $r15, $r4, $r15
-
-	#ADDC($r5, $0x0)
-	add     $r5, $r5, $r15
-	srli    $r9, $r4, #8
-	andi    $r9, $r9, #1
-	sub     $r4, $r4, $r9
-	slli    $r4, $r4, #1
-	srli    $r4, $r4, #9
-	slli    $r9, $r5, #23
-	or      $r4, $r4, $r9
-	or      $r0, $r4, $r8
-
-.LE999:
-	popm    $r6, $r9
-	pop     $lp
-	ret5    $lp
-
-.LEund:
-	subri   $r2, $r5, #1
-	slti    $r15, $r2, #0x20
-	beqzs8  .LEzer
-	move    $r9, #0x80000000
-	or      $r4, $r4, $r9
-	subri   $r9, $r2, #0x20
-	sll     $r5, $r4, $r9
-	srl     $r4, $r4, $r2
-	beqz    $r5, .Li10
-	ori     $r4, $r4, #1
-.Li10:
-	move    $r5, #0
-	addi    $r9, $r4, #0x80
-	sltsi   $r15, $r9, #0
-	beqzs8  .LElab12
-	move    $r5, #1
-	j       .LElab12
-
-.LEspecA:
-	bnez    $r5, .Li12
-	add     $r4, $r4, $r4
-	beqz    $r4, .Li13
-#ifdef __NDS32_PERF_EXT__
-	clz	$r8, $r4
-#else
-	pushm	$r0, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r8, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r5, $r5, $r8
-	sll     $r4, $r4, $r8
-	j       .LElab1
-.Li13:
-	subri   $r15, $r7, #0xff
-	beqzs8  .LEspecB
-	move    $r9, #0x80000000
-	bne     $r1, $r9, .LEretB
-.Li12:
-	add     $r9, $r4, $r4
-	bnez    $r9, .LEnan
-	subri   $r15, $r7, #0xff
-	bnezs8  .LEretA
-	xor     $r9, $r0, $r1
-	sltsi   $r15, $r9, #0
-	bnezs8  .LEnan
-	j       .LEretB
-
-.LEspecB:
-	bnez    $r7, .Li15
-	add     $r6, $r6, $r6
-	beqz    $r6, .LEretA
-#ifdef __NDS32_PERF_EXT__
-	clz	$r8, $r6
-#else
-	pushm	$r0, $r5
-	move	$r0, $r6
-	bal	__clzsi2
-	move	$r8, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r7, $r7, $r8
-	sll     $r6, $r6, $r8
-	j       .LElab2
-.Li15:
-	add     $r9, $r6, $r6
-	bnez    $r9, .LEnan
-
-.LEretB:
-	move    $r0, $r1
-	j       .LE999
-
-.LEretA:
-	j       .LE999
-
-.LEzer:
-	move    $r0, #0
-	j       .LE999
-
-.LEnan:
-	move    $r0, #0xffc00000
-	j       .LE999
-	.size	__subsf3, .-__subsf3
-	.size	__addsf3, .-__addsf3
-#endif /* L_addsub_sf */
-
-
-
-#ifdef L_sf_to_si
-
-	.text
-	.align	2
-	.global	__fixsfsi
-	.type	__fixsfsi, @function
-__fixsfsi:
-	push    $lp
-
-	slli    $r1, $r0, #8
-	move    $r3, #0x80000000
-	or      $r1, $r1, $r3
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	subri   $r2, $r3, #0x9e
-	blez    $r2, .LJspec
-	sltsi   $r15, $r2, #0x20
-	bnezs8  .Li42
-	move    $r0, #0
-	j       .LJ999
-.Li42:
-	srl     $r1, $r1, $r2
-	sltsi   $r15, $r0, #0
-	beqzs8  .Li43
-	subri   $r1, $r1, #0
-.Li43:
-	move    $r0, $r1
-
-.LJ999:
-	pop     $lp
-	ret5    $lp
-
-.LJspec:
-	move    $r3, #0x7f800000
-	slt     $r15, $r3, $r0
-	beqzs8  .Li44
-	move    $r0, #0x80000000
-	j       .LJ999
-.Li44:
-	move    $r0, #0x7fffffff
-	j       .LJ999
-	.size	__fixsfsi, .-__fixsfsi
-#endif /* L_sf_to_si */
-
-
-
-#ifdef L_divsi3
-
-	.text
-	.align	2
-	.globl	__divsi3
-	.type	__divsi3, @function
-__divsi3:
-	! ---------------------------------------------------------------------
-	! neg = 0;
-	! if (a < 0)
-	! {   a = -a;
-	!     neg = !neg;
-	! }
-	! ---------------------------------------------------------------------
-	sltsi	$r5, $r0, 0			! $r5  <- neg = (a < 0) ? 1 : 0
-	subri	$r4, $r0, 0			! $r4  <- a = -a
-	cmovn	$r0, $r4, $r5			! $r0  <- a = neg ? -a : a
-.L2:
-	! ---------------------------------------------------------------------
-	! if (b < 0)
-	! ---------------------------------------------------------------------
-	bgez	$r1, .L3			! if b >= 0, skip
-	! ---------------------------------------------------------------------
-	! {   b=-b;
-	!     neg=!neg;
-	! }
-	! ---------------------------------------------------------------------
-	subri	$r1, $r1, 0			! $r1  <- b = -b
-	subri	$r5, $r5, 1			! $r5  <- neg = !neg
-.L3:
-	! ---------------------------------------------------------------------
-	!!res = udivmodsi4 (a, b, 1);
-	! res = 0;
-	! if (den != 0)
-	! ---------------------------------------------------------------------
-	movi	$r2, 0				! $r2  <- res = 0
-	beqz	$r1, .L1			! if den == 0, skip
-	! ---------------------------------------------------------------------
-	! bit = 1;
-	! ---------------------------------------------------------------------
-	movi	$r4, 1				! $r4  <- bit = 1
-#ifndef __OPTIMIZE_SIZE__
-.L6:
-#endif
-	! ---------------------------------------------------------------------
-	! while (den < num && bit && !(den & (1L << 31)))
-	! ---------------------------------------------------------------------
-	slt	$ta, $r1, $r0			! $ta  <- den < num ?
-	beqz	$ta, .L5			! if no, skip
-	! ---------------------------------------------------------------------
-	! {   den << = 1;
-	!     bit << = 1;
-	! }
-	! ---------------------------------------------------------------------
-#if defined (__OPTIMIZE_SIZE__) && !defined (__NDS32_ISA_V3M__)
-	clz	$r3, $r1			! $r3  <- leading zero count for den
-	clz	$ta, $r0			! $ta  <- leading zero count for num
-	sub	$r3, $r3, $ta			! $r3  <- number of bits to shift
-	sll	$r1, $r1, $r3			! $r1  <- den
-	sll	$r4, $r4, $r3			! $r2  <- bit
-#else
-	slli	$r1, $r1, 1			! $r1  <- den << = 1
-	slli	$r4, $r4, 1			! $r4  <- bit << = 1
-	b	.L6				! continue loop
-#endif
-.L5:
-	! ---------------------------------------------------------------------
-	! while (bit)
-	! {   if (num >= den)
-	! ---------------------------------------------------------------------
-	slt	$ta, $r0, $r1			! $ta  <- num < den ?
-	bnez	$ta, .L9			! if yes, skip
-	! ---------------------------------------------------------------------
-	!     {   num -= den;
-	!         res |= bit;
-	!     }
-	! ---------------------------------------------------------------------
-	sub	$r0, $r0, $r1			! $r0  <- num -= den
-	or	$r2, $r2, $r4			! $r2  <- res |= bit
-.L9:
-	! ---------------------------------------------------------------------
-	!     bit >> = 1;
-	!     den >> = 1;
-	! }
-	!!if (modwanted)
-	!!    return num;
-	!!return res;
-	! ---------------------------------------------------------------------
-	srli	$r4, $r4, 1			! $r4  <- bit >> = 1
-	srli	$r1, $r1, 1			! $r1  <- den >> = 1
-	bnez	$r4, .L5			! if bit != 0, continue loop
-.L1:
-	! ---------------------------------------------------------------------
-	! if (neg)
-	!     res = -res;
-	! return res;
-	! ---------------------------------------------------------------------
-	subri	$r0, $r2, 0			! $r0  <- -res
-	cmovz	$r0, $r2, $r5			! $r0  <- neg ? -res : res
-	! ---------------------------------------------------------------------
-	ret
-	.size	__divsi3, .-__divsi3
-#endif /* L_divsi3 */
-
-
-
-#ifdef L_divdi3
-
-	!--------------------------------------
-	#ifdef __big_endian__
-		#define  V1H  $r0
-		#define  V1L  $r1
-		#define  V2H  $r2
-		#define  V2L  $r3
-	#else
-		#define  V1H  $r1
-		#define  V1L  $r0
-		#define  V2H  $r3
-		#define  V2L  $r2
-	#endif
-	!--------------------------------------
-	.text
-	.align	2
-	.globl	__divdi3
-	.type	__divdi3, @function
-__divdi3:
-	! prologue
-#ifdef __NDS32_ISA_V3M__
-	push25	$r10, 0
-#else
-	smw.adm	$r6, [$sp], $r10, 2
-#endif
-	! end of prologue
-	move	$r8, V1L
-	move	$r9, V1H
-	move	$r6, V2L
-	move	$r7, V2H
-	movi	$r10, 0
-	bgez	V1H, .L80
-	bal	__negdi2
-	move	$r8, V1L
-	move	$r9, V1H
-	movi	$r10, -1
-.L80:
-	bgez	$r7, .L81
-	move	V1L, $r6
-	move	V1H, $r7
-	bal	__negdi2
-	move	$r6, V1L
-	move	$r7, V1H
-	nor	$r10, $r10, $r10
-.L81:
-	move	V2L, $r6
-	move	V2H, $r7
-	move	V1L, $r8
-	move	V1H, $r9
-	movi	$r4, 0
-	bal	__udivmoddi4
-	beqz	$r10, .L82
-	bal	__negdi2
-.L82:
-	! epilogue
-#ifdef __NDS32_ISA_V3M__
-	pop25	$r10, 0
-#else
-	lmw.bim	$r6, [$sp], $r10, 2
-	ret
-#endif
-	.size	__divdi3, .-__divdi3
-#endif /* L_divdi3 */
-
-
-
-#ifdef L_modsi3
-
-	.text
-	.align	2
-	.globl	__modsi3
-	.type	__modsi3, @function
-__modsi3:
-	! ---------------------------------------------------------------------
-	! neg=0;
-	! if (a<0)
-	! {   a=-a;
-	!     neg=1;
-	! }
-	! ---------------------------------------------------------------------
-	sltsi	$r5, $r0, 0			! $r5  <- neg < 0 ? 1 : 0
-	subri	$r4, $r0, 0			! $r4  <- -a
-	cmovn	$r0, $r4, $r5			! $r0  <- |a|
-	! ---------------------------------------------------------------------
-	! if (b < 0)
-#ifndef __NDS32_PERF_EXT__
-	! ---------------------------------------------------------------------
-	bgez	$r1, .L3			! if b >= 0, skip
-	! ---------------------------------------------------------------------
-	!     b = -b;
-	! ---------------------------------------------------------------------
-	subri	$r1, $r1, 0			! $r1  <- |b|
-.L3:
-	! ---------------------------------------------------------------------
-	!!res = udivmodsi4 (a, b, 1);
-	! if (den != 0)
-	! ---------------------------------------------------------------------
-#else /* __NDS32_PERF_EXT__ */
-	!     b = -b;
-	!!res = udivmodsi4 (a, b, 1);
-	! if (den != 0)
-	! ---------------------------------------------------------------------
-	abs	$r1, $r1			! $r1  <- |b|
-#endif /* __NDS32_PERF_EXT__ */
-	beqz	$r1, .L1			! if den == 0, skip
-	! ---------------------------------------------------------------------
-	! {   bit = 1;
-	!     res = 0;
-	! ---------------------------------------------------------------------
-	movi	$r4, 1				! $r4  <- bit = 1
-#ifndef __OPTIMIZE_SIZE__
-.L6:
-#endif
-	! ---------------------------------------------------------------------
-	!     while (den < num&&bit && !(den & (1L << 31)))
-	! ---------------------------------------------------------------------
-	slt	$ta, $r1, $r0			! $ta  <- den < num ?
-	beqz	$ta, .L5			! if no, skip
-	! ---------------------------------------------------------------------
-	!     {   den << = 1;
-	!         bit << = 1;
-	!     }
-	! ---------------------------------------------------------------------
-#if defined (__OPTIMIZE_SIZE__) && ! defined (__NDS32_ISA_V3M__)
-	clz	$r3, $r1			! $r3  <- leading zero count for den
-	clz	$ta, $r0			! $ta  <- leading zero count for num
-	sub	$r3, $r3, $ta			! $r3  <- number of bits to shift
-	sll	$r1, $r1, $r3			! $r1  <- den
-	sll	$r4, $r4, $r3			! $r2  <- bit
-#else
-	slli	$r1, $r1, 1			! $r1  <- den << = 1
-	slli	$r4, $r4, 1			! $r4  <- bit << = 1
-	b	.L6				! continue loop
-#endif
-.L5:
-	! ---------------------------------------------------------------------
-	!     while (bit)
-	!     {   if (num >= den)
-	!         {   num -= den;
-	!             res |= bit;
-	!         }
-	!         bit >> = 1;
-	!         den >> = 1;
-	!     }
-	! }
-	!!if (modwanted)
-	!!    return num;
-	!!return res;
-	! ---------------------------------------------------------------------
-	sub	$r2, $r0, $r1			! $r2  <- num - den
-	slt	$ta, $r0, $r1			! $ta  <- num < den ?
-	srli	$r4, $r4, 1			! $r4  <- bit >> = 1
-	cmovz	$r0, $r2, $ta			! $r0  <- num = (num < den) ? num : num - den
-	srli	$r1, $r1, 1			! $r1  <- den >> = 1
-	bnez	$r4, .L5			! if bit != 0, continue loop
-.L1:
-	! ---------------------------------------------------------------------
-	! if (neg)
-	!     res = -res;
-	! return res;
-	! ---------------------------------------------------------------------
-	subri	$r3, $r0, 0			! $r3  <- -res
-	cmovn	$r0, $r3, $r5			! $r0  <- neg ? -res : res
-	! ---------------------------------------------------------------------
-	ret
-	.size	__modsi3, .-__modsi3
-#endif /* L_modsi3 */
-
-
-
-#ifdef L_moddi3
-
-	!--------------------------------------
-	#ifdef __big_endian__
-		#define  V1H  $r0
-		#define  V1L  $r1
-		#define  V2H  $r2
-		#define  V2L  $r3
-	#else
-		#define  V1H  $r1
-		#define  V1L  $r0
-		#define  V2H  $r3
-		#define  V2L  $r2
-	#endif
-	!--------------------------------------
-	.text
-	.align	2
-	.globl	__moddi3
-	.type	__moddi3, @function
-__moddi3:
-	! =====================================================================
-	! stack allocation:
-	! sp+32 +-----------------------+
-	!       | $lp                   |
-	! sp+28 +-----------------------+
-	!       | $r6 - $r10            |
-	! sp+8  +-----------------------+
-	!       |                       |
-	! sp+4  +-----------------------+
-	!       |                       |
-	! sp    +-----------------------+
-	! =====================================================================
-	! prologue
-#ifdef __NDS32_ISA_V3M__
-	push25	$r10, 8
-#else
-	smw.adm	$r6, [$sp], $r10, 2
-	addi	$sp, $sp, -8
-#endif
-	! end of prologue
-	!------------------------------------------
-	! 	__moddi3 (DWtype u, DWtype v)
-	!		{
-	!			word_type c = 0;
-	!			DWunion uu = {.ll = u};
-	!			DWunion vv = {.ll = v};
-	!			DWtype w;
-	!		if (uu.s.high < 0)
-	!  		  c = ~c,
-	!		  uu.ll = -uu.ll;
-	!---------------------------------------------
-	move	$r8, V1L
-	move	$r9, V1H
-	move	$r6, V2L
-	move	$r7, V2H
-	movi	$r10, 0        ! r10 = c = 0
-	bgez	V1H, .L80      ! if u > 0 , go L80
-	bal	__negdi2
-	move	$r8, V1L
-	move	$r9, V1H
-	movi	$r10, -1       ! r10 = c = ~c
-	!------------------------------------------------
-	!	 	if (vv.s.high < 0)
-	!		  vv.ll = -vv.ll;
-	!----------------------------------------------
-.L80:
-	bgez	$r7, .L81     !  if v > 0 , go L81
-	move	V1L, $r6
-	move	V1H, $r7
-	bal	__negdi2
-	move	$r6, V1L
-	move	$r7, V1H
-	!------------------------------------------
-	!		(void) __udivmoddi4 (uu.ll, vv.ll, &w);
-	!		if (c)
-	!		  w = -w;
-	!		return w;
-	!-----------------------------------------
-.L81:
-	move	V2L, $r6
-	move	V2H, $r7
-	move	V1L, $r8
-	move	V1H, $r9
-	addi	$r4, $sp, 0
-	bal	__udivmoddi4
-	lwi	$r0, [$sp+(0)]    ! le: sp + 0 is low, be: sp + 0 is high
-	lwi	$r1, [$sp+(4)]    ! le: sp + 4 is low, be: sp + 4 is high
-	beqz	$r10, .L82
-	bal	__negdi2
-.L82:
-	! epilogue
-#ifdef __NDS32_ISA_V3M__
-	pop25	$r10, 8
-#else
-	addi	$sp, $sp, 8
-	lmw.bim	$r6, [$sp], $r10, 2
-	ret
-#endif
-	.size	__moddi3, .-__moddi3
-#endif /* L_moddi3 */
-
-
-
-#ifdef L_mulsi3
-
-	.text
-	.align	2
-	.globl	__mulsi3
-	.type	__mulsi3, @function
-__mulsi3:
-	! ---------------------------------------------------------------------
-	! r = 0;
-	! while (a)
-	! $r0:       r
-	! $r1:       b
-	! $r2:       a
-	! ---------------------------------------------------------------------
-	beqz	$r0, .L7			! if a == 0, done
-	move	$r2, $r0			! $r2  <- a
-	movi	$r0, 0				! $r0  <- r <- 0
-.L8:
-	! ---------------------------------------------------------------------
-	! {   if (a & 1)
-	!         r += b;
-	!     a >> = 1;
-	!     b << = 1;
-	! }
-	! $r0:       r
-	! $r1:       b
-	! $r2:       a
-	! $r3:       scratch
-	! $r4:       scratch
-	! ---------------------------------------------------------------------
-	andi	$r3, $r2, 1			! $r3  <- a & 1
-	add	$r4, $r0, $r1			! $r4  <- r += b
-	cmovn	$r0, $r4, $r3			! $r0  <- r
-	srli	$r2, $r2, 1			! $r2  <- a >> = 1
-	slli	$r1, $r1, 1			! $r1  <- b << = 1
-	bnez	$r2, .L8			! if a != 0, continue loop
-.L7:
-	! ---------------------------------------------------------------------
-	! $r0:       return code
-	! ---------------------------------------------------------------------
-	ret
-	.size	__mulsi3, .-__mulsi3
-#endif /* L_mulsi3 */
-
-
-
-#ifdef L_udivsi3
-
-	.text
-	.align	2
-	.globl	__udivsi3
-	.type	__udivsi3, @function
-__udivsi3:
-	! ---------------------------------------------------------------------
-	!!res=udivmodsi4(a,b,0);
-	! res=0;
-	! if (den!=0)
-	! ---------------------------------------------------------------------
-	movi	$r2, 0				! $r2  <- res=0
-	beqz	$r1, .L1			! if den==0, skip
-	! ---------------------------------------------------------------------
-	! {   bit=1;
-	! ---------------------------------------------------------------------
-	movi	$r4, 1				! $r4  <- bit=1
-#ifndef __OPTIMIZE_SIZE__
-.L6:
-#endif
-	! ---------------------------------------------------------------------
-	!     while (den<num
-	! ---------------------------------------------------------------------
-	slt	$ta, $r1, $r0			! $ta  <- den<num?
-	beqz	$ta, .L5			! if no, skip
-	! ---------------------------------------------------------------------
-	!          &&bit&&!(den&(1L<<31)))
-	! ---------------------------------------------------------------------
-	bltz	$r1, .L5			! if den<0, skip
-	! ---------------------------------------------------------------------
-	!     {   den<<=1;
-	!         bit<<=1;
-	!     }
-	! ---------------------------------------------------------------------
-#if defined (__OPTIMIZE_SIZE__) && ! defined (__NDS32_ISA_V3M__)
-	clz	$r3, $r1			! $r3  <- leading zero count for den
-	clz	$ta, $r0			! $ta  <- leading zero count for num
-	sub	$r3, $r3, $ta			! $r3  <- number of bits to shift
-	sll	$r1, $r1, $r3			! $r1  <- den
-	sll	$r2, $r2, $r3			! $r2  <- bit
-#else
-	slli	$r1, $r1, 1			! $r1  <- den<<=1
-	slli	$r4, $r4, 1			! $r4  <- bit<<=1
-	b	.L6				! continue loop
-#endif
-.L5:
-	! ---------------------------------------------------------------------
-	!     while (bit)
-	!     {   if (num>=den)
-	! ---------------------------------------------------------------------
-	slt	$ta, $r0, $r1			! $ta  <- num<den?
-	bnez	$ta, .L9			! if yes, skip
-	! ---------------------------------------------------------------------
-	!         {   num-=den;
-	!             res|=bit;
-	!         }
-	! ---------------------------------------------------------------------
-	sub	$r0, $r0, $r1			! $r0  <- num-=den
-	or	$r2, $r2, $r4			! $r2  <- res|=bit
-.L9:
-	! ---------------------------------------------------------------------
-	!         bit>>=1;
-	!         den>>=1;
-	!     }
-	! }
-	!!if (modwanted)
-	!!    return num;
-	!!return res;
-	! ---------------------------------------------------------------------
-	srli	$r4, $r4, 1			! $r4  <- bit>>=1
-	srli	$r1, $r1, 1			! $r1  <- den>>=1
-	bnez	$r4, .L5			! if bit!=0, continue loop
-.L1:
-	! ---------------------------------------------------------------------
-	! return res;
-	! ---------------------------------------------------------------------
-	move	$r0, $r2			! $r0  <- return value
-	! ---------------------------------------------------------------------
-	! ---------------------------------------------------------------------
-	ret
-	.size	__udivsi3, .-__udivsi3
-#endif /* L_udivsi3 */
-
-
-
-#ifdef L_udivdi3
-
-	!--------------------------------------
-	#ifdef __big_endian__
-		#define  V1H  $r0
-		#define  V1L  $r1
-		#define  V2H  $r2
-		#define  V2L  $r3
-	#else
-		#define  V1H  $r1
-		#define  V1L  $r0
-		#define  V2H  $r3
-		#define  V2L  $r2
-	#endif
-	!--------------------------------------
-
-	.text
-	.align	2
-	.globl	__udivdi3
-	.type	__udivdi3, @function
-__udivdi3:
-	! prologue
-#ifdef __NDS32_ISA_V3M__
-	push25	$r8, 0
-#else
-	smw.adm	$r6, [$sp], $r8, 2
-#endif
-	! end of prologue
-	movi	$r4, 0
-	bal	__udivmoddi4
-	! epilogue
-#ifdef __NDS32_ISA_V3M__
-	pop25	$r8, 0
-#else
-	lmw.bim	$r6, [$sp], $r8, 2
-	ret
-#endif
-	.size	__udivdi3, .-__udivdi3
-#endif /* L_udivdi3 */
-
-
-
-#ifdef L_udivmoddi4
-
-	.text
-	.align	2
-	.globl	fudiv_qrnnd
-	.type	fudiv_qrnnd, @function
-	#ifdef __big_endian__
-		#define P1H     $r0
-		#define P1L     $r1
-		#define P2H     $r2
-		#define P2L     $r3
-		#define W6H     $r4
-		#define W6L     $r5
-		#define OFFSET_L 4
-		#define OFFSET_H 0
-	#else
-		#define P1H     $r1
-		#define P1L     $r0
-		#define P2H     $r3
-		#define P2L     $r2
-		#define W6H     $r5
-		#define W6L     $r4
-		#define OFFSET_L 0
-		#define OFFSET_H 4
-	#endif
-fudiv_qrnnd:
-	!------------------------------------------------------
-	! function:  fudiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator)
-	!            divides a UDWtype, composed by the UWtype integers,HIGH_NUMERATOR (from $r4)
-	!            and LOW_NUMERATOR(from $r5) by DENOMINATOR(from $r6), and places the quotient
-	!            in $r7 and the remainder in $r8.
-	!------------------------------------------------------
-	!  in reg:$r4(n1), $r5(n0), $r6(d0)
-	!  __d1 = ((USItype) (d) >> ((4 * 8) / 2));
-	!  __d0 = ((USItype) (d) & (((USItype) 1 << ((4 * 8) / 2)) - 1));
-	!  __r1 = (n1) % __d1;
-	!  __q1 = (n1) / __d1;
-	!  __m = (USItype) __q1 * __d0;
-	!  __r1 = __r1 * ((USItype) 1 << ((4 * 8) / 2)) | ((USItype) (n0) >> ((4 * 8) / 2));
-	!   if (__r1 < __m)
-	!    {
-	!------------------------------------------------------
-	smw.adm $r0, [$sp], $r4, 2				! store $lp, when use BASELINE_V1,and must store $r0-$r3
-	srli	$r7, $r6, 16					! $r7 = d1 =__ll_highpart (d)
-	movi	$ta, 65535
-	and	  $r8, $r6, $ta       				! $r8 = d0 = __ll_lowpart (d)
-
-	divr	$r9, $r10, $r4, $r7				! $r9 = q1, $r10 = r1
-	and	  $r4, $r5, $ta       				! $r4 = __ll_lowpart (n0)
-	slli	$r10, $r10, 16      				! $r10 = r1 << 16
-	srli	$ta, $r5, 16        				! $ta = __ll_highpart (n0)
-
-	or	$r10, $r10, $ta					! $r10 <- $r0|$r3=__r1
-	mul	$r5, $r9, $r8					! $r5 = m =  __q1*__d0
-	slt	$ta, $r10, $r5					! $ta <- __r1<__m
-	beqz	$ta, .L2					!if yes,skip
-	!------------------------------------------------------
-	!    __q1--, __r1 += (d);
-	!    if (__r1 >= (d))
-	!     {
-	!------------------------------------------------------
-
-	add	$r10, $r10, $r6					!$r10 <- __r1+d=__r1
-	addi	$r9, $r9, -1					!$r9 <- __q1--=__q1
-	slt	$ta, $r10, $r6					!$ta <- __r1<d
-	bnez	$ta, .L2					!if yes,skip
-	!------------------------------------------------------
-	!       if (__r1 < __m)
-	!        {
-	!------------------------------------------------------
-
-	slt	$ta, $r10, $r5					!$ta <- __r1<__m
-	beqz	$ta, .L2					!if yes,skip
-	!------------------------------------------------------
-	!           __q1--, __r1 += (d);
-	!        }
-	!     }
-	!  }
-	!------------------------------------------------------
-
-	addi	$r9, $r9, -1					!$r9 <- __q1--=__q1
-	add	$r10, $r10, $r6					!$r2 <- __r1+d=__r1
-.L2:
-	!------------------------------------------------------
-	!  __r1 -= __m;
-	!  __r0 = __r1 % __d1;
-	!  __q0 = __r1 / __d1;
-	!  __m = (USItype) __q0 * __d0;
-	!  __r0 = __r0 * ((USItype) 1 << ((4 * 8) / 2)) \
-	!        | ((USItype) (n0) & (((USItype) 1 << ((4 * 8) / 2)) - 1));
-	!  if (__r0 < __m)
-	!   {
-	!------------------------------------------------------
-	sub  $r10, $r10, $r5					!$r10 <- __r1-__m=__r1
-	divr	$r7, $r10, $r10, $r7				!$r7 <- r1/__d1=__q0,$r10 <- r1%__d1=__r0
-	slli	$r10, $r10, 16					!$r10 <- __r0<<16
-	mul	$r5, $r8, $r7					!$r5 <- __q0*__d0=__m
-	or	$r10, $r4, $r10					!$r3 <- $r0|__ll_lowpart (n0) =__r0
-	slt	$ta, $r10, $r5					!$ta <- __r0<__m
-	beqz	$ta, .L5					!if yes,skip
-	!------------------------------------------------------
-	!      __q0--, __r0 += (d);
-	!      if (__r0 >= (d))
-	!       {
-	!------------------------------------------------------
-
-	add	$r10, $r10, $r6					!$r10 <- __r0+d=__r0
-	addi	$r7, $r7, -1					!$r7 <- __q0--=__q0
-	slt	$ta, $r10, $r6					!$ta <- __r0<d
-	bnez	$ta, .L5					!if yes,skip
-	!------------------------------------------------------
-	!         if (__r0 < __m)
-	!          {
-	!------------------------------------------------------
-
-	slt	$ta, $r10, $r5					!$ta <- __r0<__m
-	beqz	$ta, .L5					!if yes,skip
-	!------------------------------------------------------
-	!             __q0--, __r0 += (d);
-	!          }
-	!       }
-	!   }
-	!------------------------------------------------------
-
-	add	  $r10, $r10, $r6				!$r3 <- __r0+d=__r0
-	addi	$r7, $r7, -1					!$r2 <- __q0--=__q0
-.L5:
-	!------------------------------------------------------
-	!   __r0 -= __m;
-	!   *q = (USItype) __q1 * ((USItype) 1 << ((4 * 8) / 2)) | __q0;
-	!   *r = __r0;
-	!}
-	!------------------------------------------------------
-
-	sub		$r8, $r10, $r5				!$r8 = r = r0 = __r0-__m
-	slli	$r9, $r9, 16					!$r9 <- __q1<<16
-	or	$r7, $r9, $r7					!$r7 = q = $r9|__q0
-	lmw.bim $r0, [$sp], $r4, 2
-	ret
-	.size	fudiv_qrnnd, .-fudiv_qrnnd
-
-	.align	2
-	.globl	__udivmoddi4
-	.type	__udivmoddi4, @function
-__udivmoddi4:
-	! =====================================================================
-	! stack allocation:
-	! sp+40 +------------------+
-	!       | q1               |
-	! sp+36 +------------------+
-	!       | q0               |
-	! sp+32 +------------------+
-	!       | bm               |
-	! sp+28 +------------------+
-	!       | $lp              |
-	! sp+24 +------------------+
-	!       | $fp              |
-	! sp+20 +------------------+
-	!       | $r6 - $r10       |
-	! sp    +------------------+
-	! =====================================================================
-
-	addi	$sp, $sp, -40
-	smw.bi	$r6, [$sp], $r10, 10
-	!------------------------------------------------------
-	!  d0 = dd.s.low;
-	!  d1 = dd.s.high;
-	!  n0 = nn.s.low;
-	!  n1 = nn.s.high;
-	!  if (d1 == 0)
-	!   {
-	!------------------------------------------------------
-
-	move	$fp, $r4					!$fp <- rp
-	bnez	P2H, .L9					!if yes,skip
-	!------------------------------------------------------
-	!     if (d0 > n1)
-	!      {
-	!------------------------------------------------------
-
-	slt	$ta, P1H, P2L					!$ta <- n1<d0
-	beqz	$ta, .L10					!if yes,skip
-#ifndef __NDS32_PERF_EXT__
-	smw.adm $r0, [$sp], $r5, 0
-	move    $r0, P2L
-	bal __clzsi2
-	move	$r7, $r0
-	lmw.bim $r0, [$sp], $r5, 0
-#else
-	clz  $r7, P2L
-#endif
-	swi     $r7,  [$sp+(28)]
-	beqz	$r7, .L18					!if yes,skip
-	!------------------------------------------------------
-	!         d0 = d0 << bm;
-	!         n1 = (n1 << bm) | (n0 >> ((4 * 8) - bm));
-	!         n0 = n0 << bm;
-	!      }
-	!------------------------------------------------------
-
-	subri	$r5, $r7, 32					!$r5 <- 32-bm
-	srl	$r5, P1L, $r5					!$r5 <- n0>>$r5
-	sll	$r6, P1H, $r7					!$r6 <- n1<<bm
-	or	P1H, $r6, $r5					!P2h <- $r5|$r6=n1
-	sll	P1L, P1L, $r7					!P1H <- n0<<bm=n0
-	sll	P2L, P2L, $r7					!P2L <- d0<<bm=d0
-.L18:
-	!------------------------------------------------------
-	!    fudiv_qrnnd (&q0, &n0, n1, n0, d0);
-	!    q1 = 0;
-	!  } #if (d0 > n1)
-	!------------------------------------------------------
-
-	move 	$r4,P1H						! give fudiv_qrnnd args
-	move 	$r5,P1L						!
-	move 	$r6,P2L						!
-	bal	fudiv_qrnnd					!calcaulte q0 n0
-	movi	$r6, 0						!P1L <- 0
-	swi     $r7,[$sp+32]                                    !q0
-	swi     $r6,[$sp+36]                                    !q1
-	move    P1L,$r8						!n0
-	b	.L19
-.L10:
-	!------------------------------------------------------
-	!  else #if (d0 > n1)
-	!   {
-	!     if(d0 == 0)
-	!------------------------------------------------------
-
-	bnez	P2L, .L20					!if yes,skip
-	!------------------------------------------------------
-	!      d0 = 1 / d0;
-	!------------------------------------------------------
-
-	movi	$r4, 1						!P1L <- 1
-	divr	P2L, $r4, $r4, P2L				!$r9=1/d0,P1L=1%d0
-.L20:
-
-#ifndef __NDS32_PERF_EXT__
-	smw.adm $r0, [$sp], $r5, 0
-	move    $r0, P2L
-	bal __clzsi2
-	move    $r7, $r0
-	lmw.bim $r0, [$sp], $r5, 0
-#else
-	clz  $r7, P2L
-#endif
-	swi     $r7,[$sp+(28)]      ! store bm
-	beqz	$r7, .L28					! if yes,skip
-	!------------------------------------------------------
-	!         b = (4 * 8) - bm;
-	!         d0 = d0 << bm;
-	!         n2 = n1 >> b;
-	!         n1 = (n1 << bm) | (n0 >> b);
-	!         n0 = n0 << bm;
-	!         fudiv_qrnnd (&q1, &n1, n2, n1, d0);
-	!    }
-	!------------------------------------------------------
-
-	subri	$r10, $r7, 32					!$r10 <- 32-bm=b
-	srl	$r4, P1L, $r10					!$r4 <- n0>>b
-	sll	$r5, P1H, $r7					!$r5 <- n1<<bm
-	or	$r5, $r5, $r4					!$r5 <- $r5|$r4=n1  !for fun
-	sll	P2L, P2L, $r7					!P2L <- d0<<bm=d0   !for fun
-	sll	P1L, P1L, $r7					!P1L <- n0<<bm=n0
-	srl	$r4, P1H, $r10					!$r4 <- n1>>b=n2    !for fun
-
-	move    $r6,P2L                     			!for fun
-	bal	fudiv_qrnnd					!caculate q1, n1
-
-	swi  $r7,[$sp+(36)]          ! q1 store
-	move P1H,$r8                 ! n1 store
-
-	move $r4,$r8	             ! prepare for next fudiv_qrnnd()
-	move $r5,P1L
-	move $r6,P2L
-	b	.L29
-.L28:
-	!------------------------------------------------------
-	!    else // bm != 0
-	!     {
-	!        n1 -= d0;
-	!        q1 = 1;
-	!
-	!------------------------------------------------------
-
-	sub	P1H, P1H, P2L					!P1L <- n1-d0=n1
-	movi	$ta, 1						!
-	swi	$ta, [$sp+(36)]	                                !1 -> [$sp+(36)]
-
-	move $r4,P1H						! give fudiv_qrnnd args
-	move $r5,P1L
-	move $r6,P2L
-.L29:
-	!------------------------------------------------------
-	!    fudiv_qrnnd (&q0, &n0, n1, n0, d0);
-	!------------------------------------------------------
-
-	bal	fudiv_qrnnd					!calcuate  q0, n0
-	swi     $r7,[$sp+(32)]  !q0 store
-	move    P1L,$r8		!n0
-.L19:
-	!------------------------------------------------------
-	!    if (rp != 0)
-	!     {
-	!------------------------------------------------------
-
-	beqz	$fp, .L31					!if yes,skip
-	!------------------------------------------------------
-	!         rr.s.low = n0 >> bm;
-	!         rr.s.high = 0;
-	!         *rp = rr.ll;
-	!     }
-	!------------------------------------------------------
-
-	movi    $r5, 0							!$r5 <- 0
-	lwi     $r7,[$sp+(28)]    					!load bm
-	srl	$r4, P1L, $r7     	     				!$r4 <- n0>>bm
-        swi	$r4, [$fp+OFFSET_L]	  !r0				!$r4 -> [$sp+(48)]
-	swi	$r5, [$fp+OFFSET_H]	  !r1				!0 -> [$sp+(52)]
-	b .L31
-.L9:
-	!------------------------------------------------------
-	! else # d1 == 0
-	!  {
-	!     if(d1 > n1)
-	!      {
-	!------------------------------------------------------
-
-	slt	$ta, P1H, P2H					!$ta <- n1<d1
-	beqz	$ta, .L32					!if yes,skip
-	!------------------------------------------------------
-	!         q0 = 0;
-	!	  q1 = 0;
-	!         if (rp != 0)
-	!          {
-	!------------------------------------------------------
-
-	movi	$r5, 0						!$r5 <- 0
-	swi	$r5, [$sp+(32)]	   !q0				!0 -> [$sp+(40)]=q1
-	swi	$r5, [$sp+(36)]    !q1				!0 -> [$sp+(32)]=q0
-	beqz	$fp, .L31					!if yes,skip
-	!------------------------------------------------------
-	!             rr.s.low = n0;
-	!	      rr.s.high = n1;
-	!             *rp = rr.ll;
-	!          }
-	!------------------------------------------------------
-
-	swi	P1L, [$fp+OFFSET_L]					!P1L -> [rp]
-	swi	P1H, [$fp+OFFSET_H]					!P1H -> [rp+4]
-	b	.L31
-.L32:
-#ifndef __NDS32_PERF_EXT__
-	smw.adm $r0, [$sp], $r5, 0
-	move    $r0, P2H
-	bal __clzsi2
-	move    $r7, $r0
-	lmw.bim $r0, [$sp], $r5, 0
-#else
-	clz  $r7,P2H
-#endif
-        swi     $r7,[$sp+(28)] 	                                !$r7=bm  store
-	beqz	$r7, .L42					!if yes,skip
-	!------------------------------------------------------
-	!        USItype m1, m0;
-	!        b = (4 * 8) - bm;
-	!        d1 = (d0 >> b) | (d1 << bm);
-	!        d0 = d0 << bm;
-	!        n2 = n1 >> b;
-	!        n1 = (n0 >> b) | (n1 << bm);
-	!        n0 = n0 << bm;
-	!        fudiv_qrnnd (&q0, &n1, n2, n1, d1);
-	!------------------------------------------------------
-
-	subri	$r10, $r7, 32					!$r10 <- 32-bm=b
-	srl	$r5, P2L, $r10					!$r5 <- d0>>b
-	sll	$r6, P2H, $r7					!$r6 <- d1<<bm
-	or      $r6, $r5, $r6                                   !$r6 <- $r5|$r6=d1  !! func
-	move	P2H, $r6 					!P2H <- d1
-	srl     $r4, P1H, $r10                                  !$r4 <- n1>>b=n2    !!! func
-	srl	$r8, P1L, $r10					!$r8 <- n0>>b       !!$r8
-	sll     $r9, P1H, $r7                                   !$r9 <- n1<<bm
-	or	$r5, $r8, $r9					!$r5 <- $r8|$r9=n1  !func
-	sll     P2L, P2L, $r7                                   !P2L <- d0<<bm=d0
-	sll	P1L, P1L, $r7					!P1L <- n0<<bm=n0
-
-	bal	fudiv_qrnnd					! cal  q0,n1
-	swi     $r7,[$sp+(32)]
-	move    P1H,$r8            ! fudiv_qrnnd (&q0, &n1, n2, n1, d1);
-        move    $r6, $r7           ! from func
-
-	!----------------------------------------------------
-	!       #umul_ppmm (m1, m0, q0, d0);
-	!        do
-	!         {     USItype __x0, __x1, __x2, __x3;
-	!               USItype __ul, __vl, __uh, __vh;
-	!               __ul = ((USItype) (q0) & (((USItype) 1 << ((4 * 8) / 2)) - 1));
-	!               __uh = ((USItype) (q0) >> ((4 * 8) / 2));
-	!               __vl = ((USItype) (d0) & (((USItype) 1 << ((4 * 8) / 2)) - 1));
-	!               __vh = ((USItype) (d0) >> ((4 * 8) / 2));
-	!               __x0 = (USItype) __ul * __vl;
-	!               __x1 = (USItype) __ul * __vh;
-	!               __x2 = (USItype) __uh * __vl;
-	!               __x3 = (USItype) __uh * __vh;
-	!               __x1 += ((USItype) (__x0) >> ((4 * 8) / 2));
-	!               __x1 += __x2;
-	!               if (__x1 < __x2)
-	!                  __x3 += ((USItype) 1 << ((4 * 8) / 2));
-	!               (m1) = __x3 + ((USItype) (__x1) >> ((4 * 8) / 2));
-	!               (m0) = (USItype)(q0*d0);
-	!        }
-	!        if (m1 > n1)
-	!---------------------------------------------------
-#ifdef __NDS32_ISA_V3M__
-        !mulr64  $r4, P2L, $r6
-	smw.adm $r0, [$sp], $r3, 0
-	move	P1L, P2L
-	move	P2L, $r6
-	movi	P1H, 0
-	movi	P2H, 0
-	bal	__muldi3
-	movd44	$r4, $r0
-	lmw.bim $r0, [$sp], $r3, 0
-        move    $r8, W6H
-        move    $r5, W6L
-#else
-        mulr64  $r4, P2L, $r6
-        move    $r8, W6H
-        move    $r5, W6L
-#endif
-	slt	$ta, P1H, $r8					!$ta <- n1<m1
-	bnez	$ta, .L46					!if yes,skip
-	!------------------------------------------------------
-	!   if(m1 == n1)
-	!------------------------------------------------------
-
-	bne	$r8, P1H, .L45					!if yes,skip
-	!------------------------------------------------------
-	!   if(m0 > n0)
-	!------------------------------------------------------
-
-	slt	$ta, P1L, $r5					!$ta <- n0<m0
-	beqz	$ta, .L45					!if yes,skip
-.L46:
-	!------------------------------------------------------
-	!    {
-	!       q0--;
-	!       # sub_ddmmss (m1, m0, m1, m0, d1, d0);
-	!       do
-	!        {   USItype __x;
-	!            __x = (m0) - (d0);
-	!            (m1) = (m1) - (d1) - (__x > (m0));
-	!            (m0) = __x;
-	!        }
-	!    }
-	!------------------------------------------------------
-
-	sub	$r4, $r5, P2L					!$r4 <- m0-d0=__x
-	addi	$r6, $r6, -1					!$r6 <- q0--=q0
-	sub	$r8, $r8, P2H					!$r8 <- m1-d1
-	swi	$r6, [$sp+(32)]	      ! q0			!$r6->[$sp+(32)]
-	slt	$ta, $r5, $r4					!$ta <- m0<__x
-	sub	$r8, $r8, $ta					!$r8 <- P1H-P1L=m1
-	move	$r5, $r4					!$r5 <- __x=m0
-.L45:
-	!------------------------------------------------------
-	!    q1 = 0;
-	!    if (rp != 0)
-	!     {
-	!------------------------------------------------------
-
-	movi	$r4, 0						!$r4 <- 0
-	swi	$r4, [$sp+(36)]					!0 -> [$sp+(40)]=q1
-	beqz	$fp, .L31					!if yes,skip
-	!------------------------------------------------------
-	!      # sub_ddmmss (n1, n0, n1, n0, m1, m0);
-	!      do
-	!       {   USItype __x;
-	!           __x = (n0) - (m0);
-	!           (n1) = (n1) - (m1) - (__x > (n0));
-	!           (n0) = __x;
-	!       }
-	!       rr.s.low = (n1 << b) | (n0 >> bm);
-	!       rr.s.high = n1 >> bm;
-	!       *rp = rr.ll;
-	!------------------------------------------------------
-
-	sub	$r4, P1H, $r8					!$r4 <- n1-m1
-	sub	$r6, P1L, $r5					!$r6 <- n0-m0=__x=n0
-	slt	$ta, P1L, $r6					!$ta <- n0<__x
-	sub	P1H, $r4, $ta					!P1H <- $r4-$ta=n1
-	move    P1L, $r6
-
-	lwi     $r7,[$sp+(28)]         ! load bm
-	subri   $r10,$r7,32
-	sll	$r4, P1H, $r10					!$r4 <- n1<<b
-	srl	$r5, P1L, $r7					!$r5 <- __x>>bm
-	or	$r6, $r5, $r4					!$r6 <- $r5|$r4=rr.s.low
-	srl	$r8, P1H, $r7					!$r8 <- n1>>bm =rr.s.high
-	swi	$r6, [$fp+OFFSET_L]				!
-	swi	$r8, [$fp+OFFSET_H]				!
-	b	.L31
-.L42:
-	!------------------------------------------------------
-	!  else
-	!   {
-	!     if(n1 > d1)
-	!------------------------------------------------------
-
-	slt	$ta, P2H, P1H					!$ta <- P2H<P1H
-	bnez	$ta, .L52					!if yes,skip
-	!------------------------------------------------------
-	!     if (n0 >= d0)
-	!------------------------------------------------------
-
-	slt	$ta, P1L, P2L					!$ta <- P1L<P2L
-	bnez	$ta, .L51					!if yes,skip
-	!------------------------------------------------------
-	!        q0 = 1;
-	!        do
-	!         {   USItype __x;
-	!             __x = (n0) - (d0);
-	!             (n1) = (n1) - (d1) - (__x > (n0));
-	!             (n0) = __x;
-	!         }
-	!------------------------------------------------------
-.L52:
-	sub	$r4, P1H, P2H					!$r4 <- P1H-P2H
-	sub	$r6, P1L, P2L					!$r6 <- no-d0=__x=n0
-	slt	$ta, P1L, $r6					!$ta <- no<__x
-	sub	P1H, $r4, $ta					!P1H <- $r4-$ta=n1
-	move    P1L, $r6					!n0
-	movi	$r5, 1						!
-	swi	$r5, [$sp+(32)]					!1 -> [$sp+(32)]=q0
-	b	.L54
-.L51:
-	!------------------------------------------------------
-	!       q0 = 0;
-	!------------------------------------------------------
-
-	movi    $r5,0
-	swi	$r5, [$sp+(32)]					!$r5=0 -> [$sp+(32)]
-.L54:
-	!------------------------------------------------------
-	!       q1 = 0;
-	!       if (rp != 0)
-	!        {
-	!------------------------------------------------------
-
-	movi	$r5, 0						!
-	swi	$r5, [$sp+(36)]					!0 -> [$sp+(36)]
-	beqz	$fp, .L31
-	!------------------------------------------------------
-	!          rr.s.low = n0;
-	!          rr.s.high = n1;
-	!          *rp = rr.ll;
-	!        }
-	!------------------------------------------------------
-
-	swi	P1L, [$fp+OFFSET_L]				!remainder
-	swi	P1H, [$fp+OFFSET_H]				!
-.L31:
-	!------------------------------------------------------
-	! const DWunion ww = {{.low = q0, .high = q1}};
-	! return ww.ll;
-	!}
-	!------------------------------------------------------
-
-	lwi	P1L, [$sp+(32)]					!quotient
-	lwi	P1H, [$sp+(36)]
-	lmw.bim	$r6, [$sp], $r10, 10
-	addi	$sp, $sp, 12
-	ret
-	.size	__udivmoddi4, .-__udivmoddi4
-#endif /* L_udivmoddi4 */
-
-
-
-#ifdef L_umodsi3
-
-	! =====================================================================
-	.text
-	.align	2
-	.globl	__umodsi3
-	.type	__umodsi3, @function
-__umodsi3:
-	! ---------------------------------------------------------------------
-	!!res=udivmodsi4(a,b,1);
-	! if (den==0)
-	!     return num;
-	! ---------------------------------------------------------------------
-	beqz	$r1, .L1			! if den==0, skip
-	! ---------------------------------------------------------------------
-	! bit=1;
-	! res=0;
-	! ---------------------------------------------------------------------
-	movi	$r4, 1				! $r4  <- bit=1
-#ifndef __OPTIMIZE_SIZE__
-.L6:
-#endif
-	! ---------------------------------------------------------------------
-	! while (den<num
-	! ---------------------------------------------------------------------
-	slt	$ta, $r1, $r0			! $ta  <- den<num?
-	beqz	$ta, .L5			! if no, skip
-	! ---------------------------------------------------------------------
-	!      &&bit&&!(den&(1L<<31)))
-	! ---------------------------------------------------------------------
-	bltz	$r1, .L5			! if den<0, skip
-	! ---------------------------------------------------------------------
-	! {   den<<=1;
-	!     bit<<=1;
-	! }
-	! ---------------------------------------------------------------------
-#if defined (__OPTIMIZE_SIZE__) && ! defined (__NDS32_ISA_V3M__)
-	clz	$r3, $r1			! $r3  <- leading zero count for den
-	clz	$ta, $r0			! $ta  <- leading zero count for num
-	sub	$r3, $r3, $ta			! $r3  <- number of bits to shift
-	sll	$r1, $r1, $r3			! $r1  <- den
-	sll	$r4, $r4, $r3			! $r2  <- bit
-#else
-	slli	$r1, $r1, 1			! $r1  <- den<<=1
-	slli	$r4, $r4, 1			! $r4  <- bit<<=1
-	b	.L6				! continue loop
-#endif
-.L5:
-	! ---------------------------------------------------------------------
-	! while (bit)
-	! {   if (num>=den)
-	!     {   num-=den;
-	!         res|=bit;
-	!     }
-	!     bit>>=1;
-	!     den>>=1;
-	! }
-	!!if (modwanted)
-	!!    return num;
-	!!return res;
-	! ---------------------------------------------------------------------
-	sub	$r2, $r0, $r1			! $r2  <- num-den
-	slt	$ta, $r0, $r1			! $ta  <- num<den?
-	srli	$r4, $r4, 1			! $r4  <- bit>>=1
-	cmovz	$r0, $r2, $ta			! $r0  <- num=(num<den)?num:num-den
-	srli	$r1, $r1, 1			! $r1  <- den>>=1
-	bnez	$r4, .L5			! if bit!=0, continue loop
-.L1:
-	! ---------------------------------------------------------------------
-	! return res;
-	! ---------------------------------------------------------------------
-	ret
-	.size	__umodsi3, .-__umodsi3
-#endif /* L_umodsi3 */
-
-
-
-#ifdef L_umoddi3
-
-	!--------------------------------------
-	#ifdef __big_endian__
-		#define  V1H  $r0
-		#define  V1L  $r1
-		#define  V2H  $r2
-		#define  V2L  $r3
-	#else
-		#define  V1H  $r1
-		#define  V1L  $r0
-		#define  V2H  $r3
-		#define  V2L  $r2
-	#endif
-	!--------------------------------------
-	.text
-	.align	2
-	.globl	__umoddi3
-	.type	__umoddi3, @function
-__umoddi3:
-	! prologue
-	addi	$sp, $sp, -12
-	swi $lp, [$sp+(0)]
-	! end of prologue
-	addi	$r4, $sp, 4
-	bal	__udivmoddi4
-	lwi	$r0, [$sp+(4)]    ! __udivmoddi4 return low when LE mode or return high when BE mode
-	lwi	$r1, [$sp+(8)]    !
-.L82:
-	! epilogue
-	lwi $lp, [$sp+(0)]
-	addi	$sp, $sp, 12
-	ret
-	.size	__umoddi3, .-__umoddi3
-#endif /* L_umoddi3 */
-
-
-
-#ifdef L_muldi3
-
-#ifdef __big_endian__
-	#define P1H	$r0
-	#define P1L	$r1
-	#define P2H	$r2
-	#define P2L	$r3
-
-	#define V2H $r4
-	#define V2L $r5
-#else
-	#define P1H	$r1
-	#define P1L	$r0
-	#define P2H	$r3
-	#define P2L	$r2
-
-	#define V2H $r5
-	#define V2L $r4
-#endif
-
-	! ====================================================================
-	.text
-	.align	2
-	.globl	__muldi3
-	.type	__muldi3, @function
-__muldi3:
-	! parameter passing for libgcc functions normally involves 2 doubles
-	!---------------------------------------
-#ifdef __NDS32_ISA_V3M__
-	! There is no mulr64 instruction in Andes ISA V3M.
-	! So we must provide a sequence of calculations to complete the job.
-	smw.adm   $r6, [$sp], $r9, 0x0
-	zeh33	  $r4, P1L
-	srli      $r7, P1L, 16
-	zeh33     $r5, P2L
-	mul       $r6, $r5, $r4
-	mul33     $r5, $r7
-	srli      $r8, P2L, 16
-	mov55     $r9, $r5
-	maddr32   $r9, $r8, $r4
-	srli      $r4, $r6, 16
-	add       $r4, $r9, $r4
-	slt45     $r4, $r5
-	slli      $r5, $r15, 16
-	maddr32   $r5, $r8, $r7
-	mul       P2L, P1H, P2L
-	srli      $r7, $r4, 16
-	maddr32   P2L, P2H, P1L
-	add333    P1H, $r5, $r7
-	slli      $r4, $r4, 16
-	zeh33     $r6, $r6
-	add333    P1L, $r4, $r6
-	add333    P1H, P2L, P1H
-	lmw.bim   $r6, [$sp], $r9, 0x0
-	ret
-#else /* not  __NDS32_ISA_V3M__ */
-	mul	    $ta, P1L, P2H
-	mulr64	$r4, P1L, P2L
-	maddr32	$ta, P1H, P2L
-	move	  P1L, V2L
-	add	    P1H, $ta, V2H
-	ret
-#endif /* not __NDS32_ISA_V3M__ */
-	.size	__muldi3, .-__muldi3
-#endif /* L_muldi3 */
-
-
-
-#ifdef L_addsub_df
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-	#define P3L     $r4
-	#define P3H     $r5
-	#define O1L     $r7
-	#define O1H	$r8
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-	#define P3H     $r4
-	#define P3L     $r5
-	#define O1H     $r7
-	#define O1L	$r8
-#endif
-	.text
-	.align	2
-	.global  __subdf3
-	.type    __subdf3, @function
-__subdf3:
-	push    $lp
-	pushm   $r6, $r10
-
-	move    $r4, #0x80000000
-	xor     P2H, P2H, $r4
-
-	j       .Lsdpadd
-
-	.global  __adddf3
-	.type    __adddf3, @function
-__adddf3:
-	push    $lp
-	pushm   $r6, $r10
-.Lsdpadd:
-	slli    $r6, P1H, #1
-	srli    $r6, $r6, #21
-	slli    P3H, P1H, #11
-	srli    $r10, P1L, #21
-	or      P3H, P3H, $r10
-	slli    P3L, P1L, #11
-	move    O1L, #0x80000000
-	or      P3H, P3H, O1L
-	slli    $r9, P2H, #1
-	srli    $r9, $r9, #21
-	slli    O1H, P2H, #11
-	srli    $r10, P2L, #21
-	or      O1H, O1H, $r10
-	or      O1H, O1H, O1L
-	slli    O1L, P2L, #11
-
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LEspecA
-
-.LElab1:
-	addi    $r10, $r9, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LEspecB
-
-.LElab2:
-	#NORMd($r4, P2L, P1L)
-	bnez    P3H, .LL1
-	bnez    P3L, .LL2
-	move    $r6, #0
-	j       .LL3
-.LL2:
-	move    P3H, P3L
-	move    P3L, #0
-	move    P2L, #32
-	sub     $r6, $r6, P2L
-.LL1:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, $r5
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, $r5
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, $r4
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-#endif /* __big_endian__ */
-	beqz    P2L, .LL3
-	sub     $r6, $r6, P2L
-	subri   P1L, P2L, #32
-	srl     P1L, P3L, P1L
-	sll     P3L, P3L, P2L
-	sll     P3H, P3H, P2L
-	or      P3H, P3H, P1L
-.LL3:
-	#NORMd End
-
-	#NORMd($r7, P2L, P1L)
-	bnez    O1H, .LL4
-	bnez    O1L, .LL5
-	move    $r9, #0
-	j       .LL6
-.LL5:
-	move    O1H, O1L
-	move    O1L, #0
-	move    P2L, #32
-	sub     $r9, $r9, P2L
-.LL4:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, O1H
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, O1H
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, O1H
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	move	$r0, O1H
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-#endif /* __big_endian__ */
-	beqz    P2L, .LL6
-	sub     $r9, $r9, P2L
-	subri   P1L, P2L, #32
-	srl     P1L, O1L, P1L
-	sll     O1L, O1L, P2L
-	sll     O1H, O1H, P2L
-	or      O1H, O1H, P1L
-.LL6:
-	#NORMd End
-
-	move    $r10, #0x80000000
-	and     P1H, P1H, $r10
-
-	beq     $r6, $r9, .LEadd3
-	slts    $r15, $r9, $r6
-	beqzs8  .Li1
-	sub     $r9, $r6, $r9
-	move    P2L, #0
-.LL7:
-	move    $r10, #0x20
-	slt     $r15, $r9, $r10
-	bnezs8  .LL8
-	or      P2L, P2L, O1L
-	move    O1L, O1H
-	move    O1H, #0
-	addi    $r9, $r9, #0xffffffe0
-	bnez    O1L, .LL7
-.LL8:
-	beqz    $r9, .LEadd3
-	move    P1L, O1H
-	move    $r10, O1L
-	srl     O1L, O1L, $r9
-	srl     O1H, O1H, $r9
-	subri   $r9, $r9, #0x20
-	sll     P1L, P1L, $r9
-	or      O1L, O1L, P1L
-	sll     $r10, $r10, $r9
-	or      P2L, P2L, $r10
-	beqz    P2L, .LEadd3
-	ori     O1L, O1L, #1
-	j       .LEadd3
-.Li1:
-	move    $r15, $r6
-	move    $r6, $r9
-	sub     $r9, $r9, $r15
-	move    P2L, #0
-.LL10:
-	move    $r10, #0x20
-	slt     $r15, $r9, $r10
-	bnezs8  .LL11
-	or      P2L, P2L, P3L
-	move    P3L, P3H
-	move    P3H, #0
-	addi    $r9, $r9, #0xffffffe0
-	bnez    P3L, .LL10
-.LL11:
-	beqz    $r9, .LEadd3
-	move    P1L, P3H
-	move    $r10, P3L
-	srl     P3L, P3L, $r9
-	srl     P3H, P3H, $r9
-	subri   $r9, $r9, #0x20
-	sll     P1L, P1L, $r9
-	or      P3L, P3L, P1L
-	sll     $r10, $r10, $r9
-	or      P2L, P2L, $r10
-	beqz    P2L, .LEadd3
-	ori     P3L, P3L, #1
-
-.LEadd3:
-	xor     $r10, P1H, P2H
-	sltsi   $r15, $r10, #0
-	bnezs8  .LEsub1
-
-	#ADD(P3L, O1L)
-	add     P3L, P3L, O1L
-	slt     $r15, P3L, O1L
-
-	#ADDCC(P3H, O1H)
-	beqzs8  .LL13
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .LL14
-	addi    P3H, P3H, #0x1
-	j       .LL15
-.LL14:
-	move    $r15, #1
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-	j       .LL15
-.LL13:
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-.LL15:
-
-	beqzs8  .LEres
-	andi    $r10, P3L, #1
-	beqz    $r10, .Li3
-	ori     P3L, P3L, #2
-.Li3:
-	srli    P3L, P3L, #1
-	slli    $r10, P3H, #31
-	or      P3L, P3L, $r10
-	srli    P3H, P3H, #1
-	move    $r10, #0x80000000
-	or      P3H, P3H, $r10
-	addi    $r6, $r6, #1
-	subri   $r15, $r6, #0x7ff
-	bnezs8  .LEres
-	move    $r10, #0x7ff00000
-	or      P1H, P1H, $r10
-	move    P1L, #0
-	j       .LEretA
-
-.LEsub1:
-	#SUB(P3L, O1L)
-	move    $r15, P3L
-	sub     P3L, P3L, O1L
-	slt     $r15, $r15, P3L
-
-	#SUBCC(P3H, O1H)
-	beqzs8  .LL16
-	move    $r15, P3H
-	sub     P3H, P3H, O1H
-	slt     $r15, $r15, P3H
-	beqzs8  .LL17
-	subi333 P3H, P3H, #1
-	j       .LL18
-.LL17:
-	move    $r15, P3H
-	subi333 P3H, P3H, #1
-	slt     $r15, $r15, P3H
-	j       .LL18
-.LL16:
-	move    $r15, P3H
-	sub     P3H, P3H, O1H
-	slt     $r15, $r15, P3H
-.LL18:
-
-	beqzs8  .Li5
-	move    $r10, #0x80000000
-	xor     P1H, P1H, $r10
-
-	subri   P3H, P3H, #0
-	beqz    P3L, .LL19
-	subri   P3L, P3L, #0
-	subi45  P3H, #1
-.LL19:
-
-.Li5:
-	#NORMd($r4, $r9, P1L)
-	bnez    P3H, .LL20
-	bnez    P3L, .LL21
-	move    $r6, #0
-	j       .LL22
-.LL21:
-	move    P3H, P3L
-	move    P3L, #0
-	move    $r9, #32
-	sub     $r6, $r6, $r9
-.LL20:
-#ifdef __NDS32_PERF_EXT__
-	clz	$r9, P3H
-#else
-	pushm	$r0, $r5
-	move	$r0, P3H
-	bal	__clzsi2
-	move	$r9, $r0
-	popm	$r0, $r5
-#endif
-	beqz    $r9, .LL22
-	sub     $r6, $r6, $r9
-	subri   P1L, $r9, #32
-	srl     P1L, P3L, P1L
-	sll     P3L, P3L, $r9
-	sll     P3H, P3H, $r9
-	or      P3H, P3H, P1L
-.LL22:
-	#NORMd End
-
-	or      $r10, P3H, P3L
-	bnez    $r10, .LEres
-	move    P1H, #0
-
-.LEres:
-	blez    $r6, .LEund
-
-.LElab8:
-	#ADD(P3L, $0x400)
-	move    $r15, #0x400
-	add     P3L, P3L, $r15
-	slt     $r15, P3L, $r15
-
-	#ADDCC(P3H, $0x0)
-	beqzs8  .LL25
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-.LL25:
-
-	#ADDC($r6, $0x0)
-	add     $r6, $r6, $r15
-	srli    $r10, P3L, #11
-	andi    $r10, $r10, #1
-	sub     P3L, P3L, $r10
-	srli    P1L, P3L, #11
-	slli    $r10, P3H, #21
-	or      P1L, P1L, $r10
-	slli    $r10, P3H, #1
-	srli    $r10, $r10, #12
-	or      P1H, P1H, $r10
-	slli    $r10, $r6, #20
-	or      P1H, P1H, $r10
-
-.LEretA:
-.LE999:
-	popm    $r6, $r10
-	pop     $lp
-	ret5    $lp
-
-.LEspecA:
-	#ADD(P3L, P3L)
-	move    $r15, P3L
-	add     P3L, P3L, P3L
-	slt     $r15, P3L, $r15
-
-	#ADDC(P3H, P3H)
-	add     P3H, P3H, P3H
-	add     P3H, P3H, $r15
-	bnez    $r6, .Li7
-	or      $r10, P3H, P3L
-	beqz    $r10, .Li8
-	j       .LElab1
-.Li8:
-	subri   $r15, $r9, #0x7ff
-	beqzs8  .LEspecB
-	add     P3L, P2H, P2H
-	or      $r10, P3L, P2L
-	bnez    $r10, .LEretB
-	sltsi   $r15, P2H, #0
-	bnezs8  .LEretA
-
-.LEretB:
-	move    P1L, P2L
-	move    P1H, P2H
-	j       .LE999
-.Li7:
-	or      $r10, P3H, P3L
-	bnez    $r10, .LEnan
-	subri   $r15, $r9, #0x7ff
-	bnezs8  .LEretA
-	xor     $r10, P1H, P2H
-	sltsi   $r15, $r10, #0
-	bnezs8  .LEnan
-	j       .LEretB
-
-.LEspecB:
-	#ADD(O1L, O1L)
-	move    $r15, O1L
-	add     O1L, O1L, O1L
-	slt     $r15, O1L, $r15
-
-	#ADDC(O1H, O1H)
-	add     O1H, O1H, O1H
-	add     O1H, O1H, $r15
-	bnez    $r9, .Li11
-	or      $r10, O1H, O1L
-	beqz    $r10, .LEretA
-	j       .LElab2
-.Li11:
-	or      $r10, O1H, O1L
-	beqz    $r10, .LEretB
-
-.LEnan:
-	move    P1H, #0xfff80000
-	move    P1L, #0
-	j       .LEretA
-
-.LEund:
-	subri   $r9, $r6, #1
-	move    P2L, #0
-.LL26:
-	move    $r10, #0x20
-	slt     $r15, $r9, $r10
-	bnezs8  .LL27
-	or      P2L, P2L, P3L
-	move    P3L, P3H
-	move    P3H, #0
-	addi    $r9, $r9, #0xffffffe0
-	bnez    P3L, .LL26
-.LL27:
-	beqz    $r9, .LL28
-	move    P1L, P3H
-	move    $r10, P3L
-	srl     P3L, P3L, $r9
-	srl     P3H, P3H, $r9
-	subri   $r9, $r9, #0x20
-	sll     P1L, P1L, $r9
-	or      P3L, P3L, P1L
-	sll     $r10, $r10, $r9
-	or      P2L, P2L, $r10
-	beqz    P2L, .LL28
-	ori     P3L, P3L, #1
-.LL28:
-	move    $r6, #0
-	j       .LElab8
-	.size   __subdf3, .-__subdf3
-	.size   __adddf3, .-__adddf3
-#endif /* L_addsub_df */
-
-
-
-#ifdef L_mul_sf
-
-#if !defined (__big_endian__)
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#endif
-	.text
-	.align	2
-	.global	__mulsf3
-	.type	__mulsf3, @function
-__mulsf3:
-	push    $lp
-	pushm   $r6, $r10
-
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	srli    $r5, $r1, #23
-	andi    $r5, $r5, #0xff
-	move    $r6, #0x80000000
-	slli    $r2, $r0, #8
-	or      $r2, $r2, $r6
-	slli    $r4, $r1, #8
-	or      $r4, $r4, $r6
-	xor     $r8, $r0, $r1
-	and     $r6, $r6, $r8
-
-	addi    $r8, $r3, #-1
-	slti    $r15, $r8, #0xfe
-	beqzs8  .LFspecA
-
-.LFlab1:
-	addi    $r8, $r5, #-1
-	slti    $r15, $r8, #0xfe
-	beqzs8  .LFspecB
-
-.LFlab2:
-	move    $r10, $r3
-/* This is a 64-bit multiple. ($r2, $r7) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r2, $r2, $r4
-#else
-	pushm	$r0, $r1
-	pushm	$r4, $r5
-	move	P1L, $r2
-	movi	P1H, #0
-	move	P2L, $r4
-	movi	P2H, #0
-	bal	__muldi3
-	movd44	$r2, $r0
-	popm	$r4, $r5
-	popm	$r0, $r1
-#endif
-#ifndef __big_endian__
-	move    $r7, $r2
-	move    $r2, $r3
-#else
-	move	$r7, $r3
-#endif
-	move    $r3, $r10
-
-	beqz    $r7, .Li17
-	ori     $r2, $r2, #1
-
-.Li17:
-	sltsi   $r15, $r2, #0
-	bnezs8  .Li18
-	slli    $r2, $r2, #1
-	addi    $r3, $r3, #-1
-.Li18:
-	addi    $r8, $r5, #0xffffff82
-	add     $r3, $r3, $r8
-	addi    $r8, $r3, #-1
-	slti    $r15, $r8, #0xfe
-	beqzs8  .LFoveund
-
-.LFlab8:
-	#ADD($r2, $0x80)
-	move    $r15, #0x80
-	add     $r2, $r2, $r15
-	slt     $r15, $r2, $r15
-
-	#ADDC($r3, $0x0)
-	add     $r3, $r3, $r15
-	srli    $r8, $r2, #8
-	andi    $r8, $r8, #1
-	sub     $r2, $r2, $r8
-	slli    $r2, $r2, #1
-	srli    $r2, $r2, #9
-	slli    $r8, $r3, #23
-	or      $r2, $r2, $r8
-	or      $r0, $r2, $r6
-
-.LF999:
-	popm    $r6, $r10
-	pop     $lp
-	ret5    $lp
-
-.LFspecA:
-	bnez    $r3, .Li19
-	add     $r2, $r2, $r2
-	beqz    $r2, .Li20
-#ifdef __NDS32_PERF_EXT__
-	clz	$r7, $r2
-#else
-	pushm	$r0, $r5
-	move	$r0, $r2
-	bal	__clzsi2
-	move	$r7, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r3, $r3, $r7
-	sll     $r2, $r2, $r7
-	j       .LFlab1
-.Li20:
-	subri   $r15, $r5, #0xff
-	beqzs8  .LFnan
-	j       .LFzer
-.Li19:
-	add     $r8, $r2, $r2
-	bnez    $r8, .LFnan
-	bnez    $r5, .Li21
-	add     $r8, $r4, $r4
-	beqz    $r8, .LFnan
-.Li21:
-	subri   $r15, $r5, #0xff
-	bnezs8  .LFinf
-
-.LFspecB:
-	bnez    $r5, .Li22
-	add     $r4, $r4, $r4
-	beqz    $r4, .LFzer
-#ifdef __NDS32_PERF_EXT__
-	clz	$r7, $r4
-#else
-	pushm	$r0, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r7, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r5, $r5, $r7
-	sll     $r4, $r4, $r7
-	j       .LFlab2
-
-.LFzer:
-	move    $r0, $r6
-	j       .LF999
-.Li22:
-	add     $r8, $r4, $r4
-	bnez    $r8, .LFnan
-
-.LFinf:
-	move    $r8, #0x7f800000
-	or      $r0, $r6, $r8
-	j       .LF999
-
-.LFnan:
-	move    $r0, #0xffc00000
-	j       .LF999
-
-.LFoveund:
-	bgtz    $r3, .LFinf
-	subri   $r7, $r3, #1
-	slti    $r15, $r7, #0x20
-	beqzs8  .LFzer
-	subri   $r8, $r7, #0x20
-	sll     $r3, $r2, $r8
-	srl     $r2, $r2, $r7
-	beqz    $r3, .Li25
-	ori     $r2, $r2, #2
-.Li25:
-	move    $r3, #0
-	addi    $r8, $r2, #0x80
-	sltsi   $r15, $r8, #0
-	beqzs8  .LFlab8
-	move    $r3, #1
-	j       .LFlab8
-	.size	__mulsf3, .-__mulsf3
-#endif /* L_mul_sf */
-
-
-
-#ifdef L_mul_df
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-	#define P3L     $r4
-	#define P3H     $r5
-	#define O1L     $r7
-	#define O1H	$r8
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-	#define P3H     $r4
-	#define P3L     $r5
-	#define O1H     $r7
-	#define O1L	$r8
-#endif
-	.text
-	.align	2
-	.global	__muldf3
-	.type	__muldf3, @function
-__muldf3:
-	push    $lp
-	pushm   $r6, $r10
-
-	slli    $r6, P1H, #1
-	srli    $r6, $r6, #21
-	slli    P3H, P1H, #11
-	srli    $r10, P1L, #21
-	or      P3H, P3H, $r10
-	slli    P3L, P1L, #11
-	move    O1L, #0x80000000
-	or      P3H, P3H, O1L
-	slli    $r9, P2H, #1
-	srli    $r9, $r9, #21
-	slli    O1H, P2H, #11
-	srli    $r10, P2L, #21
-	or      O1H, O1H, $r10
-	or      O1H, O1H, O1L
-	xor     P1H, P1H, P2H
-	and     P1H, P1H, O1L
-	slli    O1L, P2L, #11
-
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LFspecA
-
-.LFlab1:
-	addi    $r10, $r9, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LFspecB
-
-.LFlab2:
-	addi    $r10, $r9, #0xfffffc02
-	add     $r6, $r6, $r10
-
-	move    $r10, $r8
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r9, $r3) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r8, $r5, $r8
-#else
-	pushm	$r0, $r5
-	move	$r0, $r5
-	movi	$r1, #0
-	move	$r2, $r8
-	movi	$r3, #0
-	bal	__muldi3
-	movd44	$r8, $r0
-	popm	$r0, $r5
-#endif
-	move    $r3, $r8
-#else /* __big_endian__ */
-/* For big endain: ($r9, $r2) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r8, $r4, $r7
-#else
-	pushm	$r0, $r5
-	move	$r1, $r4
-	movi	$r0, #0
-	move	$r3, $r7
-	movi	$r2, #0
-	bal	__muldi3
-	movd44	$r8, $r0
-	popm	$r0, $r5
-#endif
-	move    $r2, $r9
-	move    $r9, $r8
-#endif /* __big_endian__ */
-	move    $r8, $r10
-
-	move    $r10, P1H
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r0, $r2) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r4, $r8
-#else
-	pushm	$r2, $r5
-	move	$r0, $r4
-	movi	$r1, #0
-	move	$r2, $r8
-	movi	$r3, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r2, $r0
-	move    $r0, $r1
-#else /* __big_endian__ */
-/* For big endain: ($r1, $r3) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r5, $r7
-#else
-	pushm	$r2, $r5
-	move	$r1, $r5
-	movi	$r0, #0
-	move	$r3, $r7
-	movi	$r2, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r3, $r1
-	move    $r1, $r0
-#endif /* __big_endian__ */
-	move    P1H, $r10
-
-	#ADD(P2H, P1L)
-	add     P2H, P2H, P1L
-	slt     $r15, P2H, P1L
-
-	#ADDC($r9, $0x0)
-	add     $r9, $r9, $r15
-
-	move    $r10, P1H
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r0, $r8) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r5, $r7
-#else
-	pushm	$r2, $r5
-	move	$r0, $r5
-	movi	$r1, #0
-	move	$r2, $r7
-	movi	$r3, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r8, $r0
-	move    $r0, $r1
-#else /* __big_endian__ */
-/* For big endian: ($r1, $r7) is (high, low). */
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r4, $r8
-#else
-	pushm	$r2, $r5
-	move	$r1, $r4
-	movi	$r0, #0
-	move	$r3, $r8
-	movi	$r2, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move	$r7, $r1
-	move	$r1, $r0
-#endif /* __big_endian__ */
-	move    P1H, $r10
-
-	#ADD(P2L, O1H)
-	add     P2L, P2L, O1H
-	slt     $r15, P2L, O1H
-
-
-	#ADDCC(P2H, P1L)
-	beqzs8  .LL29
-	add     P2H, P2H, P1L
-	slt     $r15, P2H, P1L
-	beqzs8  .LL30
-	addi    P2H, P2H, #0x1
-	j       .LL31
-.LL30:
-	move    $r15, #1
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-	j       .LL31
-.LL29:
-	add     P2H, P2H, P1L
-	slt     $r15, P2H, P1L
-.LL31:
-
-	#ADDC($r9, $0x0)
-	add     $r9, $r9, $r15
-
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r8, $r0) is (high, low). */
-	move    $r10, $r9
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r8, $r4, $r7
-#else
-	pushm	$r0, $r5
-	move	$r0, $r4
-	movi	$r1, #0
-	move	$r2, $r7
-	movi	$r3, #0
-	bal	__muldi3
-	movd44	$r8, $r0
-	popm	$r0, $r5
-#endif
-	move    $r0, $r8
-	move    $r8, $r9
-	move    $r9, $r10
-#else /* __big_endian__ */
-/* For big endian: ($r7, $r1) is (high, low). */
-	move	$r10, $r6
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r6, $r5, $r8
-#else
-	pushm	$r0, $r5
-	move	$r1, $r5
-	movi	$r0, #0
-	move	$r3, $r8
-	movi	$r2, #0
-	bal	__muldi3
-	movd44	$r6, $r0
-	popm	$r0, $r5
-#endif
-	move	$r1, $r7
-	move	$r7, $r6
-	move	$r6, $r10
-#endif /* __big_endian__ */
-
-	#ADD(P2L, O1H)
-	add     P2L, P2L, O1H
-	slt     $r15, P2L, O1H
-
-
-	#ADDCC(P2H, $0x0)
-	beqzs8  .LL34
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-.LL34:
-
-	#ADDC($r9, $0x0)
-	add     $r9, $r9, $r15
-	or      $r10, P1L, P2L
-	beqz    $r10, .Li13
-	ori     P2H, P2H, #1
-.Li13:
-	move    P3H, $r9
-	move    P3L, P2H
-	sltsi   $r15, P3H, #0
-	bnezs8  .Li14
-
-	move    $r15, P3L
-	add     P3L, P3L, P3L
-	slt     $r15, P3L, $r15
-	add     P3H, P3H, P3H
-	add     P3H, P3H, $r15
-	addi    $r6, $r6, #-1
-.Li14:
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LFoveund
-
-	#ADD(P3L, $0x400)
-	move    $r15, #0x400
-	add     P3L, P3L, $r15
-	slt     $r15, P3L, $r15
-
-
-	#ADDCC(P3H, $0x0)
-	beqzs8  .LL37
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-.LL37:
-
-	#ADDC($r6, $0x0)
-	add     $r6, $r6, $r15
-
-.LFlab8:
-	srli    $r10, P3L, #11
-	andi    $r10, $r10, #1
-	sub     P3L, P3L, $r10
-	srli    P1L, P3L, #11
-	slli    $r10, P3H, #21
-	or      P1L, P1L, $r10
-	slli    $r10, P3H, #1
-	srli    $r10, $r10, #12
-	or      P1H, P1H, $r10
-	slli    $r10, $r6, #20
-	or      P1H, P1H, $r10
-
-.LFret:
-.LF999:
-	popm    $r6, $r10
-	pop     $lp
-	ret5    $lp
-
-.LFspecA:
-	#ADD(P3L, P3L)
-	move    $r15, P3L
-	add     P3L, P3L, P3L
-	slt     $r15, P3L, $r15
-
-	#ADDC(P3H, P3H)
-	add     P3H, P3H, P3H
-	add     P3H, P3H, $r15
-	bnez    $r6, .Li15
-	or      $r10, P3H, P3L
-	beqz    $r10, .Li16
-
-
-	#NORMd($r4, P1L, P2H)
-	bnez    P3H, .LL38
-	bnez    P3L, .LL39
-	move    $r6, #0
-	j       .LL40
-.LL39:
-	move    P3H, P3L
-	move    P3L, #0
-	move    P1L, #32
-	sub     $r6, $r6, P1L
-.LL38:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r0, P3H
-#else
-	pushm	$r1, P3H
-	move	$r0, P3H
-	bal	__clzsi2
-	popm	$r1, $r5
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r1, $r4
-#else
-	push	$r0
-	pushm	$r2, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r1, $r0
-	popm	$r2, $r5
-	pop	$r0
-#endif
-#endif /* __big_endian__ */
-	beqz    P1L, .LL40
-	sub     $r6, $r6, P1L
-	subri   P2H, P1L, #32
-	srl     P2H, P3L, P2H
-	sll     P3L, P3L, P1L
-	sll     P3H, P3H, P1L
-	or      P3H, P3H, P2H
-.LL40:
-	#NORMd End
-
-	j       .LFlab1
-.Li16:
-	subri   $r15, $r9, #0x7ff
-	beqzs8  .LFnan
-	j       .LFret
-.Li15:
-	or      $r10, P3H, P3L
-	bnez    $r10, .LFnan
-	bnez    $r9, .Li17
-	slli    $r10, O1H, #1
-	or      $r10, $r10, O1L
-	beqz    $r10, .LFnan
-.Li17:
-	subri   $r15, $r9, #0x7ff
-	bnezs8  .LFinf
-
-.LFspecB:
-	#ADD(O1L, O1L)
-	move    $r15, O1L
-	add     O1L, O1L, O1L
-	slt     $r15, O1L, $r15
-
-	#ADDC(O1H, O1H)
-	add     O1H, O1H, O1H
-	add     O1H, O1H, $r15
-	bnez    $r9, .Li18
-	or      $r10, O1H, O1L
-	beqz    $r10, .Li19
-
-
-	#NORMd($r7, P2L, P1L)
-	bnez    O1H, .LL41
-	bnez    O1L, .LL42
-	move    $r9, #0
-	j       .LL43
-.LL42:
-	move    O1H, O1L
-	move    O1L, #0
-	move    P2L, #32
-	sub     $r9, $r9, P2L
-.LL41:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, $r8
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, $r8
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, $r7
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	move	$r0, $r7
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-#endif /* __big_endian__ */
-	beqz    P2L, .LL43
-	sub     $r9, $r9, P2L
-	subri   P1L, P2L, #32
-	srl     P1L, O1L, P1L
-	sll     O1L, O1L, P2L
-	sll     O1H, O1H, P2L
-	or      O1H, O1H, P1L
-.LL43:
-	#NORMd End
-
-	j       .LFlab2
-.Li19:
-	move    P1L, #0
-	j       .LFret
-.Li18:
-	or      $r10, O1H, O1L
-	bnez    $r10, .LFnan
-
-.LFinf:
-	move    $r10, #0x7ff00000
-	or      P1H, P1H, $r10
-	move    P1L, #0
-	j       .LFret
-
-.LFnan:
-	move    P1H, #0xfff80000
-	move    P1L, #0
-	j       .LFret
-
-.LFoveund:
-	bgtz    $r6, .LFinf
-	subri   P1L, $r6, #1
-	move    P2L, #0
-.LL44:
-	move    $r10, #0x20
-	slt     $r15, P1L, $r10
-	bnezs8  .LL45
-	or      P2L, P2L, P3L
-	move    P3L, P3H
-	move    P3H, #0
-	addi    P1L, P1L, #0xffffffe0
-	bnez    P3L, .LL44
-.LL45:
-	beqz    P1L, .LL46
-	move    P2H, P3H
-	move    $r10, P3L
-	srl     P3L, P3L, P1L
-	srl     P3H, P3H, P1L
-	subri   P1L, P1L, #0x20
-	sll     P2H, P2H, P1L
-	or      P3L, P3L, P2H
-	sll     $r10, $r10, P1L
-	or      P2L, P2L, $r10
-	beqz    P2L, .LL46
-	ori     P3L, P3L, #1
-.LL46:
-	#ADD(P3L, $0x400)
-	move    $r15, #0x400
-	add     P3L, P3L, $r15
-	slt     $r15, P3L, $r15
-
-	#ADDC(P3H, $0x0)
-	add     P3H, P3H, $r15
-	srli    $r6, P3H, #31
-	j       .LFlab8
-	.size __muldf3, .-__muldf3
-#endif /* L_mul_df */
-
-
-
-#ifdef L_div_sf
-
-	.text
-	.align	2
-	.global	__divsf3
-	.type	__divsf3, @function
-__divsf3:
-	push    $lp
-	pushm   $r6, $r10
-
-	move    $r7, #0x80000000
-	srli    $r4, $r0, #23
-	andi    $r4, $r4, #0xff
-	srli    $r6, $r1, #23
-	andi    $r6, $r6, #0xff
-	slli    $r3, $r0, #8
-	or      $r3, $r3, $r7
-	slli    $r5, $r1, #8
-	or      $r5, $r5, $r7
-	xor     $r10, $r0, $r1
-	and     $r7, $r7, $r10
-
-	addi    $r10, $r4, #-1
-	slti    $r15, $r10, #0xfe
-	beqzs8  .LGspecA
-
-.LGlab1:
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0xfe
-	beqzs8  .LGspecB
-
-.LGlab2:
-	slt     $r15, $r3, $r5
-	bnezs8  .Li27
-	srli    $r3, $r3, #1
-	addi    $r4, $r4, #1
-.Li27:
-	srli    $r8, $r5, #14
-	divr    $r0, $r2, $r3, $r8
-	andi    $r9, $r5, #0x3fff
-	mul     $r1, $r9, $r0
-	slli    $r2, $r2, #14
-
-	#SUB($r2, $r1)
-	move    $r15, $r2
-	sub     $r2, $r2, $r1
-	slt     $r15, $r15, $r2
-	beqzs8  .Li28
-	addi    $r0, $r0, #-1
-
-	#ADD($r2, $r5)
-	add     $r2, $r2, $r5
-	slt     $r15, $r2, $r5
-.Li28:
-	divr    $r3, $r2, $r2, $r8
-	mul     $r1, $r9, $r3
-	slli    $r2, $r2, #14
-
-	#SUB($r2, $r1)
-	move    $r15, $r2
-	sub     $r2, $r2, $r1
-	slt     $r15, $r15, $r2
-	beqzs8  .Li29
-	addi    $r3, $r3, #-1
-
-	#ADD($r2, $r5)
-	add     $r2, $r2, $r5
-	slt     $r15, $r2, $r5
-.Li29:
-	slli    $r10, $r0, #14
-	add     $r3, $r3, $r10
-	slli    $r3, $r3, #4
-	beqz    $r2, .Li30
-	ori     $r3, $r3, #1
-.Li30:
-	subri   $r10, $r6, #0x7e
-	add     $r4, $r4, $r10
-	addi    $r10, $r4, #-1
-	slti    $r15, $r10, #0xfe
-	beqzs8  .LGoveund
-
-.LGlab8:
-	#ADD($r3, $0x80)
-	move    $r15, #0x80
-	add     $r3, $r3, $r15
-	slt     $r15, $r3, $r15
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r10, $r3, #8
-	andi    $r10, $r10, #1
-	sub     $r3, $r3, $r10
-	slli    $r3, $r3, #1
-	srli    $r3, $r3, #9
-	slli    $r10, $r4, #23
-	or      $r3, $r3, $r10
-	or      $r0, $r3, $r7
-
-.LG999:
-	popm    $r6, $r10
-	pop     $lp
-	ret5    $lp
-
-.LGspecA:
-	bnez    $r4, .Li31
-	add     $r3, $r3, $r3
-	beqz    $r3, .Li31
-#ifdef __NDS32_PERF_EXT__
-	clz	$r8, $r3
-#else
-	pushm	$r0, $r5
-	move	$r0, $r3
-	bal	__clzsi2
-	move	$r8, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r4, $r4, $r8
-	sll     $r3, $r3, $r8
-	j       .LGlab1
-.Li31:
-	bne     $r6, $r4, .Li33
-	add     $r10, $r5, $r5
-	beqz    $r10, .LGnan
-.Li33:
-	subri   $r15, $r6, #0xff
-	beqzs8  .LGspecB
-	beqz    $r4, .LGzer
-	add     $r10, $r3, $r3
-	bnez    $r10, .LGnan
-	j       .LGinf
-
-.LGspecB:
-	bnez    $r6, .Li34
-	add     $r5, $r5, $r5
-	beqz    $r5, .LGinf
-#ifdef __NDS32_PERF_EXT__
-	clz	$r8, $r5
-#else
-	pushm	$r0, $r5
-	move	$r0, $r5
-	bal	__clzsi2
-	move	$r8, $r0
-	popm	$r0, $r5
-#endif
-	sub     $r6, $r6, $r8
-	sll     $r5, $r5, $r8
-	j       .LGlab2
-.Li34:
-	add     $r10, $r5, $r5
-	bnez    $r10, .LGnan
-
-.LGzer:
-	move    $r0, $r7
-	j       .LG999
-
-.LGoveund:
-	bgtz    $r4, .LGinf
-	subri   $r8, $r4, #1
-	slti    $r15, $r8, #0x20
-	beqzs8  .LGzer
-	subri   $r10, $r8, #0x20
-	sll     $r4, $r3, $r10
-	srl     $r3, $r3, $r8
-	beqz    $r4, .Li37
-	ori     $r3, $r3, #2
-.Li37:
-	move    $r4, #0
-	addi    $r10, $r3, #0x80
-	sltsi   $r15, $r10, #0
-	beqzs8  .LGlab8
-	move    $r4, #1
-	j       .LGlab8
-
-.LGinf:
-	move    $r10, #0x7f800000
-	or      $r0, $r7, $r10
-	j       .LG999
-
-.LGnan:
-	move    $r0, #0xffc00000
-	j       .LG999
-	.size	__divsf3, .-__divsf3
-#endif /* L_div_sf */
-
-
-
-#ifdef L_div_df
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-	#define P3L     $r4
-	#define P3H     $r5
-	#define O1L     $r7
-	#define O1H	$r8
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-	#define P3H     $r4
-	#define P3L     $r5
-	#define O1H     $r7
-	#define O1L	$r8
-#endif
-	.text
-	.align	2
-	.global	__divdf3
-	.type	__divdf3, @function
-__divdf3:
-	push    $lp
-	pushm   $r6, $r10
-
-	slli    $r6, P1H, #1
-	srli    $r6, $r6, #21
-	slli    P3H, P1H, #11
-	srli    $r10, P1L, #21
-	or      P3H, P3H, $r10
-	slli    P3L, P1L, #11
-	move    O1L, #0x80000000
-	or      P3H, P3H, O1L
-	slli    $r9, P2H, #1
-	srli    $r9, $r9, #21
-	slli    O1H, P2H, #11
-	srli    $r10, P2L, #21
-	or      O1H, O1H, $r10
-	or      O1H, O1H, O1L
-	xor     P1H, P1H, P2H
-	and     P1H, P1H, O1L
-	slli    O1L, P2L, #11
-
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LGspecA
-
-.LGlab1:
-	addi    $r10, $r9, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LGspecB
-
-.LGlab2:
-	sub     $r6, $r6, $r9
-	addi    $r6, $r6, #0x3ff
-	srli    P3L, P3L, #1
-	slli    $r10, P3H, #31
-	or      P3L, P3L, $r10
-	srli    P3H, P3H, #1
-	srli    $r9, O1H, #16
-	divr    P2H, P3H, P3H, $r9
-	move    $r10, #0xffff
-	and     P2L, O1H, $r10
-	mul     P1L, P2L, P2H
-	slli    P3H, P3H, #16
-	srli    $r10, P3L, #16
-	or      P3H, P3H, $r10
-
-	#SUB(P3H, P1L)
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .Li20
-
-.Lb21:
-	addi    P2H, P2H, #-1
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .Lb21
-.Li20:
-	divr    $r9, P3H, P3H, $r9
-	mul     P1L, P2L, $r9
-	slli    P3H, P3H, #16
-	move    $r15, #0xffff
-	and     $r10, P3L, $r15
-	or      P3H, P3H, $r10
-
-	#SUB(P3H, P1L)
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .Li22
-
-.Lb23:
-	addi    $r9, $r9, #-1
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .Lb23
-.Li22:
-	slli    P2H, P2H, #16
-	add     P2H, P2H, $r9
-
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r0, $r9) is (high, low). */
-	move    $r10, $r1
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r3, $r7
-#else
-	pushm	$r2, $r5
-	move	$r0, $r3
-	movi	$r1, #0
-	move	$r2, $r7
-	movi	$r3, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r9, $r0
-	move    $r0, $r1
-	move    $r1, $r10
-#else /* __big_endian__ */
-/* For big endian: ($r1, $r9) is (high, low). */
-	move    $r10, $r0
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r2, $r8
-#else
-	pushm	$r2, $r5
-	move	$r1, $r2
-	movi	$r0, #0
-	move	$r3, $r8
-	movi	$r2, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r9, $r1
-	move    $r1, $r0
-	move    $r0, $r10
-#endif /* __big_endian__ */
-
-	move    P3L, #0
-
-	#SUB(P3L, $r9)
-	move    $r15, P3L
-	sub     P3L, P3L, $r9
-	slt     $r15, $r15, P3L
-
-
-	#SUBCC(P3H, P1L)
-	beqzs8  .LL47
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .LL48
-	subi333 P3H, P3H, #1
-	j       .LL49
-.LL48:
-	move    $r15, P3H
-	subi333 P3H, P3H, #1
-	slt     $r15, $r15, P3H
-	j       .LL49
-.LL47:
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-.LL49:
-
-	beqzs8  .Li24
-
-.LGlab3:
-	addi    P2H, P2H, #-1
-
-	#ADD(P3L, O1L)
-	add     P3L, P3L, O1L
-	slt     $r15, P3L, O1L
-
-
-	#ADDCC(P3H, O1H)
-	beqzs8  .LL50
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .LL51
-	addi    P3H, P3H, #0x1
-	j       .LL52
-.LL51:
-	move    $r15, #1
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-	j       .LL52
-.LL50:
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-.LL52:
-
-	beqzs8  .LGlab3
-.Li24:
-	bne     P3H, O1H, .Li25
-	move    P1L, O1L
-	move    P3H, P3L
-	move    $r9, #0
-	move    P2L, $r9
-	j       .Le25
-.Li25:
-	srli    P2L, O1H, #16
-	divr    $r9, P3H, P3H, P2L
-	move    $r10, #0xffff
-	and     $r10, O1H, $r10
-	mul     P1L, $r10, $r9
-	slli    P3H, P3H, #16
-	srli    $r15, P3L, #16
-	or      P3H, P3H, $r15
-
-	#SUB(P3H, P1L)
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .Li26
-
-.Lb27:
-	addi    $r9, $r9, #-1
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .Lb27
-.Li26:
-	divr    P2L, P3H, P3H, P2L
-	mul     P1L, $r10, P2L
-	slli    P3H, P3H, #16
-	move    $r10, #0xffff
-	and     $r10, P3L, $r10
-	or      P3H, P3H, $r10
-
-	#SUB(P3H, P1L)
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .Li28
-
-.Lb29:
-	addi    P2L, P2L, #-1
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .Lb29
-.Li28:
-	slli    $r9, $r9, #16
-	add     $r9, $r9, P2L
-
-/* This is a 64-bit multiple. */
-#ifndef __big_endian__
-/* For little endian: ($r0, $r2) is (high, low). */
-	move    $r10, $r1
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r9, $r7
-#else
-	pushm	$r2, $r5
-	move	$r0, $r9
-	movi	$r1, #0
-	move	$r2, $r7
-	movi	$r3, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move    $r2, $r0
-	move    $r0, $r1
-	move    $r1, $r10
-#else /* __big_endian__ */
-/* For big endian: ($r1, $r3) is (high, low). */
-	move	$r10, $r0
-#ifndef __NDS32_ISA_V3M__
-	mulr64	$r0, $r9, $r8
-#else
-	pushm	$r2, $r5
-	move	$r0, $r9
-	movi	$r1, #0
-	move	$r2, $r7
-	movi	$r3, #0
-	bal	__muldi3
-	popm	$r2, $r5
-#endif
-	move	$r3, $r1
-	move	$r1, $r0
-	move	$r0, $r10
-#endif /* __big_endian__ */
-
-.Le25:
-	move    P3L, #0
-
-	#SUB(P3L, P2L)
-	move    $r15, P3L
-	sub     P3L, P3L, P2L
-	slt     $r15, $r15, P3L
-
-
-	#SUBCC(P3H, P1L)
-	beqzs8  .LL53
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-	beqzs8  .LL54
-	subi333 P3H, P3H, #1
-	j       .LL55
-.LL54:
-	move    $r15, P3H
-	subi333 P3H, P3H, #1
-	slt     $r15, $r15, P3H
-	j       .LL55
-.LL53:
-	move    $r15, P3H
-	sub     P3H, P3H, P1L
-	slt     $r15, $r15, P3H
-.LL55:
-
-	beqzs8  .Li30
-
-.LGlab4:
-	addi    $r9, $r9, #-1
-
-	#ADD(P3L, O1L)
-	add     P3L, P3L, O1L
-	slt     $r15, P3L, O1L
-
-
-	#ADDCC(P3H, O1H)
-	beqzs8  .LL56
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-	beqzs8  .LL57
-	addi    P3H, P3H, #0x1
-	j       .LL58
-.LL57:
-	move    $r15, #1
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-	j       .LL58
-.LL56:
-	add     P3H, P3H, O1H
-	slt     $r15, P3H, O1H
-.LL58:
-
-	beqzs8  .LGlab4
-.Li30:
-	sltsi   $r15, P2H, #0
-	bnezs8  .Li31
-
-	#ADD($r9, $r9)
-	move    $r15, $r9
-	add     $r9, $r9, $r9
-	slt     $r15, $r9, $r15
-
-	#ADDC(P2H, P2H)
-	add     P2H, P2H, P2H
-	add     P2H, P2H, $r15
-	addi    $r6, $r6, #-1
-.Li31:
-	or      $r10, P3H, P3L
-	beqz    $r10, .Li32
-	ori     $r9, $r9, #1
-.Li32:
-	move    P3H, P2H
-	move    P3L, $r9
-	addi    $r10, $r6, #-1
-	slti    $r15, $r10, #0x7fe
-	beqzs8  .LGoveund
-
-	#ADD(P3L, $0x400)
-	move    $r15, #0x400
-	add     P3L, P3L, $r15
-	slt     $r15, P3L, $r15
-
-
-	#ADDCC(P3H, $0x0)
-	beqzs8  .LL61
-	add     P3H, P3H, $r15
-	slt     $r15, P3H, $r15
-.LL61:
-
-	#ADDC($r6, $0x0)
-	add     $r6, $r6, $r15
-
-.LGlab8:
-	srli    $r10, P3L, #11
-	andi    $r10, $r10, #1
-	sub     P3L, P3L, $r10
-	srli    P1L, P3L, #11
-	slli    $r10, P3H, #21
-	or      P1L, P1L, $r10
-	slli    $r10, P3H, #1
-	srli    $r10, $r10, #12
-	or      P1H, P1H, $r10
-	slli    $r10, $r6, #20
-	or      P1H, P1H, $r10
-
-.LGret:
-.LG999:
-	popm    $r6, $r10
-	pop     $lp
-	ret5    $lp
-
-.LGoveund:
-	bgtz    $r6, .LGinf
-	subri   P2H, $r6, #1
-	move    P1L, #0
-.LL62:
-	move    $r10, #0x20
-	slt     $r15, P2H, $r10
-	bnezs8  .LL63
-	or      P1L, P1L, P3L
-	move    P3L, P3H
-	move    P3H, #0
-	addi    P2H, P2H, #0xffffffe0
-	bnez    P3L, .LL62
-.LL63:
-	beqz    P2H, .LL64
-	move    P2L, P3H
-	move    $r10, P3L
-	srl     P3L, P3L, P2H
-	srl     P3H, P3H, P2H
-	subri   P2H, P2H, #0x20
-	sll     P2L, P2L, P2H
-	or      P3L, P3L, P2L
-	sll     $r10, $r10, P2H
-	or      P1L, P1L, $r10
-	beqz    P1L, .LL64
-	ori     P3L, P3L, #1
-.LL64:
-	#ADD(P3L, $0x400)
-	move    $r15, #0x400
-	add     P3L, P3L, $r15
-	slt     $r15, P3L, $r15
-
-	#ADDC(P3H, $0x0)
-	add     P3H, P3H, $r15
-	srli    $r6, P3H, #31
-	j       .LGlab8
-
-.LGspecA:
-	#ADD(P3L, P3L)
-	move    $r15, P3L
-	add     P3L, P3L, P3L
-	slt     $r15, P3L, $r15
-
-	#ADDC(P3H, P3H)
-	add     P3H, P3H, P3H
-	add     P3H, P3H, $r15
-	bnez    $r6, .Li33
-	or      $r10, P3H, P3L
-	beqz    $r10, .Li33
-
-
-	#NORMd($r4, P2H, P2L)
-	bnez    P3H, .LL65
-	bnez    P3L, .LL66
-	move    $r6, #0
-	j       .LL67
-.LL66:
-	move    P3H, P3L
-	move    P3L, #0
-	move    P2H, #32
-	sub     $r6, $r6, P2H
-.LL65:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, $r5
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	move	$r0, $r5
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, $r4
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, $r4
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-#endif /* __big_endian_ */
-	beqz    P2H, .LL67
-	sub     $r6, $r6, P2H
-	subri   P2L, P2H, #32
-	srl     P2L, P3L, P2L
-	sll     P3L, P3L, P2H
-	sll     P3H, P3H, P2H
-	or      P3H, P3H, P2L
-.LL67:
-	#NORMd End
-
-	j       .LGlab1
-.Li33:
-	bne     $r6, $r9, .Li35
-	slli    $r10, O1H, #1
-	or      $r10, $r10, O1L
-	beqz    $r10, .LGnan
-.Li35:
-	subri   $r15, $r9, #0x7ff
-	beqzs8  .LGspecB
-	beqz    $r6, .LGret
-	or      $r10, P3H, P3L
-	bnez    $r10, .LGnan
-
-.LGinf:
-	move    $r10, #0x7ff00000
-	or      P1H, P1H, $r10
-	move    P1L, #0
-	j       .LGret
-
-.LGspecB:
-	#ADD(O1L, O1L)
-	move    $r15, O1L
-	add     O1L, O1L, O1L
-	slt     $r15, O1L, $r15
-
-	#ADDC(O1H, O1H)
-	add     O1H, O1H, O1H
-	add     O1H, O1H, $r15
-	bnez    $r9, .Li36
-	or      $r10, O1H, O1L
-	beqz    $r10, .LGinf
-
-
-	#NORMd($r7, P2H, P2L)
-	bnez    O1H, .LL68
-	bnez    O1L, .LL69
-	move    $r9, #0
-	j       .LL70
-.LL69:
-	move    O1H, O1L
-	move    O1L, #0
-	move    P2H, #32
-	sub     $r9, $r9, P2H
-.LL68:
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, $r8
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	move	$r0, $r8
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r2, $r7
-#else
-	pushm	$r0, $r1
-	pushm	$r3, $r5
-	move	$r0, $r7
-	bal	__clzsi2
-	move	$r2, $r0
-	popm	$r3, $r5
-	popm	$r0, $r1
-#endif
-#endif /* __big_endian__ */
-	beqz    P2H, .LL70
-	sub     $r9, $r9, P2H
-	subri   P2L, P2H, #32
-	srl     P2L, O1L, P2L
-	sll     O1L, O1L, P2H
-	sll     O1H, O1H, P2H
-	or      O1H, O1H, P2L
-.LL70:
-	#NORMd End
-
-	j       .LGlab2
-.Li36:
-	or      $r10, O1H, O1L
-	beqz    $r10, .Li38
-
-.LGnan:
-	move    P1H, #0xfff80000
-.Li38:
-	move    P1L, #0
-	j       .LGret
-	.size __divdf3, .-__divdf3
-#endif /* L_div_df */
-
-
-
-#ifdef L_negate_sf
-
-	.text
-	.align	2
-	.global	__negsf2
-	.type	__negsf2, @function
-__negsf2:
-	push    $lp
-
-	move    $r1, #0x80000000
-	xor     $r0, $r0, $r1
-
-.LN999:
-	pop     $lp
-	ret5    $lp
-	.size __negsf2, .-__negsf2
-#endif /* L_negate_sf */
-
-
-
-#ifdef L_negate_df
-
-#ifndef __big_endian__
-	#define P1H     $r1
-#else
-	#define P1H     $r0
-#endif
-	.text
-	.align	2
-	.global	__negdf2
-	.type	__negdf2, @function
-__negdf2:
-	push    $lp
-
-	move    $r2, #0x80000000
-	xor     P1H, P1H, $r2
-
-.LP999:
-	pop     $lp
-	ret5    $lp
-	.size __negdf2, .-__negdf2
-#endif /* L_negate_df */
-
-
-
-#ifdef L_sf_to_df
-
-#ifndef __big_endian__
-	#define O1L     $r1
-	#define O1H     $r2
-#else
-	#define O1H     $r1
-	#define O1L     $r2
-#endif
-	.text
-	.align	2
-	.global	__extendsfdf2
-	.type	__extendsfdf2, @function
-__extendsfdf2:
-	push    $lp
-
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	move    $r5, #0x80000000
-	and     O1H, $r0, $r5
-	addi    $r5, $r3, #-1
-	slti    $r15, $r5, #0xfe
-	beqzs8  .LJspec
-
-.LJlab1:
-	addi    $r3, $r3, #0x380
-	slli    $r5, $r0, #9
-	srli    $r5, $r5, #12
-	or      O1H, O1H, $r5
-	slli    O1L, $r0, #29
-
-.LJret:
-	slli    $r5, $r3, #20
-	or      O1H, O1H, $r5
-	move    $r0, $r1
-	move    $r1, $r2
-
-.LJ999:
-	pop     $lp
-	ret5    $lp
-
-.LJspec:
-	move    O1L, #0
-	add     $r0, $r0, $r0
-	beqz    $r0, .LJret
-	bnez    $r3, .Li42
-
-.Lb43:
-	addi    $r3, $r3, #-1
-	add     $r0, $r0, $r0
-	move    $r5, #0x800000
-	slt     $r15, $r0, $r5
-	bnezs8  .Lb43
-	j       .LJlab1
-.Li42:
-	move    $r3, #0x7ff
-	move    $r5, #0xff000000
-	slt     $r15, $r5, $r0
-	beqzs8  .LJret
-	move    O1H, #0xfff80000
-	j       .LJret
-	.size __extendsfdf2, .-__extendsfdf2
-#endif /* L_sf_to_df */
-
-
-
-#ifdef L_df_to_sf
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#endif
-	.text
-	.align	2
-	.global	__truncdfsf2
-	.type	__truncdfsf2, @function
-__truncdfsf2:
-	push    $lp
-	pushm   $r6, $r8
-
-	slli    P2H, P1H, #11
-	srli    $r7, P1L, #21
-	or      P2H, P2H, $r7
-	slli    P2L, P1L, #11
-	move    $r7, #0x80000000
-	or      P2H, P2H, $r7
-	and     $r5, P1H, $r7
-	slli    $r4, P1H, #1
-	srli    $r4, $r4, #21
-	addi    $r4, $r4, #0xfffffc80
-	addi    $r7, $r4, #-1
-	slti    $r15, $r7, #0xfe
-	beqzs8  .LKspec
-
-.LKlab1:
-	beqz    P2L, .Li45
-	ori     P2H, P2H, #1
-.Li45:
-	#ADD(P2H, $0x80)
-	move    $r15, #0x80
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r7, P2H, #8
-	andi    $r7, $r7, #1
-	sub     P2H, P2H, $r7
-	slli    P2H, P2H, #1
-	srli    P2H, P2H, #9
-	slli    $r7, $r4, #23
-	or      P2H, P2H, $r7
-	or      $r0, P2H, $r5
-
-.LK999:
-	popm    $r6, $r8
-	pop     $lp
-	ret5    $lp
-
-.LKspec:
-	subri   $r15, $r4, #0x47f
-	bnezs8  .Li46
-	slli    $r7, P2H, #1
-	or      $r7, $r7, P2L
-	beqz    $r7, .Li46
-	move    $r0, #0xffc00000
-	j       .LK999
-.Li46:
-	sltsi   $r15, $r4, #0xff
-	bnezs8  .Li48
-	move    $r7, #0x7f800000
-	or      $r0, $r5, $r7
-	j       .LK999
-.Li48:
-	subri   $r6, $r4, #1
-	move    $r7, #0x20
-	slt     $r15, $r6, $r7
-	bnezs8  .Li49
-	move    $r0, $r5
-	j       .LK999
-.Li49:
-	subri   $r8, $r6, #0x20
-	sll     $r7, P2H, $r8
-	or      P2L, P2L, $r7
-	srl     P2H, P2H, $r6
-	move    $r4, #0
-	move    $r7, #0x80000000
-	or      P2H, P2H, $r7
-	j       .LKlab1
-	.size __truncdfsf2, .-__truncdfsf2
-#endif /* L_df_to_sf */
-
-
-
-#ifdef L_df_to_si
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-#endif
-	.global	__fixdfsi
-	.type	__fixdfsi, @function
-__fixdfsi:
-	push    $lp
-	pushm   $r6, $r6
-
-	slli    $r3, P1H, #11
-	srli    $r6, P1L, #21
-	or      $r3, $r3, $r6
-	move    $r6, #0x80000000
-	or      $r3, $r3, $r6
-	slli    $r6, P1H, #1
-	srli    $r6, $r6, #21
-	subri   $r2, $r6, #0x41e
-	blez    $r2, .LLnaninf
-	move    $r6, #0x20
-	slt     $r15, $r2, $r6
-	bnezs8  .LL72
-	move    $r3, #0
-.LL72:
-	srl     $r3, $r3, $r2
-	sltsi   $r15, P1H, #0
-	beqzs8  .Li50
-	subri   $r3, $r3, #0
-.Li50:
-	move    $r0, $r3
-
-.LL999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-
-.LLnaninf:
-	beqz    P1L, .Li51
-	ori     P1H, P1H, #1
-.Li51:
-	move    $r6, #0x7ff00000
-	slt     $r15, $r6, P1H
-	beqzs8  .Li52
-	move    $r0, #0x80000000
-	j       .LL999
-.Li52:
-	move    $r0, #0x7fffffff
-	j       .LL999
-	.size __fixdfsi, .-__fixdfsi
-#endif /* L_df_to_si */
-
-
-
-#ifdef L_fixsfdi
-
-#ifndef __big_endian__
-	#define O1L     $r1
-	#define O1H     $r2
-#else
-	#define O1H     $r1
-	#define O1L     $r2
-#endif
-	.text
-	.align	2
-	.global	__fixsfdi
-	.type	__fixsfdi, @function
-__fixsfdi:
-	push    $lp
-
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	slli    O1H, $r0, #8
-	move    $r5, #0x80000000
-	or      O1H, O1H, $r5
-	move    O1L, #0
-	sltsi   $r15, $r3, #0xbe
-	beqzs8  .LCinfnan
-	subri   $r3, $r3, #0xbe
-.LL8:
-	move    $r5, #0x20
-	slt     $r15, $r3, $r5
-	bnezs8  .LL9
-	move    O1L, O1H
-	move    O1H, #0
-	addi    $r3, $r3, #0xffffffe0
-	bnez    O1L, .LL8
-.LL9:
-	beqz    $r3, .LL10
-	move    $r4, O1H
-	srl     O1L, O1L, $r3
-	srl     O1H, O1H, $r3
-	subri   $r3, $r3, #0x20
-	sll     $r4, $r4, $r3
-	or      O1L, O1L, $r4
-.LL10:
-	sltsi   $r15, $r0, #0
-	beqzs8  .LCret
-
-	subri   O1H, O1H, #0
-	beqz    O1L, .LL11
-	subri   O1L, O1L, #0
-	subi45  O1H, #1
-.LL11:
-
-.LCret:
-	move    $r0, $r1
-	move    $r1, $r2
-
-.LC999:
-	pop     $lp
-	ret5    $lp
-
-.LCinfnan:
-	sltsi   $r15, $r0, #0
-	bnezs8  .LCret3
-	subri   $r15, $r3, #0xff
-	bnezs8  .Li7
-	slli    $r5, O1H, #1
-	beqz    $r5, .Li7
-
-.LCret3:
-	move    O1H, #0x80000000
-	j       .LCret
-.Li7:
-	move    O1H, #0x7fffffff
-	move    O1L, #-1
-	j       .LCret
-	.size	__fixsfdi, .-__fixsfdi
-#endif /* L_fixsfdi */
-
-
-
-#ifdef L_fixdfdi
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define O1L     $r3
-	#define O1H     $r4
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define O1H     $r3
-	#define O1L     $r4
-#endif
-	.text
-	.align	2
-	.global	__fixdfdi
-	.type	__fixdfdi, @function
-__fixdfdi:
-	push    $lp
-	pushm   $r6, $r6
-
-	slli    $r5, P1H, #1
-	srli    $r5, $r5, #21
-	slli    O1H, P1H, #11
-	srli    $r6, P1L, #21
-	or      O1H, O1H, $r6
-	slli    O1L, P1L, #11
-	move    $r6, #0x80000000
-	or      O1H, O1H, $r6
-	slti    $r15, $r5, #0x43e
-	beqzs8  .LCnaninf
-	subri   $r2, $r5, #0x43e
-.LL14:
-	move    $r6, #0x20
-	slt     $r15, $r2, $r6
-	bnezs8  .LL15
-	move    O1L, O1H
-	move    O1H, #0
-	addi    $r2, $r2, #0xffffffe0
-	bnez    O1L, .LL14
-.LL15:
-	beqz    $r2, .LL16
-	move    P1L, O1H
-	srl     O1L, O1L, $r2
-	srl     O1H, O1H, $r2
-	subri   $r2, $r2, #0x20
-	sll     P1L, P1L, $r2
-	or      O1L, O1L, P1L
-.LL16:
-	sltsi   $r15, P1H, #0
-	beqzs8  .LCret
-
-	subri   O1H, O1H, #0
-	beqz    O1L, .LL17
-	subri   O1L, O1L, #0
-	subi45  O1H, #1
-.LL17:
-
-.LCret:
-	move    P1L, O1L
-	move    P1H, O1H
-
-.LC999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-
-.LCnaninf:
-	sltsi   $r15, P1H, #0
-	bnezs8  .LCret3
-	subri   $r15, $r5, #0x7ff
-	bnezs8  .Li5
-	slli    $r6, O1H, #1
-	or      $r6, $r6, O1L
-	beqz    $r6, .Li5
-
-.LCret3:
-	move    O1H, #0x80000000
-	move    O1L, #0
-	j       .LCret
-.Li5:
-	move    O1H, #0x7fffffff
-	move    O1L, #-1
-	j       .LCret
-	.size	__fixdfdi, .-__fixdfdi
-#endif /* L_fixdfdi */
-
-
-
-#ifdef L_fixunssfsi
-
-	.global	__fixunssfsi
-	.type	__fixunssfsi, @function
-__fixunssfsi:
-	push    $lp
-
-	slli    $r1, $r0, #8
-	move    $r3, #0x80000000
-	or      $r1, $r1, $r3
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	subri   $r2, $r3, #0x9e
-	sltsi   $r15, $r2, #0
-	bnezs8  .LLspec
-	sltsi   $r15, $r2, #0x20
-	bnezs8  .Li45
-	move    $r0, #0
-	j       .LL999
-.Li45:
-	srl     $r1, $r1, $r2
-	sltsi   $r15, $r0, #0
-	beqzs8  .Li46
-	subri   $r1, $r1, #0
-.Li46:
-	move    $r0, $r1
-
-.LL999:
-	pop     $lp
-	ret5    $lp
-
-.LLspec:
-	move    $r3, #0x7f800000
-	slt     $r15, $r3, $r0
-	beqzs8  .Li47
-	move    $r0, #0x80000000
-	j       .LL999
-.Li47:
-	move    $r0, #-1
-	j       .LL999
-	.size	__fixunssfsi, .-__fixunssfsi
-#endif /* L_fixunssfsi */
-
-
-
-#ifdef L_fixunsdfsi
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-#endif
-	.text
-	.align	2
-	.global	__fixunsdfsi
-	.type	__fixunsdfsi, @function
-__fixunsdfsi:
-	push    $lp
-	pushm   $r6, $r6
-
-	slli    $r3, P1H, #11
-	srli    $r6, P1L, #21
-	or      $r3, $r3, $r6
-	move    $r6, #0x80000000
-	or      $r3, $r3, $r6
-	slli    $r6, P1H, #1
-	srli    $r6, $r6, #21
-	subri   $r2, $r6, #0x41e
-	sltsi   $r15, $r2, #0
-	bnezs8  .LNnaninf
-	move    $r6, #0x20
-	slt     $r15, $r2, $r6
-	bnezs8  .LL73
-	move    $r3, #0
-.LL73:
-	srl     $r3, $r3, $r2
-	sltsi   $r15, P1H, #0
-	beqzs8  .Li53
-	subri   $r3, $r3, #0
-.Li53:
-	move    $r0, $r3
-
-.LN999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-
-.LNnaninf:
-	beqz    P1L, .Li54
-	ori     P1H, P1H, #1
-.Li54:
-	move    $r6, #0x7ff00000
-	slt     $r15, $r6, P1H
-	beqzs8  .Li55
-	move    $r0, #0x80000000
-	j       .LN999
-.Li55:
-	move    $r0, #-1
-	j       .LN999
-	.size __fixunsdfsi, .-__fixunsdfsi
-#endif /* L_fixunsdfsi */
-
-
-
-#ifdef L_fixunssfdi
-
-#ifndef __big_endian__
-	#define O1L     $r1
-	#define O1H     $r2
-#else
-	#define O1H     $r1
-	#define O1L     $r2
-#endif
-	.text
-	.align	2
-	.global	__fixunssfdi
-	.type	__fixunssfdi, @function
-__fixunssfdi:
-	push    $lp
-
-	srli    $r3, $r0, #23
-	andi    $r3, $r3, #0xff
-	slli    O1H, $r0, #8
-	move    $r5, #0x80000000
-	or      O1H, O1H, $r5
-	move    O1L, #0
-	sltsi   $r15, $r3, #0xbe
-	beqzs8  .LDinfnan
-	subri   $r3, $r3, #0xbe
-.LL12:
-	move    $r5, #0x20
-	slt     $r15, $r3, $r5
-	bnezs8  .LL13
-	move    O1L, O1H
-	move    O1H, #0
-	addi    $r3, $r3, #0xffffffe0
-	bnez    O1L, .LL12
-.LL13:
-	beqz    $r3, .LL14
-	move    $r4, O1H
-	srl     O1L, O1L, $r3
-	srl     O1H, O1H, $r3
-	subri   $r3, $r3, #0x20
-	sll     $r4, $r4, $r3
-	or      O1L, O1L, $r4
-.LL14:
-	sltsi   $r15, $r0, #0
-	beqzs8  .LDret
-
-	subri   O1H, O1H, #0
-	beqz    O1L, .LL15
-	subri   O1L, O1L, #0
-	subi45  O1H, #1
-.LL15:
-
-.LDret:
-	move    $r0, $r1
-	move    $r1, $r2
-
-.LD999:
-	pop     $lp
-	ret5    $lp
-
-.LDinfnan:
-	move    O1H, #0x80000000
-	move    O1L, #0
-	j       .LDret
-	.size	__fixunssfdi, .-__fixunssfdi
-#endif /* L_fixunssfdi */
-
-
-
-#ifdef L_fixunsdfdi
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define O1L     $r3
-	#define O1H     $r4
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define O1H     $r3
-	#define O1L     $r4
-#endif
-	.text
-	.align	2
-	.global	__fixunsdfdi
-	.type	__fixunsdfdi, @function
-__fixunsdfdi:
-	push    $lp
-	pushm   $r6, $r6
-
-	slli    $r5, P1H, #1
-	srli    $r5, $r5, #21
-	slli    O1H, P1H, #11
-	srli    $r6, P1L, #21
-	or      O1H, O1H, $r6
-	slli    O1L, P1L, #11
-	move    $r6, #0x80000000
-	or      O1H, O1H, $r6
-	slti    $r15, $r5, #0x43e
-	beqzs8  .LDnaninf
-	subri   $r2, $r5, #0x43e
-.LL18:
-	move    $r6, #0x20
-	slt     $r15, $r2, $r6
-	bnezs8  .LL19
-	move    O1L, O1H
-	move    O1H, #0
-	addi    $r2, $r2, #0xffffffe0
-	bnez    O1L, .LL18
-.LL19:
-	beqz    $r2, .LL20
-	move    P1L, O1H
-	srl     O1L, O1L, $r2
-	srl     O1H, O1H, $r2
-	subri   $r2, $r2, #0x20
-	sll     P1L, P1L, $r2
-	or      O1L, O1L, P1L
-.LL20:
-	sltsi   $r15, P1H, #0
-	beqzs8  .LDret
-
-	subri   O1H, O1H, #0
-	beqz    O1L, .LL21
-	subri   O1L, O1L, #0
-	subi45  O1H, #1
-.LL21:
-
-.LDret:
-	move    P1L, O1L
-	move    P1H, O1H
-
-.LD999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-
-.LDnaninf:
-	move    O1H, #0x80000000
-	move    O1L, #0
-	j       .LDret
-	.size	__fixunsdfdi, .-__fixunsdfdi
-#endif /* L_fixunsdfdi */
-
-
-
-#ifdef L_si_to_sf
-
-	.text
-	.align	2
-	.global	__floatsisf
-	.type	__floatsisf, @function
-__floatsisf:
-	push    $lp
-
-	move    $r4, #0x80000000
-	and     $r2, $r0, $r4
-	beqz    $r0, .Li39
-	sltsi   $r15, $r0, #0
-	beqzs8  .Li40
-	subri   $r0, $r0, #0
-.Li40:
-	move    $r1, #0x9e
-#ifdef __NDS32_PERF_EXT__
-	clz	$r3, $r0
-#else
-	pushm	$r0, $r2
-	pushm	$r4, $r5
-	bal	__clzsi2
-	move	$r3, $r0
-	popm	$r4, $r5
-	popm	$r0, $r2
-#endif
-	sub     $r1, $r1, $r3
-	sll     $r0, $r0, $r3
-
-	#ADD($r0, $0x80)
-	move    $r15, #0x80
-	add     $r0, $r0, $r15
-	slt     $r15, $r0, $r15
-
-	#ADDC($r1, $0x0)
-	add     $r1, $r1, $r15
-	srai    $r4, $r0, #8
-	andi    $r4, $r4, #1
-	sub     $r0, $r0, $r4
-	slli    $r0, $r0, #1
-	srli    $r0, $r0, #9
-	slli    $r4, $r1, #23
-	or      $r0, $r0, $r4
-.Li39:
-	or      $r0, $r0, $r2
-
-.LH999:
-	pop     $lp
-	ret5    $lp
-	.size	__floatsisf, .-__floatsisf
-#endif /* L_si_to_sf */
-
-
-
-#ifdef L_si_to_df
-
-#ifndef __big_endian__
-	#define O1L     $r1
-	#define O1H     $r2
-	#define O2L     $r4
-	#define O2H	$r5
-#else
-	#define O1H     $r1
-	#define O1L     $r2
-	#define O2H     $r4
-	#define O2L	$r5
-#endif
-	.text
-	.align	2
-	.global	__floatsidf
-	.type	__floatsidf, @function
-__floatsidf:
-	push    $lp
-	pushm   $r6, $r6
-
-	move    O1L, #0
-	move    O2H, O1L
-	move    $r3, O1L
-	move    O1H, $r0
-	beqz    O1H, .Li39
-	sltsi   $r15, O1H, #0
-	beqzs8  .Li40
-	move    O2H, #0x80000000
-
-	subri   O1H, O1H, #0
-	beqz    O1L, .LL71
-	subri   O1L, O1L, #0
-	subi45  O1H, #1
-.LL71:
-.Li40:
-	move    $r3, #0x41e
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r4, $r2
-#else
-	pushm	$r0, $r3
-	push	$r5
-	move	$r0, $r2
-	bal	__clzsi2
-	move	$r4, $r0
-	pop	$r5
-	popm	$r0, $r3
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r5, $r1
-#else
-	pushm	$r0, $r4
-	move	$r0, $r1
-	bal	__clzsi2
-	move	$r5, $r0
-	popm	$r0, $r4
-#endif
-#endif /* __big_endian__ */
-	sub     $r3, $r3, O2L
-	sll     O1H, O1H, O2L
-.Li39:
-	srli    O2L, O1L, #11
-	slli    $r6, O1H, #21
-	or      O2L, O2L, $r6
-	slli    $r6, O1H, #1
-	srli    $r6, $r6, #12
-	or      O2H, O2H, $r6
-	slli    $r6, $r3, #20
-	or      O2H, O2H, $r6
-	move    $r0, $r4
-	move    $r1, $r5
-
-.LH999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-	.size __floatsidf, .-__floatsidf
-#endif /* L_si_to_df */
-
-
-
-#ifdef L_floatdisf
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#endif
-	.text
-	.align	2
-	.global	__floatdisf
-	.type	__floatdisf, @function
-__floatdisf:
-	push    $lp
-	pushm   $r6, $r7
-
-	move    $r7, #0x80000000
-	and     $r5, P1H, $r7
-	move    P2H, P1H
-	move    P2L, P1L
-	or      $r7, P1H, P1L
-	beqz    $r7, .Li1
-	sltsi   $r15, P1H, #0
-	beqzs8  .Li2
-
-	subri   P2H, P2H, #0
-	beqz    P2L, .LL1
-	subri   P2L, P2L, #0
-	subi45  P2H, #1
-.LL1:
-.Li2:
-	move    $r4, #0xbe
-
-
-	#NORMd($r2, $r6, P1L)
-	bnez    P2H, .LL2
-	bnez    P2L, .LL3
-	move    $r4, #0
-	j       .LL4
-.LL3:
-	move    P2H, P2L
-	move    P2L, #0
-	move    $r6, #32
-	sub     $r4, $r4, $r6
-.LL2:
-#ifdef __NDS32_PERF_EXT__
-	clz	$r6, P2H
-#else
-	pushm	$r0, $r5
-	move	$r0, P2H
-	bal	__clzsi2
-	move	$r6, $r0
-	popm	$r0, $r5
-#endif
-	beqz    $r6, .LL4
-	sub     $r4, $r4, $r6
-	subri   P1L, $r6, #32
-	srl     P1L, P2L, P1L
-	sll     P2L, P2L, $r6
-	sll     P2H, P2H, $r6
-	or      P2H, P2H, P1L
-.LL4:
-	#NORMd End
-
-	beqz    P2L, .Li3
-	ori     P2H, P2H, #1
-.Li3:
-	#ADD(P2H, $0x80)
-	move    $r15, #0x80
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r7, P2H, #8
-	andi    $r7, $r7, #1
-	sub     P2H, P2H, $r7
-	slli    P2H, P2H, #1
-	srli    P2H, P2H, #9
-	slli    $r7, $r4, #23
-	or      P2H, P2H, $r7
-.Li1:
-	or      $r0, P2H, $r5
-
-.LA999:
-	popm    $r6, $r7
-	pop     $lp
-	ret5    $lp
-	.size	__floatdisf, .-__floatdisf
-#endif /* L_floatdisf */
-
-
-
-#ifdef L_floatdidf
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-	#define O1L     $r5
-	#define O1H     $r6
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-	#define O1H     $r5
-	#define O1L     $r6
-#endif
-	.text
-	.align	2
-	.global	__floatdidf
-	.type	__floatdidf, @function
-__floatdidf:
-	push    $lp
-	pushm   $r6, $r8
-
-	move    $r4, #0
-	move    $r7, $r4
-	move    P2H, P1H
-	move    P2L, P1L
-	or      $r8, P1H, P1L
-	beqz    $r8, .Li1
-	move    $r4, #0x43e
-	sltsi   $r15, P1H, #0
-	beqzs8  .Li2
-	move    $r7, #0x80000000
-
-	subri   P2H, P2H, #0
-	beqz    P2L, .LL1
-	subri   P2L, P2L, #0
-	subi45  P2H, #1
-.LL1:
-
-.Li2:
-	#NORMd($r2, O1H, O1L)
-	bnez    P2H, .LL2
-	bnez    P2L, .LL3
-	move    $r4, #0
-	j       .LL4
-.LL3:
-	move    P2H, P2L
-	move    P2L, #0
-	move    O1H, #32
-	sub     $r4, $r4, O1H
-.LL2:
-#ifdef __NDS32_PERF_EXT__
-	clz	O1H, P2H
-#else /* not __NDS32_PERF_EXT__ */
-/*
-  Replace clz with function call.
-	clz     O1H, P2H
-  EL:	clz     $r6, $r3
-  EB:	clz	$r5, $r2
-*/
-#ifndef __big_endian__
-	pushm	$r0, $r5
-	move	$r0, $r3
-	bal	__clzsi2
-	move	$r6, $r0
-	popm	$r0, $r5
-#else
-	pushm	$r0, $r4
-	move	$r0, $r2
-	bal	__clzsi2
-	move	$r5, $r0
-	popm	$r0, $r4
-#endif
-#endif /* not __NDS32_PERF_EXT__ */
-	beqz    O1H, .LL4
-	sub     $r4, $r4, O1H
-	subri   O1L, O1H, #32
-	srl     O1L, P2L, O1L
-	sll     P2L, P2L, O1H
-	sll     P2H, P2H, O1H
-	or      P2H, P2H, O1L
-.LL4:
-	#NORMd End
-
-	#ADD(P2L, $0x400)
-	move    $r15, #0x400
-	add     P2L, P2L, $r15
-	slt     $r15, P2L, $r15
-
-
-	#ADDCC(P2H, $0x0)
-	beqzs8  .LL7
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-.LL7:
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r8, P2L, #11
-	andi    $r8, $r8, #1
-	sub     P2L, P2L, $r8
-.Li1:
-	srli    O1L, P2L, #11
-	slli    $r8, P2H, #21
-	or      O1L, O1L, $r8
-	slli    O1H, P2H, #1
-	srli    O1H, O1H, #12
-	slli    $r8, $r4, #20
-	or      O1H, O1H, $r8
-	or      O1H, O1H, $r7
-	move    P1L, O1L
-	move    P1H, O1H
-
-.LA999:
-	popm    $r6, $r8
-	pop     $lp
-	ret5    $lp
-	.size	__floatdidf, .-__floatdidf
-#endif /* L_floatdidf */
-
-
-
-#ifdef L_floatunsisf
-
-	.text
-	.align	2
-	.global	__floatunsisf
-	.type	__floatunsisf, @function
-__floatunsisf:
-	push    $lp
-
-	beqz    $r0, .Li41
-	move    $r2, #0x9e
-#ifdef __NDS32_PERF_EXT__
-	clz	$r1, $r0
-#else
-	push	$r0
-	pushm	$r2, $r5
-	bal	__clzsi2
-	move	$r1, $r0
-	popm	$r2, $r5
-	pop	$r0
-#endif
-
-	sub     $r2, $r2, $r1
-	sll     $r0, $r0, $r1
-
-	#ADD($r0, $0x80)
-	move    $r15, #0x80
-	add     $r0, $r0, $r15
-	slt     $r15, $r0, $r15
-
-	#ADDC($r2, $0x0)
-	add     $r2, $r2, $r15
-	srli    $r3, $r0, #8
-	andi    $r3, $r3, #1
-	sub     $r0, $r0, $r3
-	slli    $r0, $r0, #1
-	srli    $r0, $r0, #9
-	slli    $r3, $r2, #23
-	or      $r0, $r0, $r3
-
-.Li41:
-.LI999:
-	pop     $lp
-	ret5    $lp
-	.size	__floatunsisf, .-__floatunsisf
-#endif /* L_floatunsisf */
-
-
-
-#ifdef L_floatunsidf
-
-#ifndef __big_endian__
-	#define O1L     $r1
-	#define O1H     $r2
-	#define O2L     $r4
-	#define O2H	$r5
-#else
-	#define O1H     $r1
-	#define O1L     $r2
-	#define O2H     $r4
-	#define O2L	$r5
-#endif
-	.text
-	.align	2
-	.global	__floatunsidf
-	.type	__floatunsidf, @function
-__floatunsidf:
-	push    $lp
-	pushm   $r6, $r6
-
-	move    O1L, #0
-	move    $r3, O1L
-	move    O1H, $r0
-	beqz    O1H, .Li41
-	move    $r3, #0x41e
-#ifndef __big_endian__
-#ifdef __NDS32_PERF_EXT__
-	clz	$r5, $r2
-#else
-	pushm	$r0, $r4
-	move	$r0, $r2
-	bal	__clzsi2
-	move	$r5, $r0
-	popm	$r0, $r4
-#endif
-#else /* __big_endian__ */
-#ifdef __NDS32_PERF_EXT__
-	clz	$r4, $r1
-#else
-	pushm	$r0, $r3
-	push	$r5
-	move	$r0, $r1
-	bal	__clzsi2
-	move	$r4, $r0
-	pop	$r5
-	popm	$r0, $r3
-#endif
-#endif /* __big_endian__ */
-	sub     $r3, $r3, O2H
-	sll     O1H, O1H, O2H
-.Li41:
-	srli    O2L, O1L, #11
-	slli    $r6, O1H, #21
-	or      O2L, O2L, $r6
-	slli    O2H, O1H, #1
-	srli    O2H, O2H, #12
-	slli    $r6, $r3, #20
-	or      O2H, O2H, $r6
-	move    $r0, $r4
-	move    $r1, $r5
-
-.LI999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-	.size __floatunsidf, .-__floatunsidf
-#endif /* L_floatunsidf */
-
-
-
-#ifdef L_floatundisf
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#endif
-	.text
-	.align	2
-	.global	__floatundisf
-	.type	__floatundisf, @function
-__floatundisf:
-	push    $lp
-	pushm   $r6, $r6
-
-	move    P2H, P1H
-	move    P2L, P1L
-	or      $r6, P1H, P1L
-	beqz    $r6, .Li4
-	move    $r4, #0xbe
-
-
-	#NORMd($r2, $r5, P1L)
-	bnez    P2H, .LL5
-	bnez    P2L, .LL6
-	move    $r4, #0
-	j       .LL7
-.LL6:
-	move    P2H, P2L
-	move    P2L, #0
-	move    $r5, #32
-	sub     $r4, $r4, $r5
-.LL5:
-#ifdef __NDS32_PERF_EXT__
-	clz	$r5, P2H
-#else
-	pushm	$r0, $r4
-	move	$r0, P2H
-	bal	__clzsi2
-	move	$r5, $r0
-	popm	$r0, $r4
-#endif
-	beqz    $r5, .LL7
-	sub     $r4, $r4, $r5
-	subri   P1L, $r5, #32
-	srl     P1L, P2L, P1L
-	sll     P2L, P2L, $r5
-	sll     P2H, P2H, $r5
-	or      P2H, P2H, P1L
-.LL7:
-	#NORMd End
-
-	beqz    P2L, .Li5
-	ori     P2H, P2H, #1
-.Li5:
-	#ADD(P2H, $0x80)
-	move    $r15, #0x80
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r6, P2H, #8
-	andi    $r6, $r6, #1
-	sub     P2H, P2H, $r6
-	slli    P2H, P2H, #1
-	srli    P2H, P2H, #9
-	slli    $r6, $r4, #23
-	or      P2H, P2H, $r6
-.Li4:
-	move    $r0, P2H
-
-.LB999:
-	popm    $r6, $r6
-	pop     $lp
-	ret5    $lp
-	.size	__floatundisf, .-__floatundisf
-#endif /* L_floatundisf */
-
-
-
-#ifdef L_floatundidf
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-	#define O1L     $r5
-	#define O1H     $r6
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-	#define O1H     $r5
-	#define O1L     $r6
-#endif
-	.text
-	.align	2
-	.global	__floatundidf
-	.type	__floatundidf, @function
-__floatundidf:
-	push    $lp
-	pushm   $r6, $r7
-
-	move    $r4, #0
-	move    P2H, P1H
-	move    P2L, P1L
-	or      $r7, P1H, P1L
-	beqz    $r7, .Li3
-	move    $r4, #0x43e
-
-
-	#NORMd($r2, O1H, O1L)
-	bnez    P2H, .LL8
-	bnez    P2L, .LL9
-	move    $r4, #0
-	j       .LL10
-.LL9:
-	move    P2H, P2L
-	move    P2L, #0
-	move    O1H, #32
-	sub     $r4, $r4, O1H
-.LL8:
-#ifdef __NDS32_PERF_EXT__
-	clz	O1H, P2H
-#else /* not __NDS32_PERF_EXT__ */
-/*
-  Replace clz with function call.
-	clz     O1H, P2H
-  EL:	clz     $r6, $r3
-  EB:	clz	$r5, $r2
-*/
-#ifndef __big_endian__
-	pushm	$r0, $r5
-	move	$r0, $r3
-	bal	__clzsi2
-	move	$r6, $r0
-	popm	$r0, $r5
-#else
-	pushm	$r0, $r4
-	move	$r0, $r2
-	bal	__clzsi2
-	move	$r5, $r0
-	popm	$r0, $r4
-#endif
-#endif /* not __NDS32_PERF_EXT__ */
-	beqz    O1H, .LL10
-	sub     $r4, $r4, O1H
-	subri   O1L, O1H, #32
-	srl     O1L, P2L, O1L
-	sll     P2L, P2L, O1H
-	sll     P2H, P2H, O1H
-	or      P2H, P2H, O1L
-.LL10:
-	#NORMd End
-
-	#ADD(P2L, $0x400)
-	move    $r15, #0x400
-	add     P2L, P2L, $r15
-	slt     $r15, P2L, $r15
-
-
-	#ADDCC(P2H, $0x0)
-	beqzs8  .LL13
-	add     P2H, P2H, $r15
-	slt     $r15, P2H, $r15
-.LL13:
-
-	#ADDC($r4, $0x0)
-	add     $r4, $r4, $r15
-	srli    $r7, P2L, #11
-	andi    $r7, $r7, #1
-	sub     P2L, P2L, $r7
-.Li3:
-	srli    O1L, P2L, #11
-	slli    $r7, P2H, #21
-	or      O1L, O1L, $r7
-	slli    O1H, P2H, #1
-	srli    O1H, O1H, #12
-	slli    $r7, $r4, #20
-	or      O1H, O1H, $r7
-	move    P1L, O1L
-	move    P1H, O1H
-
-.LB999:
-	popm    $r6, $r7
-	pop     $lp
-	ret5    $lp
-	.size	__floatundidf, .-__floatundidf
-#endif /* L_floatundidf */
-
-
-
-#ifdef L_compare_sf
-
-	.text
-	.align	2
-	.global	__cmpsf2
-	.type	__cmpsf2, @function
-__cmpsf2:
-	.global	__eqsf2
-	.type	__eqsf2, @function
-__eqsf2:
-	.global	__ltsf2
-	.type	__ltsf2, @function
-__ltsf2:
-	.global	__lesf2
-	.type	__lesf2, @function
-__lesf2:
-	.global	__nesf2
-	.type	__nesf2, @function
-__nesf2:
-	move    $r4, #1
-	j	.LA
-
-	.global	__gesf2
-	.type	__gesf2, @function
-__gesf2:
-	.global	__gtsf2
-	.type	__gtsf2, @function
-__gtsf2:
-	move	$r4, #-1
-.LA:
-	push    $lp
-
-	slli    $r2, $r0, #1
-	slli    $r3, $r1, #1
-	or      $r5, $r2, $r3
-	beqz    $r5, .LMequ
-	move    $r5, #0xff000000
-	slt     $r15, $r5, $r2
-	bnezs8  .LMnan
-	slt     $r15, $r5, $r3
-	bnezs8  .LMnan
-	srli    $r2, $r2, #1
-	sltsi   $r15, $r0, #0
-	beqzs8  .Li48
-	subri   $r2, $r2, #0
-.Li48:
-	srli    $r3, $r3, #1
-	sltsi   $r15, $r1, #0
-	beqzs8  .Li49
-	subri   $r3, $r3, #0
-.Li49:
-	slts    $r15, $r2, $r3
-	beqzs8  .Li50
-	move    $r0, #-1
-	j       .LM999
-.Li50:
-	slts    $r15, $r3, $r2
-	beqzs8  .LMequ
-	move    $r0, #1
-	j       .LM999
-
-.LMequ:
-	move    $r0, #0
-
-.LM999:
-	pop     $lp
-	ret5    $lp
-
-.LMnan:
-	move    $r0, $r4
-	j       .LM999
-	.size   __cmpsf2, .-__cmpsf2
-	.size   __eqsf2, .-__eqsf2
-	.size   __ltsf2, .-__ltsf2
-	.size   __lesf2, .-__lesf2
-	.size   __nesf2, .-__nesf2
-	.size   __gesf2, .-__gesf2
-	.size   __gtsf2, .-__gtsf2
-#endif /* L_compare_sf */
-
-
-
-#ifdef L_compare_df
-
-#ifdef __big_endian__
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#else
-	#define P1H     $r1
-	#define P1L     $r0
-	#define P2H     $r3
-	#define P2L     $r2
-#endif
-	.align	2
-	.globl	__gtdf2
-	.globl	__gedf2
-	.globl	__ltdf2
-	.globl	__ledf2
-	.globl	__eqdf2
-	.globl	__nedf2
-	.globl	__cmpdf2
-	.type	__gtdf2, @function
-	.type	__gedf2, @function
-	.type	__ltdf2, @function
-	.type	__ledf2, @function
-	.type	__eqdf2, @function
-	.type	__nedf2, @function
-	.type	__cmpdf2, @function
-__gtdf2:
-__gedf2:
-	movi	$r4, -1
-	b	.L1
-
-__ltdf2:
-__ledf2:
-__cmpdf2:
-__nedf2:
-__eqdf2:
-	movi	$r4, 1
-.L1:
-#if defined (__NDS32_ISA_V3M__)
-	push25	$r10, 0
-#else
-	smw.adm	$r6, [$sp], $r9, 0
-#endif
-
-	sethi	$r5, 0x7ff00
-	and	$r6, P1H, $r5	! r6=aExp
-	and	$r7, P2H, $r5	! r7=bExp
-	slli	$r8, P1H, 12	! r8=aSig0
-	slli	$r9, P2H, 12	! r9=bSig0
-	beq	$r6, $r5, .L11	! aExp==0x7ff
-	beq	$r7, $r5, .L12	! bExp==0x7ff
-.L2:
-	slli	$ta, P1H, 1	! ta=ahigh<<1
-	or	$ta, P1L, $ta	!
-	xor	$r5, P1H, P2H	! r5=ahigh^bhigh
-	beqz	$ta, .L3	! if(ahigh<<1)==0,go .L3
-	!-------------------------------
-	! (ahigh<<1)!=0 || (bhigh<<1)!=0
-	!-------------------------------
-.L4:
-	beqz	$r5, .L5	! ahigh==bhigh, go .L5
-	!--------------------
-	! a != b
-	!--------------------
-.L6:
-	bltz	$r5, .L7	! if(aSign!=bSign), go .L7
-	!--------------------
-	! aSign==bSign
-	!--------------------
-	slt	$ta, $r6, $r7	! ta=(aExp<bExp)
-	bne	$r6, $r7, .L8	! if(aExp!=bExp),go .L8
-	slt	$ta, $r8, $r9	! ta=(aSig0<bSig0)
-	bne	$r8, $r9, .L8	! if(aSig0!=bSig0),go .L8
-	slt	$ta, P1L, P2L	! ta=(aSig1<bSig1)
-.L8:
-	beqz	$ta, .L10	! if(|a|>|b|), go .L10
-	nor	$r0, P2H, P2H	! if(|a|<|b|),return (~yh)
-.L14:
-#if defined (__NDS32_ISA_V3M__)
-	pop25	$r10, 0
-#else
-	lmw.bim	$r6, [$sp], $r9, 0
-	ret
-#endif
-.L10:
-	ori	$r0, P2H, 1	! return (yh|1)
-	b	.L14
-	!--------------------
-	! (ahigh<<1)=0
-	!--------------------
-.L3:
-	slli	$ta, P2H, 1	! ta=bhigh<<1
-	or	$ta, P2L, $ta	!
-	bnez	$ta, .L4	! ta=(bhigh<<1)!=0,go .L4
-.L5:
-	xor	$ta, P1L, P2L	! ta=alow^blow
-	bnez	$ta, .L6	! alow!=blow,go .L6
-	movi	$r0, 0		! a==b, return 0
-	b	.L14
-	!--------------------
-	! aExp=0x7ff;
-	!--------------------
-.L11:
-	or	P1L, P1L, $r8	! x1=(aSig0|aSig1)
-	bnez	P1L, .L13	! if(a=nan), go.L13
-	xor	$ta, $r7, $r5	! ta=(bExp^0x7ff)
-	bnez	$ta, .L2	! if(bExp!=0x7ff), go .L2
-	!--------------------
-	! bExp=0x7ff;
-	!--------------------
-.L12:
-	or	$ta, P2L, $r9	! ta=(bSig0|bSig1)
-	beqz	$ta, .L2	! if(b!=nan), go .L2
-.L13:
-	move	$r0, $r4
-	b	.L14
-	!--------------------
-	! aSign!=bSign
-	!--------------------
-.L7:
-	ori	$r0, P1H, 1	! if(aSign!=bSign), return (ahigh|1)
-	b	.L14
-
-	.size	__gtdf2, .-__gtdf2
-	.size	__gedf2, .-__gedf2
-	.size	__ltdf2, .-__ltdf2
-	.size	__ledf2, .-__ledf2
-	.size	__eqdf2, .-__eqdf2
-	.size	__nedf2, .-__nedf2
-	.size	__cmpdf2, .-__cmpdf2
-#endif /* L_compare_df */
-
-
-
-#ifdef L_unord_sf
-
-	.text
-	.align	2
-	.global	__unordsf2
-	.type	__unordsf2, @function
-__unordsf2:
-	push    $lp
-
-	slli    $r2, $r0, #1
-	move    $r3, #0xff000000
-	slt     $r15, $r3, $r2
-	beqzs8  .Li52
-	move    $r0, #1
-	j       .LP999
-.Li52:
-	slli    $r2, $r1, #1
-	move    $r3, #0xff000000
-	slt     $r15, $r3, $r2
-	beqzs8  .Li53
-	move    $r0, #1
-	j       .LP999
-.Li53:
-	move    $r0, #0
-
-.LP999:
-	pop     $lp
-	ret5    $lp
-	.size	__unordsf2, .-__unordsf2
-#endif /* L_unord_sf */
-
-
-
-#ifdef L_unord_df
-
-#ifndef __big_endian__
-	#define P1L     $r0
-	#define P1H     $r1
-	#define P2L     $r2
-	#define P2H     $r3
-#else
-	#define P1H     $r0
-	#define P1L     $r1
-	#define P2H     $r2
-	#define P2L     $r3
-#endif
-	.text
-	.align	2
-	.global	__unorddf2
-	.type	__unorddf2, @function
-__unorddf2:
-	push    $lp
-
-	slli    $r4, P1H, #1
-	beqz    P1L, .Li66
-	addi    $r4, $r4, #1
-.Li66:
-	move    $r5, #0xffe00000
-	slt     $r15, $r5, $r4
-	beqzs8  .Li67
-	move    $r0, #1
-	j       .LR999
-.Li67:
-	slli    $r4, P2H, #1
-	beqz    P2L, .Li68
-	addi    $r4, $r4, #1
-.Li68:
-	move    $r5, #0xffe00000
-	slt     $r15, $r5, $r4
-	beqzs8  .Li69
-	move    $r0, #1
-	j       .LR999
-.Li69:
-	move    $r0, #0
-
-.LR999:
-	pop     $lp
-	ret5    $lp
-	.size __unorddf2, .-__unorddf2
-#endif /* L_unord_df */
-/* ------------------------------------------- */
-/* DPBIT floating point operations for libgcc  */
-/* ------------------------------------------- */
diff --git a/libgcc/config/nds32/lib2csrc-mculib/_clzdi2.c b/libgcc/config/nds32/lib2csrc-mculib/_clzdi2.c
deleted file mode 100644
index 6afd6ab..0000000
--- a/libgcc/config/nds32/lib2csrc-mculib/_clzdi2.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* mculib libgcc routines of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-extern int __clzsi2 (int val);
-int
-__clzdi2 (long long val)
-{
-  if (val >> 32)
-    {
-      return __clzsi2 (val >> 32);
-    }
-  else
-    {
-      return __clzsi2 (val) + 32;
-    }
-}
diff --git a/libgcc/config/nds32/lib2csrc-mculib/_clzsi2.c b/libgcc/config/nds32/lib2csrc-mculib/_clzsi2.c
deleted file mode 100644
index 407caaf..0000000
--- a/libgcc/config/nds32/lib2csrc-mculib/_clzsi2.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* mculib libgcc routines of Andes NDS32 cpu for GNU compiler
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by Andes Technology Corporation.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-int
-__clzsi2 (int val)
-{
-  int i = 32;
-  int j = 16;
-  int temp;
-
-  for (; j; j >>= 1)
-    {
-      if (temp = val >> j)
-	{
-	  if (j == 1)
-	    {
-	      return (i - 2);
-	    }
-	  else
-	    {
-	      i -= j;
-	      val = temp;
-	    }
-	}
-    }
-  return (i - val);
-}
diff --git a/libgcc/config/nds32/linux-atomic.c b/libgcc/config/nds32/linux-atomic.c
new file mode 100644
index 0000000..69f589b
--- /dev/null
+++ b/libgcc/config/nds32/linux-atomic.c
@@ -0,0 +1,282 @@
+/* Linux-specific atomic operations for NDS32 Linux.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* We implement byte, short and int versions of each atomic operation
+   using the kernel helper defined below.  There is no support for
+   64-bit operations yet.  */
+
+/* This function is copied from the NDS32 Linux kernel.  */
+static inline int
+__kernel_cmpxchg (int oldval, int newval, int *mem)
+{
+  int temp1, temp2, temp3, offset;
+
+  asm volatile ("msync\tall\n"
+		"movi\t%0, #0\n"
+		"1:\n"
+		"\tllw\t%1, [%4+%0]\n"
+		"\tsub\t%3, %1, %6\n"
+		"\tcmovz\t%2, %5, %3\n"
+		"\tcmovn\t%2, %1, %3\n"
+		"\tscw\t%2, [%4+%0]\n"
+		"\tbeqz\t%2, 1b\n"
+		: "=&r" (offset), "=&r" (temp3), "=&r" (temp2), "=&r" (temp1)
+		: "r" (mem), "r" (newval), "r" (oldval) : "memory");
+
+  return temp1;	/* Loaded word minus OLDVAL: zero when they matched.  */
+}
+
+#define HIDDEN __attribute__ ((visibility ("hidden")))
+
+#ifdef __NDS32_EL__
+#define INVERT_MASK_1 0
+#define INVERT_MASK_2 0
+#else
+#define INVERT_MASK_1 24
+#define INVERT_MASK_2 16
+#endif
+
+#define MASK_1 0xffu
+#define MASK_2 0xffffu
+
+#define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP)				\
+  int HIDDEN								\
+  __sync_fetch_and_##OP##_4 (int *ptr, int val)				\
+  {									\
+    int failure, tmp;							\
+									\
+    do {								\
+      tmp = __atomic_load_n (ptr, __ATOMIC_SEQ_CST);			\
+      failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr);	\
+    } while (failure != 0);						\
+									\
+    return tmp;								\
+  }
+
+FETCH_AND_OP_WORD (add,   , +)
+FETCH_AND_OP_WORD (sub,   , -)
+FETCH_AND_OP_WORD (or,    , |)
+FETCH_AND_OP_WORD (and,   , &)
+FETCH_AND_OP_WORD (xor,   , ^)
+FETCH_AND_OP_WORD (nand, ~, &)
+
+#define NAME_oldval(OP, WIDTH) __sync_fetch_and_##OP##_##WIDTH
+#define NAME_newval(OP, WIDTH) __sync_##OP##_and_fetch_##WIDTH
+
+/* Implement both __sync_<op>_and_fetch and __sync_fetch_and_<op> for
+   subword-sized quantities.  */
+
+#define SUBWORD_SYNC_OP(OP, PFX_OP, INF_OP, TYPE, WIDTH, RETURN)	\
+  TYPE HIDDEN								\
+  NAME##_##RETURN (OP, WIDTH) (TYPE *ptr, TYPE val)			\
+  {									\
+    int *wordptr = (int *) ((unsigned long) ptr & ~3);			\
+    unsigned int mask, shift, oldval, newval;				\
+    int failure;							\
+									\
+    shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH;	\
+    mask = MASK_##WIDTH << shift;					\
+									\
+    do {								\
+      oldval = __atomic_load_n (wordptr, __ATOMIC_SEQ_CST);		\
+      newval = ((PFX_OP (((oldval & mask) >> shift)			\
+			 INF_OP (unsigned int) val)) << shift) & mask;	\
+      newval |= oldval & ~mask;						\
+      failure = __kernel_cmpxchg (oldval, newval, wordptr);		\
+    } while (failure != 0);						\
+									\
+    return (RETURN & mask) >> shift;					\
+  }
+
+
+SUBWORD_SYNC_OP (add,   , +, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (sub,   , -, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (or,    , |, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (and,   , &, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (xor,   , ^, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, oldval)
+
+SUBWORD_SYNC_OP (add,   , +, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (sub,   , -, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (or,    , |, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (and,   , &, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (xor,   , ^, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, oldval)
+
+#define OP_AND_FETCH_WORD(OP, PFX_OP, INF_OP)				\
+  int HIDDEN								\
+  __sync_##OP##_and_fetch_4 (int *ptr, int val)				\
+  {									\
+    int tmp, failure;							\
+									\
+    do {								\
+      tmp = __atomic_load_n (ptr, __ATOMIC_SEQ_CST);			\
+      failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr);	\
+    } while (failure != 0);						\
+									\
+    return PFX_OP (tmp INF_OP val);					\
+  }
+
+OP_AND_FETCH_WORD (add,   , +)
+OP_AND_FETCH_WORD (sub,   , -)
+OP_AND_FETCH_WORD (or,    , |)
+OP_AND_FETCH_WORD (and,   , &)
+OP_AND_FETCH_WORD (xor,   , ^)
+OP_AND_FETCH_WORD (nand, ~, &)
+
+SUBWORD_SYNC_OP (add,   , +, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (sub,   , -, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (or,    , |, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (and,   , &, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (xor,   , ^, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, newval)
+
+SUBWORD_SYNC_OP (add,   , +, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (sub,   , -, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (or,    , |, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (and,   , &, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (xor,   , ^, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, newval)
+
+int HIDDEN
+__sync_val_compare_and_swap_4 (int *ptr, int oldval, int newval)
+{
+  int actual_oldval, fail;
+
+  while (1)
+    {
+      actual_oldval = __atomic_load_n (ptr, __ATOMIC_SEQ_CST);
+
+      if (oldval != actual_oldval)
+	return actual_oldval;	/* No match: report the current value.  */
+
+      fail = __kernel_cmpxchg (actual_oldval, newval, ptr);
+
+      if (!fail)
+	return oldval;		/* Matched; otherwise reload and retry.  */
+    }
+}
+
+#define SUBWORD_VAL_CAS(TYPE, WIDTH)					\
+  TYPE HIDDEN								\
+  __sync_val_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval,		\
+				       TYPE newval)			\
+  {									\
+    int *wordptr = (int *)((unsigned long) ptr & ~3), fail;		\
+    unsigned int mask, shift, actual_oldval, actual_newval;		\
+									\
+    shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH;	\
+    mask = MASK_##WIDTH << shift;					\
+									\
+    while (1)								\
+      {									\
+	actual_oldval = __atomic_load_n (wordptr, __ATOMIC_SEQ_CST); 	\
+									\
+	if (((actual_oldval & mask) >> shift) != (unsigned int) oldval)	\
+	  return (actual_oldval & mask) >> shift;			\
+									\
+	actual_newval = (actual_oldval & ~mask)				\
+			| (((unsigned int) newval << shift) & mask);	\
+									\
+	fail = __kernel_cmpxchg (actual_oldval, actual_newval,		\
+				 wordptr);				\
+									\
+	if (!fail)							\
+	  return oldval;						\
+      }									\
+  }
+
+SUBWORD_VAL_CAS (unsigned short, 2)
+SUBWORD_VAL_CAS (unsigned char,  1)
+
+typedef unsigned char bool;
+
+bool HIDDEN
+__sync_bool_compare_and_swap_4 (int *ptr, int oldval, int newval)
+{
+  int failure = __kernel_cmpxchg (oldval, newval, ptr);
+  return (failure == 0);	/* Zero means *PTR matched OLDVAL.  */
+}
+
+#define SUBWORD_BOOL_CAS(TYPE, WIDTH)					\
+  bool HIDDEN								\
+  __sync_bool_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval,		\
+					TYPE newval)			\
+  {									\
+    TYPE actual_oldval							\
+      = __sync_val_compare_and_swap_##WIDTH (ptr, oldval, newval);	\
+    return (oldval == actual_oldval);					\
+  }
+
+SUBWORD_BOOL_CAS (unsigned short, 2)
+SUBWORD_BOOL_CAS (unsigned char,  1)
+
+int HIDDEN
+__sync_lock_test_and_set_4 (int *ptr, int val)
+{
+  int failure, oldval;
+
+  do {
+    oldval = __atomic_load_n (ptr, __ATOMIC_SEQ_CST);
+    failure = __kernel_cmpxchg (oldval, val, ptr);
+  } while (failure != 0);	/* Retry if *PTR changed after the load.  */
+
+  return oldval;
+}
+
+#define SUBWORD_TEST_AND_SET(TYPE, WIDTH)				\
+  TYPE HIDDEN								\
+  __sync_lock_test_and_set_##WIDTH (TYPE *ptr, TYPE val)		\
+  {									\
+    int failure;							\
+    unsigned int oldval, newval, shift, mask;				\
+    int *wordptr = (int *) ((unsigned long) ptr & ~3);			\
+									\
+    shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH;	\
+    mask = MASK_##WIDTH << shift;					\
+									\
+    do {								\
+      oldval = __atomic_load_n (wordptr, __ATOMIC_SEQ_CST);		\
+      newval = (oldval & ~mask)						\
+	       | (((unsigned int) val << shift) & mask);		\
+      failure = __kernel_cmpxchg (oldval, newval, wordptr);		\
+    } while (failure != 0);						\
+									\
+    return (oldval & mask) >> shift;					\
+  }
+
+SUBWORD_TEST_AND_SET (unsigned short, 2)
+SUBWORD_TEST_AND_SET (unsigned char,  1)
+
+#define SYNC_LOCK_RELEASE(TYPE, WIDTH)					\
+  void HIDDEN								\
+  __sync_lock_release_##WIDTH (TYPE *ptr)				\
+  {									\
+    /* All writes before this point must be seen before we release	\
+       the lock itself.  */						\
+    __builtin_nds32_msync_all ();					\
+    *ptr = 0;								\
+  }
+
+SYNC_LOCK_RELEASE (int,   4)
+SYNC_LOCK_RELEASE (short, 2)
+SYNC_LOCK_RELEASE (char,  1)
diff --git a/libgcc/config/nds32/linux-unwind.h b/libgcc/config/nds32/linux-unwind.h
new file mode 100644
index 0000000..921edf9
--- /dev/null
+++ b/libgcc/config/nds32/linux-unwind.h
@@ -0,0 +1,156 @@
+/* DWARF2 EH unwinding support for NDS32 Linux signal frame.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Contributed by Andes Technology Corporation.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef inhibit_libc
+
+/* Do code reading to identify a signal frame, and set the frame
+   state data appropriately.  See unwind-dw2.c for the structs.
+   The corresponding bits in the Linux kernel are in
+   arch/nds32/kernel/signal.c.  */
+
+#include <signal.h>
+#include <asm/unistd.h>
+
+/* Exactly the same layout as the kernel structures, unique names.  */
+
+/* arch/nds32/kernel/signal.c */
+struct _sigframe {
+  ucontext_t uc;	/* glibc >= 2.26 has no 'struct ucontext' tag.  */
+  unsigned long retcode;
+};
+
+struct _rt_sigframe {
+  siginfo_t info;
+  struct _sigframe sig;
+};
+#define SIGRETURN 0xeb0e0a64
+#define RT_SIGRETURN 0xab150a64
+
+#define MD_FALLBACK_FRAME_STATE_FOR nds32_fallback_frame_state
+
+/* This function is supposed to be invoked by uw_frame_state_for()
+   when there is no unwind data available.
+
+   Generally, given the _Unwind_Context CONTEXT for a stack frame,
+   we need to look up its caller and decode information into FS.
+   However, if the exception handling happens within a signal handler,
+   the return address of signal handler is a special module, which
+   contains signal return syscall and has no FDE in the .eh_frame section.
+   We need to implement MD_FALLBACK_FRAME_STATE_FOR so that we can
+   unwind through signal frames.  */
+static _Unwind_Reason_Code
+nds32_fallback_frame_state (struct _Unwind_Context *context,
+			    _Unwind_FrameState *fs)
+{
+  u_int32_t *pc = (u_int32_t *) context->ra;
+  struct sigcontext *sc_;
+  _Unwind_Ptr new_cfa;
+
+#ifdef __NDS32_EB__
+#error "Signal handler is not supported for force unwind."
+#endif
+
+  if ((_Unwind_Ptr) pc & 3)
+    return _URC_END_OF_STACK;
+
+  /* Check if we are going through a signal handler.
+     See arch/nds32/kernel/signal.c implementation.
+       SWI_SYS_SIGRETURN    -> (0xeb0e0a64)
+       SWI_SYS_RT_SIGRETURN -> (0xab150a64)
+     FIXME: Currently we only handle little endian (EL) case.  */
+  if (pc[0] == SIGRETURN)
+    {
+      /* Use the '_sigframe' memory address to locate the kernel's sigcontext.
+	 The sigcontext structure is in arch/nds32/include/asm/sigcontext.h.  */
+      struct _sigframe *rt_;
+      rt_ = context->cfa;
+      sc_ = &rt_->uc.uc_mcontext;
+    }
+  else if (pc[0] == RT_SIGRETURN)
+    {
+      /* Use the '_rt_sigframe' address to locate the kernel's sigcontext.  */
+      struct _rt_sigframe *rt_;
+      rt_ = context->cfa;
+      sc_ = &rt_->sig.uc.uc_mcontext;
+    }
+  else
+    return _URC_END_OF_STACK;
+
+  /* Update cfa from sigcontext.  */
+  new_cfa = (_Unwind_Ptr) sc_;
+  fs->regs.cfa_how = CFA_REG_OFFSET;
+  fs->regs.cfa_reg = STACK_POINTER_REGNUM;
+  fs->regs.cfa_offset = new_cfa - (_Unwind_Ptr) context->cfa;
+
+#define NDS32_PUT_FS_REG(NUM, NAME) \
+  (fs->regs.reg[NUM].how = REG_SAVED_OFFSET, \
+   fs->regs.reg[NUM].loc.offset = (_Unwind_Ptr) &(sc_->NAME) - new_cfa)
+
+  /* Restore all registers value.  */
+  NDS32_PUT_FS_REG (0, nds32_r0);
+  NDS32_PUT_FS_REG (1, nds32_r1);
+  NDS32_PUT_FS_REG (2, nds32_r2);
+  NDS32_PUT_FS_REG (3, nds32_r3);
+  NDS32_PUT_FS_REG (4, nds32_r4);
+  NDS32_PUT_FS_REG (5, nds32_r5);
+  NDS32_PUT_FS_REG (6, nds32_r6);
+  NDS32_PUT_FS_REG (7, nds32_r7);
+  NDS32_PUT_FS_REG (8, nds32_r8);
+  NDS32_PUT_FS_REG (9, nds32_r9);
+  NDS32_PUT_FS_REG (10, nds32_r10);
+  NDS32_PUT_FS_REG (11, nds32_r11);
+  NDS32_PUT_FS_REG (12, nds32_r12);
+  NDS32_PUT_FS_REG (13, nds32_r13);
+  NDS32_PUT_FS_REG (14, nds32_r14);
+  NDS32_PUT_FS_REG (15, nds32_r15);
+  NDS32_PUT_FS_REG (16, nds32_r16);
+  NDS32_PUT_FS_REG (17, nds32_r17);
+  NDS32_PUT_FS_REG (18, nds32_r18);
+  NDS32_PUT_FS_REG (19, nds32_r19);
+  NDS32_PUT_FS_REG (20, nds32_r20);
+  NDS32_PUT_FS_REG (21, nds32_r21);
+  NDS32_PUT_FS_REG (22, nds32_r22);
+  NDS32_PUT_FS_REG (23, nds32_r23);
+  NDS32_PUT_FS_REG (24, nds32_r24);
+  NDS32_PUT_FS_REG (25, nds32_r25);
+
+  NDS32_PUT_FS_REG (28, nds32_fp);
+  NDS32_PUT_FS_REG (29, nds32_gp);
+  NDS32_PUT_FS_REG (30, nds32_lp);
+  NDS32_PUT_FS_REG (31, nds32_sp);
+
+  /* Restore PC, which points to the instruction that triggered the signal.  */
+  NDS32_PUT_FS_REG (32, nds32_ipc);
+
+#undef NDS32_PUT_FS_REG
+
+  /* The retaddr is PC, use PC to find FDE.  */
+  fs->retaddr_column = 32;
+  fs->signal_frame = 1;
+
+  return _URC_NO_REASON;
+}
+
+#endif
diff --git a/libgcc/config/nds32/sfp-machine.h b/libgcc/config/nds32/sfp-machine.h
index d822898..930a32e 100644
--- a/libgcc/config/nds32/sfp-machine.h
+++ b/libgcc/config/nds32/sfp-machine.h
@@ -76,6 +76,25 @@ typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
     R##_c = FP_CLS_NAN;						\
   } while (0)

+#ifdef NDS32_ABI_2FP_PLUS
+#define FP_RND_NEAREST		0x0
+#define FP_RND_PINF		0x1
+#define FP_RND_MINF		0x2
+#define FP_RND_ZERO		0x3
+#define FP_RND_MASK		0x3
+
+#define _FP_DECL_EX \
+  unsigned long int _fcsr __attribute__ ((unused)) = FP_RND_NEAREST
+
+#define FP_INIT_ROUNDMODE			\
+  do {						\
+    _fcsr = __builtin_nds32_fmfcsr ();		\
+  } while (0)
+
+#define FP_ROUNDMODE (_fcsr & FP_RND_MASK)
+
+#endif
+
 /* Not checked.  */
 #define _FP_TININESS_AFTER_ROUNDING 0

diff --git a/libgcc/config/nds32/t-nds32 b/libgcc/config/nds32/t-nds32
index 20c8a3f..4e58b1b 100644
--- a/libgcc/config/nds32/t-nds32
+++ b/libgcc/config/nds32/t-nds32
@@ -26,33 +26,22 @@
 #   Make sure the linker script include these two objects
 #   for building .ctors/.dtors sections.

-# Use -DCRT_BEGIN to create beginning parts of .init and .fini content
-# Make sure you are building crtbegin1.o with -O0 optimization,
-# otherwise the static function will be optimized out
+# Use -DCRT_BEGIN to create beginning parts of .init and .fini content.
 crtbegin1.o: $(srcdir)/config/nds32/initfini.c $(GCC_PASSES) $(CONFIG_H)
 	$(GCC_FOR_TARGET) $(INCLUDES) \
 	$(CFLAGS) \
 	-DCRT_BEGIN \
 	-finhibit-size-directive -fno-inline-functions \
-	-O0 -c $(srcdir)/config/nds32/initfini.c -o crtbegin1.o
+	-fno-toplevel-reorder \
+	-Os -c $(srcdir)/config/nds32/initfini.c -o crtbegin1.o

-# Use -DCRT_END to create ending parts of .init and .fini content
-# Make sure you are building crtend1.o with -O0 optimization,
-# otherwise the static function will be optimized out
+# Use -DCRT_END to create ending parts of .init and .fini content.
 crtend1.o: $(srcdir)/config/nds32/initfini.c $(GCC_PASSES) $(CONFIG_H)
 	$(GCC_FOR_TARGET) $(INCLUDES) \
 	$(CFLAGS) \
 	-DCRT_END \
 	-finhibit-size-directive -fno-inline-functions \
-	-O0 -c $(srcdir)/config/nds32/initfini.c -o crtend1.o
-
-# Use this rule if and only if your crt0.o does not come from library
-# Also, be sure to add 'crtzero.o' in extra_parts in libgcc/config.host
-# and change STARTFILE_SPEC in nds32.h
-#
-#crtzero.o: $(srcdir)/config/nds32/crtzero.S $(GCC_PASSES) $(CONFIG_H)
-#	$(GCC_FOR_TARGET) $(INCLUDES) \
-#	-c $(srcdir)/config/nds32/crtzero.S -o crtzero.o
-
+	-fno-toplevel-reorder \
+	-Os -c $(srcdir)/config/nds32/initfini.c -o crtend1.o

 # ------------------------------------------------------------------------
diff --git a/libgcc/config/nds32/t-nds32-mculib b/libgcc/config/nds32/t-nds32-glibc
similarity index 50%
rename from libgcc/config/nds32/t-nds32-mculib
rename to libgcc/config/nds32/t-nds32-glibc
index b4f7b4c..385644b 100644
--- a/libgcc/config/nds32/t-nds32-mculib
+++ b/libgcc/config/nds32/t-nds32-glibc
@@ -1,4 +1,4 @@
-# Rules of mculib library makefile of Andes NDS32 cpu for GNU compiler
+# Rules of glibc library makefile of Andes NDS32 cpu for GNU compiler
 # Copyright (C) 2012-2016 Free Software Foundation, Inc.
 # Contributed by Andes Technology Corporation.
 #
@@ -19,59 +19,16 @@
 # <http://www.gnu.org/licenses/>.

 # Compiler flags to use when compiling 'libgcc2.c'
-HOST_LIBGCC2_CFLAGS = -Os
+HOST_LIBGCC2_CFLAGS = -O2 -fPIC -fwrapv
+LIB2ADD += $(srcdir)/config/nds32/linux-atomic.c

-
-LIB1ASMSRC   = nds32/lib1asmsrc-mculib.S
-
-LIB1ASMFUNCS =   \
-	_addsub_sf   \
-	_sf_to_si    \
-	_divsi3      \
-	_divdi3      \
-	_modsi3      \
-	_moddi3      \
-	_mulsi3      \
-	_udivsi3     \
-	_udivdi3     \
-	_udivmoddi4  \
-	_umodsi3     \
-	_umoddi3     \
-	_muldi3      \
-	_addsub_df   \
-	_mul_sf      \
-	_mul_df      \
-	_div_sf      \
-	_div_df      \
-	_negate_sf   \
-	_negate_df   \
-	_sf_to_df    \
-	_df_to_sf    \
-	_df_to_si    \
-	_fixsfdi     \
-	_fixdfdi     \
-	_fixunssfsi  \
-	_fixunsdfsi  \
-	_fixunssfdi  \
-	_fixunsdfdi  \
-	_si_to_sf    \
-	_si_to_df    \
-	_floatdisf   \
-	_floatdidf   \
-	_floatunsisf \
-	_floatunsidf \
-	_floatundisf \
-	_floatundidf \
-	_compare_sf  \
-	_compare_df  \
-	_unord_sf    \
-	_unord_df
+#LIB1ASMSRC   = nds32/lib1asmsrc-newlib.S
+#LIB1ASMFUNCS = _divsi3 _modsi3 _udivsi3 _umodsi3

 # List of functions not to build from libgcc2.c.
-LIB2FUNCS_EXCLUDE = _clzsi2 _clzdi2
+#LIB2FUNCS_EXCLUDE = _clzsi2

 # List of extra C and assembler files(*.S) to add to static libgcc2.
-LIB2ADD_ST += $(srcdir)/config/nds32/lib2csrc-mculib/_clzsi2.c
-LIB2ADD_ST += $(srcdir)/config/nds32/lib2csrc-mculib/_clzdi2.c
+#LIB2ADD_ST += $(srcdir)/config/nds32/lib2csrc-newlib/_clzsi2.c

 # ------------------------------------------------------------------------
diff --git a/libgcc/config/nds32/t-nds32-isr b/libgcc/config/nds32/t-nds32-isr
index 62b6867..6493838 100644
--- a/libgcc/config/nds32/t-nds32-isr
+++ b/libgcc/config/nds32/t-nds32-isr
@@ -23,11 +23,15 @@
 # Makfile fragment rules for libnds32_isr.a to support ISR attribute extension
 ###############################################################################

-# basic flags setting
-ISR_CFLAGS = $(CFLAGS) -c
-
-# the object files we would like to create
-LIBNDS32_ISR_16B_OBJS = \
+# Basic flags setting.
+ifneq ($(filter -mext-dsp,$(CFLAGS)),)
+ISR_CFLAGS = $(CFLAGS) -mno-force-no-ext-zol -mext-zol -c
+else
+ISR_CFLAGS = $(CFLAGS) -mno-force-no-ext-zol -c
+endif
+
+# The object files we would like to create.
+LIBNDS32_ISR_VEC_OBJS = \
 		vec_vid00.o vec_vid01.o vec_vid02.o vec_vid03.o \
 		vec_vid04.o vec_vid05.o vec_vid06.o vec_vid07.o \
 		vec_vid08.o vec_vid09.o vec_vid10.o vec_vid11.o \
@@ -46,40 +50,9 @@ LIBNDS32_ISR_16B_OBJS = \
 		vec_vid60.o vec_vid61.o vec_vid62.o vec_vid63.o \
 		vec_vid64.o vec_vid65.o vec_vid66.o vec_vid67.o \
 		vec_vid68.o vec_vid69.o vec_vid70.o vec_vid71.o \
-		vec_vid72.o \
-		excp_isr_ps_nn.o excp_isr_ps_ns.o excp_isr_ps_nr.o \
-		excp_isr_sa_nn.o excp_isr_sa_ns.o excp_isr_sa_nr.o \
-		intr_isr_ps_nn.o intr_isr_ps_ns.o intr_isr_ps_nr.o \
-		intr_isr_sa_nn.o intr_isr_sa_ns.o intr_isr_sa_nr.o \
-		reset.o
-
-LIBNDS32_ISR_4B_OBJS = \
-		vec_vid00_4b.o vec_vid01_4b.o vec_vid02_4b.o vec_vid03_4b.o \
-		vec_vid04_4b.o vec_vid05_4b.o vec_vid06_4b.o vec_vid07_4b.o \
-		vec_vid08_4b.o vec_vid09_4b.o vec_vid10_4b.o vec_vid11_4b.o \
-		vec_vid12_4b.o vec_vid13_4b.o vec_vid14_4b.o vec_vid15_4b.o \
-		vec_vid16_4b.o vec_vid17_4b.o vec_vid18_4b.o vec_vid19_4b.o \
-		vec_vid20_4b.o vec_vid21_4b.o vec_vid22_4b.o vec_vid23_4b.o \
-		vec_vid24_4b.o vec_vid25_4b.o vec_vid26_4b.o vec_vid27_4b.o \
-		vec_vid28_4b.o vec_vid29_4b.o vec_vid30_4b.o vec_vid31_4b.o \
-		vec_vid32_4b.o vec_vid33_4b.o vec_vid34_4b.o vec_vid35_4b.o \
-		vec_vid36_4b.o vec_vid37_4b.o vec_vid38_4b.o vec_vid39_4b.o \
-		vec_vid40_4b.o vec_vid41_4b.o vec_vid42_4b.o vec_vid43_4b.o \
-		vec_vid44_4b.o vec_vid45_4b.o vec_vid46_4b.o vec_vid47_4b.o \
-		vec_vid48_4b.o vec_vid49_4b.o vec_vid50_4b.o vec_vid51_4b.o \
-		vec_vid52_4b.o vec_vid53_4b.o vec_vid54_4b.o vec_vid55_4b.o \
-		vec_vid56_4b.o vec_vid57_4b.o vec_vid58_4b.o vec_vid59_4b.o \
-		vec_vid60_4b.o vec_vid61_4b.o vec_vid62_4b.o vec_vid63_4b.o \
-		vec_vid64_4b.o vec_vid65_4b.o vec_vid66_4b.o vec_vid67_4b.o \
-		vec_vid68_4b.o vec_vid69_4b.o vec_vid70_4b.o vec_vid71_4b.o \
-		vec_vid72_4b.o \
-		excp_isr_ps_nn_4b.o excp_isr_ps_ns_4b.o excp_isr_ps_nr_4b.o \
-		excp_isr_sa_nn_4b.o excp_isr_sa_ns_4b.o excp_isr_sa_nr_4b.o \
-		intr_isr_ps_nn_4b.o intr_isr_ps_ns_4b.o intr_isr_ps_nr_4b.o \
-		intr_isr_sa_nn_4b.o intr_isr_sa_ns_4b.o intr_isr_sa_nr_4b.o \
-		reset_4b.o
+		vec_vid72.o

-LIBNDS32_ISR_COMMON_OBJS = \
+LIBNDS32_ISR_JMP_OBJS = \
 		jmptbl_vid00.o jmptbl_vid01.o jmptbl_vid02.o jmptbl_vid03.o \
 		jmptbl_vid04.o jmptbl_vid05.o jmptbl_vid06.o jmptbl_vid07.o \
 		jmptbl_vid08.o jmptbl_vid09.o jmptbl_vid10.o jmptbl_vid11.o \
@@ -98,29 +71,32 @@ LIBNDS32_ISR_COMMON_OBJS = \
 		jmptbl_vid60.o jmptbl_vid61.o jmptbl_vid62.o jmptbl_vid63.o \
 		jmptbl_vid64.o jmptbl_vid65.o jmptbl_vid66.o jmptbl_vid67.o \
 		jmptbl_vid68.o jmptbl_vid69.o jmptbl_vid70.o jmptbl_vid71.o \
-		jmptbl_vid72.o \
+		jmptbl_vid72.o
+
+LIBNDS32_ISR_COMMON_OBJS = \
+		excp_isr_ps_nn.o excp_isr_ps_ns.o excp_isr_ps_nr.o \
+		excp_isr_sa_nn.o excp_isr_sa_ns.o excp_isr_sa_nr.o \
+		intr_isr_ps_nn.o intr_isr_ps_ns.o intr_isr_ps_nr.o \
+		intr_isr_sa_nn.o intr_isr_sa_ns.o intr_isr_sa_nr.o \
+		reset.o \
 		nmih.o \
 		wrh.o

-LIBNDS32_ISR_COMPLETE_OBJS = $(LIBNDS32_ISR_16B_OBJS) $(LIBNDS32_ISR_4B_OBJS) $(LIBNDS32_ISR_COMMON_OBJS)
-
+LIBNDS32_ISR_COMPLETE_OBJS = $(LIBNDS32_ISR_VEC_OBJS) $(LIBNDS32_ISR_JMP_OBJS) $(LIBNDS32_ISR_COMMON_OBJS)

-# Build common objects for ISR library
-nmih.o: $(srcdir)/config/nds32/isr-library/nmih.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/nmih.S -o nmih.o

-wrh.o: $(srcdir)/config/nds32/isr-library/wrh.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/wrh.S -o wrh.o

-jmptbl_vid%.o: $(srcdir)/config/nds32/isr-library/jmptbl_vid%.S
+# Build vector vid objects for ISR library.
+vec_vid%.o: $(srcdir)/config/nds32/isr-library/vec_vid%.S
 	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $< -o $@


-
-# Build 16b version objects for ISR library. (no "_4b" postfix string)
-vec_vid%.o: $(srcdir)/config/nds32/isr-library/vec_vid%.S
+# Build jump table objects for ISR library.
+jmptbl_vid%.o: $(srcdir)/config/nds32/isr-library/jmptbl_vid%.S
 	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $< -o $@

+
+# Build common objects for ISR library.
 excp_isr_ps_nn.o: $(srcdir)/config/nds32/isr-library/excp_isr.S
 	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/excp_isr.S -o excp_isr_ps_nn.o

@@ -160,48 +136,12 @@ intr_isr_sa_nr.o: $(srcdir)/config/nds32/isr-library/intr_isr.S
 reset.o: $(srcdir)/config/nds32/isr-library/reset.S
 	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/reset.S -o reset.o

-# Build 4b version objects for ISR library.
-vec_vid%_4b.o: $(srcdir)/config/nds32/isr-library/vec_vid%_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $< -o $@
-
-excp_isr_ps_nn_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_ps_nn_4b.o
-
-excp_isr_ps_ns_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_NESTED $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_ps_ns_4b.o
-
-excp_isr_ps_nr_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_NESTED_READY $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_ps_nr_4b.o
-
-excp_isr_sa_nn_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_sa_nn_4b.o
-
-excp_isr_sa_ns_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS -DNDS32_NESTED $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_sa_ns_4b.o
-
-excp_isr_sa_nr_4b.o: $(srcdir)/config/nds32/isr-library/excp_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS -DNDS32_NESTED_READY $(srcdir)/config/nds32/isr-library/excp_isr_4b.S -o excp_isr_sa_nr_4b.o
-
-intr_isr_ps_nn_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_ps_nn_4b.o
-
-intr_isr_ps_ns_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_NESTED $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_ps_ns_4b.o
-
-intr_isr_ps_nr_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_NESTED_READY $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_ps_nr_4b.o
-
-intr_isr_sa_nn_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_sa_nn_4b.o
-
-intr_isr_sa_ns_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS -DNDS32_NESTED $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_sa_ns_4b.o
+nmih.o: $(srcdir)/config/nds32/isr-library/nmih.S
+	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/nmih.S -o nmih.o

-intr_isr_sa_nr_4b.o: $(srcdir)/config/nds32/isr-library/intr_isr_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) -DNDS32_SAVE_ALL_REGS -DNDS32_NESTED_READY $(srcdir)/config/nds32/isr-library/intr_isr_4b.S -o intr_isr_sa_nr_4b.o
+wrh.o: $(srcdir)/config/nds32/isr-library/wrh.S
+	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/wrh.S -o wrh.o

-reset_4b.o: $(srcdir)/config/nds32/isr-library/reset_4b.S
-	$(GCC_FOR_TARGET) $(ISR_CFLAGS) $(srcdir)/config/nds32/isr-library/reset_4b.S -o reset_4b.o


 # The rule to create libnds32_isr.a file
diff --git a/libgcc/config/nds32/t-nds32-newlib b/libgcc/config/nds32/t-nds32-newlib
index e4af03e..c356b60 100644
--- a/libgcc/config/nds32/t-nds32-newlib
+++ b/libgcc/config/nds32/t-nds32-newlib
@@ -19,7 +19,7 @@
 # <http://www.gnu.org/licenses/>.

 # Compiler flags to use when compiling 'libgcc2.c'
-HOST_LIBGCC2_CFLAGS = -O2
+HOST_LIBGCC2_CFLAGS = -O2 -fwrapv


 #LIB1ASMSRC   = nds32/lib1asmsrc-newlib.S