From 32ce9e7f75bd97e38bc41b2033b688fc17094d75 Mon Sep 17 00:00:00 2001
From: lihangtian <936971274@qq.com>
Date: Thu, 28 Jul 2022 17:10:26 +0800
Subject: [PATCH 1/8] [ModelZoo] Support Co-Action Network

---
 modelzoo/CAN/prepare_data.sh           |    9 +
 modelzoo/CAN/script/Dice.py            |   35 +
 modelzoo/CAN/script/calc_ckpt.py       |   14 +
 modelzoo/CAN/script/data_iterator.py   |  228 ++++
 modelzoo/CAN/script/generate_voc.py    |   91 ++
 modelzoo/CAN/script/generate_voc.py.bk |   65 ++
 modelzoo/CAN/script/local_aggretor.py  |   46 +
 modelzoo/CAN/script/model.py           |  800 +++++++++++++
 modelzoo/CAN/script/model_avazu.py     |  973 ++++++++++++++++
 modelzoo/CAN/script/process_data.py    |  101 ++
 modelzoo/CAN/script/rnn.py             | 1454 ++++++++++++++++++++++++
 modelzoo/CAN/script/shuffle.py         |   42 +
 modelzoo/CAN/script/split_by_user.py   |   20 +
 modelzoo/CAN/script/test.py            |   10 +
 modelzoo/CAN/script/train.py           |  293 +++++
 modelzoo/CAN/script/utils.py           |  404 +++++++
 16 files changed, 4585 insertions(+)
 create mode 100644 modelzoo/CAN/prepare_data.sh
 create mode 100644 modelzoo/CAN/script/Dice.py
 create mode 100644 modelzoo/CAN/script/calc_ckpt.py
 create mode 100644 modelzoo/CAN/script/data_iterator.py
 create mode 100644 modelzoo/CAN/script/generate_voc.py
 create mode 100644 modelzoo/CAN/script/generate_voc.py.bk
 create mode 100644 modelzoo/CAN/script/local_aggretor.py
 create mode 100644 modelzoo/CAN/script/model.py
 create mode 100644 modelzoo/CAN/script/model_avazu.py
 create mode 100644 modelzoo/CAN/script/process_data.py
 create mode 100644 modelzoo/CAN/script/rnn.py
 create mode 100644 modelzoo/CAN/script/shuffle.py
 create mode 100644 modelzoo/CAN/script/split_by_user.py
 create mode 100644 modelzoo/CAN/script/test.py
 create mode 100644 modelzoo/CAN/script/train.py
 create mode 100644 modelzoo/CAN/script/utils.py

diff --git a/modelzoo/CAN/prepare_data.sh b/modelzoo/CAN/prepare_data.sh
new file mode 100644
index 00000000000..110b9559129
--- /dev/null
+++ b/modelzoo/CAN/prepare_data.sh
@@ -0,0 +1,9 @@
+export PATH="~/anaconda4/bin:$PATH"
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books.json.gz
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Books.json.gz
+gunzip reviews_Books.json.gz
+gunzip meta_Books.json.gz
+python script/process_data.py meta_Books.json reviews_Books.json
+python script/local_aggretor.py
+python script/split_by_user.py
+python script/generate_voc.py
diff --git a/modelzoo/CAN/script/Dice.py b/modelzoo/CAN/script/Dice.py
new file mode 100644
index 00000000000..160fb3d909e
--- /dev/null
+++ b/modelzoo/CAN/script/Dice.py
@@ -0,0 +1,35 @@
+import tensorflow as tf
+
+def dice(_x, axis=-1, epsilon=0.000000001, name=''):
+    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
+        alphas = tf.get_variable('alpha'+name, _x.get_shape()[-1],
+                                 initializer=tf.constant_initializer(0.0),
+                                 dtype=tf.float32)
+        input_shape = list(_x.get_shape())
+
+        reduction_axes = list(range(len(input_shape)))
+        del reduction_axes[axis]
+        broadcast_shape = [1] * len(input_shape)
+        broadcast_shape[axis] = input_shape[axis]
+
+        # case: train mode (uses stats of the current batch)
+        mean = tf.reduce_mean(_x, axis=reduction_axes)
+        broadcast_mean = tf.reshape(mean, broadcast_shape)
+        std = tf.reduce_mean(tf.square(_x - broadcast_mean) + epsilon, axis=reduction_axes)
+        std = tf.sqrt(std)
+        broadcast_std = tf.reshape(std, broadcast_shape)
+        x_normed = (_x - broadcast_mean) / (broadcast_std + epsilon)
+        # x_normed = tf.layers.batch_normalization(_x, center=False, scale=False)
+        x_p = tf.sigmoid(x_normed)
+
+
+    return alphas * (1.0 - x_p) * _x + x_p * _x
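+
+# Usage sketch (illustrative): dice is meant for pre-activation dense outputs,
+# mirroring how build_fcn_net in script/model.py calls it:
+#   dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
+#   dnn1 = dice(dnn1, name='dice_1')   # learns one alpha per output unit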
+
+def parametric_relu(_x):
+    alphas = tf.get_variable('alpha', _x.get_shape()[-1],
+                             initializer=tf.constant_initializer(0.0),
+                             dtype=tf.float32)
+    pos = tf.nn.relu(_x)
+    neg = alphas * (_x - abs(_x)) * 0.5
+
+    return pos + neg
diff --git a/modelzoo/CAN/script/calc_ckpt.py b/modelzoo/CAN/script/calc_ckpt.py
new file mode 100644
index 00000000000..fa5d4bda035
--- /dev/null
+++ b/modelzoo/CAN/script/calc_ckpt.py
@@ -0,0 +1,14 @@
+import tensorflow as tf
+ckpt = tf.train.get_checkpoint_state("./ckpt_path/").model_checkpoint_path
+saver = tf.train.import_meta_graph(ckpt+'.meta')
+variables = tf.trainable_variables()
+total_parameters = 0
+for variable in variables:
+    shape = variable.get_shape()
+    variable_parameters = 1
+    for dim in shape:
+        # print(dim)
+        variable_parameters *= dim.value
+    # print(variable_parameters)
+    total_parameters += variable_parameters
+print(total_parameters)
diff --git a/modelzoo/CAN/script/data_iterator.py b/modelzoo/CAN/script/data_iterator.py
new file mode 100644
index 00000000000..b5eef5f9e57
--- /dev/null
+++ b/modelzoo/CAN/script/data_iterator.py
@@ -0,0 +1,228 @@
+import numpy
+import json
+#import cPickle as pkl
+import _pickle as cPickle
+import random
+
+import gzip
+
+import shuffle
+
+def unicode_to_utf8(d):
+    return dict((key.encode("UTF-8"), value) for (key,value) in d.items())
+def dict_unicode_to_utf8(d):
+    print('d={}'.format(d))
+    return dict(((key[0].encode("UTF-8"), key[1].encode("UTF-8")), value) for (key,value) in d.items())
+
+def load_dict(filename):
+    try:
+        with open(filename, 'rb') as f:
+            return unicode_to_utf8(json.load(f))
+    except:
+        try:
+            with open(filename, 'rb') as f:
+                return unicode_to_utf8(cPickle.load(f))
+        except:
+            with open(filename, 'rb') as f:
+                return dict_unicode_to_utf8(cPickle.load(f))
+
+
+def fopen(filename, mode='r'):
+    if filename.endswith('.gz'):
+        return gzip.open(filename, mode)
+    return open(filename, mode)
+
+
+class DataIterator:
+
+    def __init__(self, source,
+                 uid_voc,
+                 mid_voc,
+                 cat_voc,
+                 batch_size=128,
+                 maxlen=100,
+                 skip_empty=False,
+                 shuffle_each_epoch=False,
+                 sort_by_length=True,
+                 max_batch_size=20,
+                 minlen=None,
+                 label_type=1):
+        if shuffle_each_epoch:
+            self.source_orig = source
+            self.source = shuffle.main(self.source_orig, temporary=True)
+        else:
+            self.source = fopen(source, 'r')
+        self.source_dicts = []
+        #for source_dict in [uid_voc, mid_voc, cat_voc, cat_voc, cat_voc]:# 'item_carte_voc.pkl', 'cate_carte_voc.pkl']:
+        for source_dict in [uid_voc, mid_voc, cat_voc, '/home/test/modelzoo/CAN/data/item_carte_voc.pkl', '/home/test/modelzoo/CAN/data/cate_carte_voc.pkl']:
+            self.source_dicts.append(load_dict(source_dict))
+
+        f_meta = open("/home/test/modelzoo/CAN/data/item-info", "r")
+        meta_map = {}
+        for line in f_meta:
+            arr = line.strip().split("\t")
+            if arr[0] not in meta_map:
+                meta_map[arr[0]] = arr[1]
+        self.meta_id_map ={}
+        for key in meta_map:
+            val = meta_map[key]
+            if key in self.source_dicts[1]:
+                mid_idx = self.source_dicts[1][key]
+            else:
+                mid_idx = 0
+            if val in self.source_dicts[2]:
+                cat_idx = self.source_dicts[2][val]
+            else:
+                cat_idx = 0
+            self.meta_id_map[mid_idx] = cat_idx
+
+        f_review = open("/home/test/modelzoo/CAN/data/reviews-info", "r")
+        self.mid_list_for_random = []
+        for line in f_review:
+            arr = line.strip().split("\t")
+            tmp_idx = 0
+            if arr[1] in self.source_dicts[1]:
+                tmp_idx = self.source_dicts[1][arr[1]]
+            self.mid_list_for_random.append(tmp_idx)
+
+        self.batch_size = batch_size
+        self.maxlen = maxlen
+        self.minlen = minlen
+        self.skip_empty = skip_empty
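+        # vocabulary sizes: uid/mid/cat plus the two cartesian ("carte") vocabularies,
+        # read back by the training script through get_n() below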
+
+        self.n_uid = len(self.source_dicts[0])
+        self.n_mid = len(self.source_dicts[1])
+        self.n_cat = len(self.source_dicts[2])
+        self.n_carte = [len(self.source_dicts[3]), len(self.source_dicts[4])]
+        print("n_uid=%d, n_mid=%d, n_cat=%d" % (self.n_uid, self.n_mid, self.n_cat))
+
+        self.shuffle = shuffle_each_epoch
+        self.sort_by_length = sort_by_length
+
+        self.source_buffer = []
+        self.k = batch_size * max_batch_size
+
+        self.end_of_data = False
+        self.label_type = label_type
+
+    def get_n(self):
+        return self.n_uid, self.n_mid, self.n_cat, self.n_carte
+
+    def __iter__(self):
+        return self
+
+    def reset(self):
+        if self.shuffle:
+            self.source = shuffle.main(self.source_orig, temporary=True)
+        else:
+            self.source.seek(0)
+
+    def __next__(self):
+        if self.end_of_data:
+            self.end_of_data = False
+            self.reset()
+            raise StopIteration
+
+        source = []
+        target = []
+
+        if len(self.source_buffer) == 0:
+            for k_ in range(self.k):
+                ss = self.source.readline()
+                if ss == "":
+                    break
+                self.source_buffer.append(ss.strip("\n").split("\t"))
+
+            # sort by history behavior length
+            if self.sort_by_length:
+                # behavior fields are joined with the '\x02' separator by the
+                # preprocessing scripts (see local_aggretor.py)
+                his_length = numpy.array([len(s[4].split("\x02")) for s in self.source_buffer])
+                tidx = his_length.argsort()
+
+                _sbuf = [self.source_buffer[i] for i in tidx]
+                self.source_buffer = _sbuf
+            else:
+                self.source_buffer.reverse()
+
+        if len(self.source_buffer) == 0:
+            self.end_of_data = False
+            self.reset()
+            raise StopIteration
+
+        try:
+
+            # actual work here
+            while True:
+
+                # read from source file and map to word index
+                try:
+                    ss = self.source_buffer.pop()
+                except IndexError:
+                    break
+
+                uid = self.source_dicts[0][ss[1]] if ss[1] in self.source_dicts[0] else 0
+                mid = self.source_dicts[1][ss[2]] if ss[2] in self.source_dicts[1] else 0
+                cat = self.source_dicts[2][ss[3]] if ss[3] in self.source_dicts[2] else 0
+
+                tmp = []
+                item_carte = []
+                for fea in ss[4].split("\x02"):
+                    m = self.source_dicts[1][fea] if fea in self.source_dicts[1] else 0
+                    tmp.append(m)
+                    i_c = self.source_dicts[3][(ss[2], fea)] if (ss[2], fea) in self.source_dicts[3] else 0
+                    item_carte.append(i_c)
+                mid_list = tmp
+
+                tmp1 = []
+                cate_carte = []
+                for fea in ss[5].split("\x02"):
+                    c = self.source_dicts[2][fea] if fea in self.source_dicts[2] else 0
+                    tmp1.append(c)
+                    c_c = self.source_dicts[4][(ss[3], fea)] if (ss[3], fea) in self.source_dicts[4] else 0
+                    cate_carte.append(c_c)
+                cat_list = tmp1
+
+                # read from source file and map to word index
+
+                if self.minlen is not None:
+                    if len(mid_list) <= self.minlen:
+                        continue
+                if self.skip_empty and (not mid_list):
+                    continue
+
+                noclk_mid_list = []
+                noclk_cat_list = []
+                for pos_mid in mid_list:
+                    noclk_tmp_mid = []
+                    noclk_tmp_cat = []
+                    noclk_index = 0
+                    while True:
+                        noclk_mid_indx = random.randint(0, len(self.mid_list_for_random)-1)
+                        noclk_mid = self.mid_list_for_random[noclk_mid_indx]
+                        if noclk_mid == pos_mid:
+                            continue
+                        noclk_tmp_mid.append(noclk_mid)
+                        noclk_tmp_cat.append(self.meta_id_map[noclk_mid])
+                        noclk_index += 1
+                        if noclk_index >= 5:
+                            break
+                    noclk_mid_list.append(noclk_tmp_mid)
+                    noclk_cat_list.append(noclk_tmp_cat)
+                carte_list = [item_carte, cate_carte]
+                source.append([uid, mid, cat, mid_list, cat_list, noclk_mid_list, noclk_cat_list, carte_list])
+                if self.label_type == 1:
+                    target.append([float(ss[0])])
+                else:
+                    target.append([float(ss[0]), 1-float(ss[0])])
+
+                if len(source) >= self.batch_size or len(target) >= self.batch_size:
+                    break
+        except IOError:
+            self.end_of_data = True
+
+        # all sentence pairs in maxibatch filtered out because of length
+        if len(source) == 0 or len(target) == 0:
+            source, target = self.__next__()
+
+        return source, target
+
+
diff --git a/modelzoo/CAN/script/generate_voc.py b/modelzoo/CAN/script/generate_voc.py
new file mode 100644
index 00000000000..03b6a662d97
--- /dev/null
+++ b/modelzoo/CAN/script/generate_voc.py
@@ -0,0 +1,91 @@
+import pickle as pk
+
+f_train = open("/home/test/modelzoo/DIEN/data/local_train_splitByUser", "r")
+uid_dict = {}
+mid_dict = {}
+cat_dict = {}
+item_carte_dict = {}
+cate_carte_dict = {}
+
+iddd = 0
+for line in f_train:
+    arr = line.strip("\n").split("\t")
+    clk = arr[0]
+    uid = arr[1]
+    mid = arr[2]
+    cat = arr[3]
+    mid_list = arr[4]
+    cat_list = arr[5]
+    if uid not in uid_dict:
+        uid_dict[uid] = 0
+    uid_dict[uid] += 1
+    if mid not in mid_dict:
+        mid_dict[mid] = 0
+    mid_dict[mid] += 1
+    if cat not in cat_dict:
+        cat_dict[cat] = 0
+    cat_dict[cat] += 1
+    if len(mid_list) == 0:
+        continue
+    for m in mid_list.split("\x02"):
+        if m not in mid_dict:
+            mid_dict[m] = 0
+        mid_dict[m] += 1
+        if (mid, m) not in item_carte_dict:
+            item_carte_dict[(mid, m)] = 0
+        item_carte_dict[(mid, m)] += 1
+    #print iddd
+    iddd+=1
+    for c in cat_list.split("\x02"):
+        if c not in cat_dict:
+            cat_dict[c] = 0
+        cat_dict[c] += 1
+        if (cat, c) not in cate_carte_dict:
+            cate_carte_dict[(cat, c)] = 0
+        cate_carte_dict[(cat, c)] += 1
+
+sorted_uid_dict = sorted(uid_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_mid_dict = sorted(mid_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_cat_dict = sorted(cat_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_item_carte_dict = sorted(item_carte_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_cate_carte_dict = sorted(cate_carte_dict.items(), key=lambda x:x[1], reverse=True)
+
+uid_voc = {}
+index = 0
+for key, value in sorted_uid_dict:
+    uid_voc[key] = index
+    index += 1
+
+mid_voc = {}
+mid_voc["default_mid"] = 0
+index = 1
+for key, value in sorted_mid_dict:
+    mid_voc[key] = index
+    index += 1
+
+cat_voc = {}
+cat_voc["default_cat"] = 0
+index = 1
+for key, value in sorted_cat_dict:
+    cat_voc[key] = index
+    index += 1
+
+item_carte_voc = {}
+item_carte_voc["default_item_carte"] = 0
+index = 1
+for key, value in sorted_item_carte_dict:
+    item_carte_voc[key] = index
+    index += 1
+
+cate_carte_voc = {}
+cate_carte_voc["default_cate_carte"] = 0
+index = 1
+for key, value in sorted_cate_carte_dict:
+    cate_carte_voc[key] = index
+    index += 1
+
+pk.dump(uid_voc, open("uid_voc.pkl", "wb"))
+pk.dump(mid_voc, open("mid_voc.pkl", "wb"))
+pk.dump(cat_voc, open("cat_voc.pkl", "wb"))
+pk.dump(item_carte_voc, open("item_carte_voc.pkl", "wb"))
+pk.dump(cate_carte_voc, open("cate_carte_voc.pkl", "wb"))
diff --git a/modelzoo/CAN/script/generate_voc.py.bk b/modelzoo/CAN/script/generate_voc.py.bk
new file mode 100644
index 00000000000..411708148aa
--- /dev/null
+++ b/modelzoo/CAN/script/generate_voc.py.bk
@@ -0,0 +1,65 @@
+import cPickle
+
+f_train = open("local_train_splitByUser", "r")
+uid_dict = {}
+mid_dict = {}
+cat_dict = {}
+
+iddd = 0
+for line in f_train:
+    arr = line.strip("\n").split("\t")
+    clk = arr[0]
+    uid = arr[1]
+    mid = arr[2]
+    cat = arr[3]
+    mid_list = arr[4]
+    cat_list = arr[5]
+    if uid not in uid_dict:
+        uid_dict[uid] = 0
+    uid_dict[uid] += 1
+    if mid not in mid_dict:
+        mid_dict[mid] = 0
+    mid_dict[mid] += 1
+    if cat not in cat_dict:
+        cat_dict[cat] = 0
+    cat_dict[cat] += 1
+    if len(mid_list) == 0:
+        continue
+    for m in mid_list.split("\x02"):
+        if m not in mid_dict:
+            mid_dict[m] = 0
+        mid_dict[m] += 1
+    #print iddd
+    iddd+=1
+    for c in cat_list.split("\x02"):
+        if c not in cat_dict:
+            cat_dict[c] = 0
+        cat_dict[c] += 1
+
+sorted_uid_dict = sorted(uid_dict.iteritems(), key=lambda x:x[1], reverse=True)
+sorted_mid_dict = sorted(mid_dict.iteritems(), key=lambda x:x[1], reverse=True)
+sorted_cat_dict = sorted(cat_dict.iteritems(), key=lambda x:x[1], reverse=True)
+
+uid_voc = {}
+index = 0
+for key, value in sorted_uid_dict:
+    uid_voc[key] = index
+    index += 1
+
+mid_voc = {}
+mid_voc["default_mid"] = 0
+index = 1
+for key, value in sorted_mid_dict:
+    mid_voc[key] = index
+    index += 1
+
+cat_voc = {}
+cat_voc["default_cat"] = 0
+index = 1
+for key, value in sorted_cat_dict:
+    cat_voc[key] = index
+    index += 1
+
+cPickle.dump(uid_voc, open("uid_voc.pkl", "w"))
+cPickle.dump(mid_voc, open("mid_voc.pkl", "w"))
+cPickle.dump(cat_voc, open("cat_voc.pkl", "w"))
diff --git a/modelzoo/CAN/script/local_aggretor.py b/modelzoo/CAN/script/local_aggretor.py
new file mode 100644
index 00000000000..e7e23190a1d
--- /dev/null
+++ b/modelzoo/CAN/script/local_aggretor.py
@@ -0,0 +1,46 @@
+import sys
+import hashlib
+import random
+
+fin = open("/home/test/modelzoo/DIEN/data/jointed-new-split-info", "r")
+ftrain = open("/home/test/modelzoo/DIEN/data/local_train", "w")
+ftest = open("/home/test/modelzoo/DIEN/data/local_test", "w")
+
+last_user = "0"
+common_fea = ""
+line_idx = 0
+for line in fin:
+    items = line.strip().split("\t")
+    ds = items[0]
+    clk = int(items[1])
+    user = items[2]
+    movie_id = items[3]
+    dt = items[5]
+    cat1 = items[6]
+
+    if ds=="20180118":
+        fo = ftrain
+    else:
+        fo = ftest
+    if user != last_user:
+        movie_id_list = []
+        cate1_list = []
+        #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + "" + "\t" + ""
+    else:
+        history_clk_num = len(movie_id_list)
+        cat_str = ""
+        mid_str = ""
+        for c1 in cate1_list:
+            cat_str += c1 + "\x02"
+        for mid in movie_id_list:
+            mid_str += mid + "\x02"
+        if len(cat_str) > 0: cat_str = cat_str[:-1]
+        if len(mid_str) > 0: mid_str = mid_str[:-1]
+        if history_clk_num >= 1:  # keep samples with at least one prior behavior
+            print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + mid_str + "\t" + cat_str,
+                  file=fo)
+    last_user = user
+    if clk:
+        movie_id_list.append(movie_id)
+        cate1_list.append(cat1)
+    line_idx += 1
diff --git a/modelzoo/CAN/script/model.py b/modelzoo/CAN/script/model.py
new file mode 100644
index 00000000000..133ded83f09
--- /dev/null
+++ b/modelzoo/CAN/script/model.py
@@ -0,0 +1,800 @@
+#import tensorflow as tf
+import tensorflow.compat.v1 as tf
+from tensorflow.python.ops.rnn_cell import GRUCell
+from tensorflow.python.ops.rnn_cell import LSTMCell
+from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
+#from tensorflow.python.ops.rnn import dynamic_rnn
+from rnn import dynamic_rnn
+from utils import *
+from Dice import dice
+
+#### CAN config #####
+# Each candidate item's "weight" embedding is sliced into a 16->8->4 micro-MLP,
+# i.e. WEIGHT_EMB_DIM = 16*8 + 8*4 = 160 floats per order (biases disabled here).
+weight_emb_w = [[16, 8], [8,4]]
+weight_emb_b = [0, 0]
+print(weight_emb_w, weight_emb_b)
+orders = 3
+order_indep = False # True
+WEIGHT_EMB_DIM = (sum([w[0]*w[1] for w in weight_emb_w]) + sum(weight_emb_b)) #* orders
+INDEP_NUM = 1
+if order_indep:
+    INDEP_NUM *= orders
+
+print("orders: ",orders)
+CALC_MODE = "can"
+device = '/gpu:2'
+#### CAN config #####
+
+def gen_coaction(ad, his_items, dim, mode="can", mask=None, keep_fake_carte_seq=False):
+    # Co-Action unit: slice the candidate embedding `ad` into the weight/bias
+    # tensors of a small MLP, feed every behavior embedding in `his_items`
+    # through that MLP (higher orders feed his_items**k), then sum-pool the
+    # masked outputs over the sequence.
+    weight, bias = [], []
+    idx = 0
+    weight_orders = []
+    bias_orders = []
+    for i in range(orders):
+        for w, b in zip(weight_emb_w, weight_emb_b):
+            weight.append(tf.reshape(ad[:, idx:idx+w[0]*w[1]], [-1, w[0], w[1]]))
+            idx += w[0] * w[1]
+            if b == 
0: + bias.append(None) + else: + bias.append(tf.reshape(ad[:, idx:idx+b], [-1, 1, b])) + idx += b + weight_orders.append(weight) + bias_orders.append(bias) + if not order_indep: + break + + if mode == "can": + out_seq = [] + hh = [] + for i in range(orders): + hh.append(his_items**(i+1)) + #hh = [sum(hh)] + for i, h in enumerate(hh): + if order_indep: + weight, bias = weight_orders[i], bias_orders[i] + else: + weight, bias = weight_orders[0], bias_orders[0] + for j, (w, b) in enumerate(zip(weight, bias)): + h = tf.matmul(h, w) + if b is not None: + h = h + b + if j != len(weight)-1: + h = tf.nn.tanh(h) + out_seq.append(h) + out_seq = tf.concat(out_seq, 2) + if mask is not None: + mask = tf.expand_dims(mask, axis=-1) + out_seq = out_seq * mask + out = tf.reduce_sum(out_seq, 1) + if keep_fake_carte_seq and mode=="emb": + return out, out_seq + return out, None + +class Model(object): + def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling = False, use_softmax=True, use_coaction=False, use_cartes=False): + with tf.name_scope('Inputs'): + self.mid_his_batch_ph = tf.placeholder(tf.int32, [None, None], name='mid_his_batch_ph') + self.cate_his_batch_ph = tf.placeholder(tf.int32, [None, None], name='cate_his_batch_ph') + self.uid_batch_ph = tf.placeholder(tf.int32, [None, ], name='uid_batch_ph') + self.mid_batch_ph = tf.placeholder(tf.int32, [None, ], name='mid_batch_ph') + self.cate_batch_ph = tf.placeholder(tf.int32, [None, ], name='cate_batch_ph') + self.mask = tf.placeholder(tf.float32, [None, None], name='mask') + self.seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph') + self.target_ph = tf.placeholder(tf.float32, [None, None], name='target_ph') + self.carte_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='carte_ph') + self.lr = tf.placeholder(tf.float64, []) + self.use_negsampling =use_negsampling + self.use_softmax = False #use_softmax + self.use_coaction = use_coaction + self.use_cartes = use_cartes + print("args:") + print("negsampling: ", self.use_negsampling) + print("softmax: ", self.use_softmax) + print("co-action: ", self.use_coaction) + print("carte: ", self.use_cartes) + if use_negsampling: + self.noclk_mid_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='noclk_mid_batch_ph') #generate 3 item IDs from negative sampling. 
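+            # noclk placeholders are [batch, seq_len, n_neg]; the data iterator
+            # in data_iterator.py samples 5 negative items per clicked position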
+ self.noclk_cate_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='noclk_cate_batch_ph') + + # Embedding layer + with tf.name_scope('Embedding_layer'): + self.uid_embeddings_var = tf.get_variable("uid_embedding_var", [n_uid, EMBEDDING_DIM]) + tf.summary.histogram('uid_embeddings_var', self.uid_embeddings_var) + self.uid_batch_embedded = tf.nn.embedding_lookup(self.uid_embeddings_var, self.uid_batch_ph) + + self.mid_embeddings_var = tf.get_variable("mid_embedding_var", [n_mid, EMBEDDING_DIM]) + tf.summary.histogram('mid_embeddings_var', self.mid_embeddings_var) + self.mid_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.mid_batch_ph) + self.mid_his_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.mid_his_batch_ph) + if self.use_negsampling: + self.noclk_mid_his_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.noclk_mid_batch_ph) + + self.cate_embeddings_var = tf.get_variable("cate_embedding_var", [n_cate, EMBEDDING_DIM]) + tf.summary.histogram('cate_embeddings_var', self.cate_embeddings_var) + self.cate_batch_embedded = tf.nn.embedding_lookup(self.cate_embeddings_var, self.cate_batch_ph) + self.cate_his_batch_embedded = tf.nn.embedding_lookup(self.cate_embeddings_var, self.cate_his_batch_ph) + if self.use_negsampling: + self.noclk_cate_his_batch_embedded = tf.nn.embedding_lookup(self.cate_embeddings_var, self.noclk_cate_batch_ph) + + if self.use_cartes: + self.carte_embedding_vars = [] + self.carte_batch_embedded = [] + with tf.device(device): + for i, num in enumerate(n_carte): + print("carte num:", num) + self.carte_embedding_vars.append(tf.get_variable("carte_embedding_var_{}".format(i), [num, EMBEDDING_DIM], trainable=True)) + self.carte_batch_embedded.append(tf.nn.embedding_lookup(self.carte_embedding_vars[i], self.carte_batch_ph[:,i,:])) + + ### co-action ### + if self.use_coaction: + ph_dict = { + "item": [self.mid_batch_ph, self.mid_his_batch_ph, self.mid_his_batch_embedded], + "cate": [self.cate_batch_ph, self.cate_his_batch_ph, self.cate_his_batch_embedded] + } + self.mlp_batch_embedded = [] + with tf.device(device): + self.item_mlp_embeddings_var = tf.get_variable("item_mlp_embedding_var", [n_mid, INDEP_NUM * WEIGHT_EMB_DIM], trainable=True) + self.cate_mlp_embeddings_var = tf.get_variable("cate_mlp_embedding_var", [n_cate, INDEP_NUM * WEIGHT_EMB_DIM], trainable=True) + + self.mlp_batch_embedded.append(tf.nn.embedding_lookup(self.item_mlp_embeddings_var, ph_dict['item'][0])) + self.mlp_batch_embedded.append(tf.nn.embedding_lookup(self.cate_mlp_embeddings_var, ph_dict['cate'][0])) + + self.input_batch_embedded = [] + self.item_input_embeddings_var = tf.get_variable("item_input_embedding_var", [n_mid, weight_emb_w[0][0] * INDEP_NUM], trainable=True) + self.cate_input_embeddings_var = tf.get_variable("cate_input_embedding_var", [n_cate, weight_emb_w[0][0] * INDEP_NUM], trainable=True) + self.input_batch_embedded.append(tf.nn.embedding_lookup(self.item_input_embeddings_var, ph_dict['item'][1])) + self.input_batch_embedded.append(tf.nn.embedding_lookup(self.cate_input_embeddings_var, ph_dict['cate'][1])) + + self.item_eb = tf.concat([self.mid_batch_embedded, self.cate_batch_embedded], 1) + self.item_his_eb = tf.concat([self.mid_his_batch_embedded, self.cate_his_batch_embedded], 2) + self.item_his_eb_sum = tf.reduce_sum(self.item_his_eb, 1) + if self.use_negsampling: + self.noclk_item_his_eb = tf.concat( + [self.noclk_mid_his_batch_embedded[:, :, 0, :], self.noclk_cate_his_batch_embedded[:, :, 0, :]], -1)# 0 
means only using the first negative item ID. 3 item IDs are inputed in the line 24. + self.noclk_item_his_eb = tf.reshape(self.noclk_item_his_eb, + [-1, tf.shape(self.noclk_mid_his_batch_embedded)[1], 2*EMBEDDING_DIM])# cat embedding 18 concate item embedding 18. + + self.noclk_his_eb = tf.concat([self.noclk_mid_his_batch_embedded, self.noclk_cate_his_batch_embedded], -1) + self.noclk_his_eb_sum_1 = tf.reduce_sum(self.noclk_his_eb, 2) + self.noclk_his_eb_sum = tf.reduce_sum(self.noclk_his_eb_sum_1, 1) + + self.cross = [] + if self.use_cartes: + if self.mask is not None: + mask = tf.expand_dims(self.mask, axis=-1) + for i,emb in enumerate(self.carte_batch_embedded): + emb = emb * mask + carte_eb_sum = tf.reduce_sum(emb, 1) + self.cross.append(carte_eb_sum) + + if self.use_coaction: + input_batch = self.input_batch_embedded + tmp_sum, tmp_seq = [], [] + if INDEP_NUM == 2: + for i, mlp_batch in enumerate(self.mlp_batch_embedded): + for j, input_batch in enumerate(self.input_batch_embedded): + coaction_sum, coaction_seq = gen_coaction(mlp_batch[:, WEIGHT_EMB_DIM * j: WEIGHT_EMB_DIM * (j+1)], input_batch[:, :, weight_emb_w[0][0] * i: weight_emb_w[0][0] * (i+1)], EMBEDDING_DIM, mode=CALC_MODE,mask=self.mask) + tmp_sum.append(coaction_sum) + tmp_seq.append(coaction_seq) + else: + for i, (mlp_batch, input_batch) in enumerate(zip(self.mlp_batch_embedded, self.input_batch_embedded)): + coaction_sum, coaction_seq = gen_coaction(mlp_batch[:, : INDEP_NUM * WEIGHT_EMB_DIM], input_batch[:, :, : weight_emb_w[0][0]], EMBEDDING_DIM, mode=CALC_MODE, mask=self.mask) + tmp_sum.append(coaction_sum) + tmp_seq.append(coaction_seq) + + self.coaction_sum = tf.concat(tmp_sum, axis=1) + self.cross.append(self.coaction_sum) + + def build_fcn_net(self, inp, use_dice = False): + bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1') + dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1') + if use_dice: + dnn1 = dice(dnn1, name='dice_1') + else: + dnn1 = prelu(dnn1, 'prelu1') + + dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2') + if use_dice: + dnn2 = dice(dnn2, name='dice_2') + else: + dnn2 = prelu(dnn2, 'prelu2') + dnn3 = tf.layers.dense(dnn2, 2 if self.use_softmax else 1, activation=None, name='f3') + return dnn3 + + def build_loss(self, inp, L2=False): + + with tf.name_scope('Metrics'): + # Cross-entropy loss and optimizer initialization + if self.use_softmax: + self.y_hat = tf.nn.softmax(inp) + 0.00000001 + ctr_loss = - tf.reduce_mean(tf.log(self.y_hat) * self.target_ph) + else: + self.y_hat = tf.nn.sigmoid(inp) + ctr_loss = - tf.reduce_mean(tf.concat([tf.log(self.y_hat + 0.00000001) * self.target_ph, tf.log(1 - self.y_hat + 0.00000001) * (1-self.target_ph)], axis=1)) + self.loss = ctr_loss + if self.use_negsampling: + self.loss += self.aux_loss + if L2: + self.loss += self.l2_loss + + tf.summary.scalar('loss', self.loss) + self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss) + + # Accuracy metric + if self.use_softmax: + self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32)) + else: + self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32)) + tf.summary.scalar('accuracy', self.accuracy) + + + def auxiliary_loss(self, h_states, click_seq, noclick_seq, mask, stag = None): + mask = tf.cast(mask, tf.float32) + click_input_ = tf.concat([h_states, click_seq], -1) + noclick_input_ = tf.concat([h_states, noclick_seq], -1) + click_prop_ = self.auxiliary_net(click_input_, stag = 
stag)[:, :, 0] + noclick_prop_ = self.auxiliary_net(noclick_input_, stag = stag)[:, :, 0] + click_loss_ = - tf.reshape(tf.log(click_prop_), [-1, tf.shape(click_seq)[1]]) * mask + noclick_loss_ = - tf.reshape(tf.log(1.0 - noclick_prop_), [-1, tf.shape(noclick_seq)[1]]) * mask + loss_ = tf.reduce_mean(click_loss_ + noclick_loss_) + return loss_ + + def auxiliary_net(self, in_, stag='auxiliary_net'): + bn1 = tf.layers.batch_normalization(inputs=in_, name='bn1' + stag, reuse=tf.AUTO_REUSE) + dnn1 = tf.layers.dense(bn1, 100, activation=None, name='f1' + stag, reuse=tf.AUTO_REUSE) + dnn1 = tf.nn.sigmoid(dnn1) + dnn2 = tf.layers.dense(dnn1, 50, activation=None, name='f2' + stag, reuse=tf.AUTO_REUSE) + dnn2 = tf.nn.sigmoid(dnn2) + dnn3 = tf.layers.dense(dnn2, 2 if self.use_softmax else 1, activation=None, name='f3' + stag, reuse=tf.AUTO_REUSE) + if self.use_softmax: + y_hat = tf.nn.softmax(dnn3) + 0.00000001 + else: + y_hat = tf.nn.sigmoid(dnn3) + 0.00000001 + return y_hat + + + def train(self, sess, inps): + if self.use_negsampling: + loss, accuracy, aux_loss, _ = sess.run([self.loss, self.accuracy, self.aux_loss, self.optimizer], feed_dict={ + self.uid_batch_ph: inps[0], + self.mid_batch_ph: inps[1], + self.cate_batch_ph: inps[2], + self.mid_his_batch_ph: inps[3], + self.cate_his_batch_ph: inps[4], + self.mask: inps[5], + self.target_ph: inps[6], + self.seq_len_ph: inps[7], + self.lr: inps[8], + self.noclk_mid_batch_ph: inps[9], + self.noclk_cate_batch_ph: inps[10], + self.carte_batch_ph: inps[11] + }) + return loss, accuracy, aux_loss + else: + loss, accuracy, _ = sess.run([self.loss, self.accuracy, self.optimizer], feed_dict={ + self.uid_batch_ph: inps[0], + self.mid_batch_ph: inps[1], + self.cate_batch_ph: inps[2], + self.mid_his_batch_ph: inps[3], + self.cate_his_batch_ph: inps[4], + self.mask: inps[5], + self.target_ph: inps[6], + self.seq_len_ph: inps[7], + self.lr: inps[8], + self.carte_batch_ph: inps[11] + }) + return loss, accuracy, 0 + + def calculate(self, sess, inps): + if self.use_negsampling: + probs, loss, accuracy, aux_loss = sess.run([self.y_hat, self.loss, self.accuracy, self.aux_loss], feed_dict={ + self.uid_batch_ph: inps[0], + self.mid_batch_ph: inps[1], + self.cate_batch_ph: inps[2], + self.mid_his_batch_ph: inps[3], + self.cate_his_batch_ph: inps[4], + self.mask: inps[5], + self.target_ph: inps[6], + self.seq_len_ph: inps[7], + self.noclk_mid_batch_ph: inps[8], + self.noclk_cate_batch_ph: inps[9], + self.carte_batch_ph: inps[10] + }) + return probs, loss, accuracy, aux_loss + else: + probs, loss, accuracy = sess.run([self.y_hat, self.loss, self.accuracy], feed_dict={ + self.uid_batch_ph: inps[0], + self.mid_batch_ph: inps[1], + self.cate_batch_ph: inps[2], + self.mid_his_batch_ph: inps[3], + self.cate_his_batch_ph: inps[4], + self.mask: inps[5], + self.target_ph: inps[6], + self.seq_len_ph: inps[7], + self.carte_batch_ph: inps[10] + }) + return probs, loss, accuracy, 0 + + def save(self, sess, path): + saver = tf.train.Saver() + saver.save(sess, save_path=path) + + def restore(self, sess, path): + saver = tf.train.Saver() + saver.restore(sess, save_path=path) + print('model restored from %s' % path) + +class Model_NCF(Model): + def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=True): + super(Model_NCF, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, + ATTENTION_SIZE, + use_negsampling, use_softmax) + with tf.name_scope('ncf_embedding'): + self.ncf_item_embedding_var = 
tf.get_variable("ncf_item_embedding_var", [n_mid, EMBEDDING_DIM], trainable=True)
+            self.ncf_cate_embedding_var = tf.get_variable("ncf_cate_embedding_var", [n_cate, EMBEDDING_DIM], trainable=True)
+
+            ncf_item_emb = tf.nn.embedding_lookup(self.ncf_item_embedding_var, self.mid_batch_ph)
+            ncf_item_his_emb = tf.nn.embedding_lookup(self.ncf_item_embedding_var, self.mid_his_batch_ph)
+            ncf_cate_emb = tf.nn.embedding_lookup(self.ncf_cate_embedding_var, self.cate_batch_ph)
+            ncf_cate_his_emb = tf.nn.embedding_lookup(self.ncf_cate_embedding_var, self.cate_his_batch_ph)
+
+            ncf_item_his_sum = tf.reduce_mean(ncf_item_his_emb, axis=1)
+            ncf_cate_his_sum = tf.reduce_mean(ncf_cate_his_emb, axis=1)
+            mf = tf.concat([ncf_item_emb * ncf_item_his_sum, ncf_cate_emb * ncf_cate_his_sum], axis=1)
+
+        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum], 1)
+        logit = self.build_fcn_net(inp, mf, use_dice=False)
+        self.build_loss(logit)
+
+    def build_fcn_net(self, inp, mf, use_dice = False):
+        bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
+        dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
+        if use_dice:
+            dnn1 = dice(dnn1, name='dice_1')
+        else:
+            dnn1 = prelu(dnn1, scope='prelu_1')
+
+        dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
+        if use_dice:
+            dnn2 = dice(dnn2, name='dice_2')
+        else:
+            dnn2 = prelu(dnn2, scope='prelu_2')
+
+        dnn2 = tf.concat([dnn2, mf], axis=1)
+        dnn3 = tf.layers.dense(dnn2, 2 if self.use_softmax else 1, activation=None, name='f3')
+        return dnn3
+
+def ProductLayer(feas, DIM, share=True):
+    row, col = [], []
+    num = len(feas)
+    pair = num * (num-1) // 2
+    for i in range(num - 1):
+        for j in range(i+1, num):
+            row.append(i)
+            col.append(j)
+    if share:
+        p = tf.stack([feas[i] for i in row], axis=1)
+        q = tf.stack([feas[i] for i in col], axis=1)
+    else:
+        tmp = []
+        count = {}
+        for i in row:
+            if i not in count:
+                count[i] = 0
+            else:
+                count[i] += 1
+            k = count[i]
+            tmp.append(feas[i][:, k*DIM:(k+1)*DIM])
+        p = tf.stack(tmp, axis=1)
+        tmp = []
+        for i in col:
+            if i not in count:
+                count[i] = 0
+            else:
+                count[i] += 1
+            k = count[i]
+            tmp.append(feas[i][:, k*DIM:(k+1)*DIM])
+        q = tf.stack(tmp, axis=1)
+
+    ipnn = p * q
+    ipnn = tf.reduce_sum(ipnn, axis=2, keep_dims=False)
+    p = tf.expand_dims(p, axis=1)
+    w = tf.get_variable("pnn_var", [DIM, pair, DIM], trainable=True)
+    opnn = tf.reduce_sum((tf.multiply((tf.transpose(tf.reduce_sum(tf.multiply(p, w), axis=-1), [0, 2, 1])), q)), axis=-1)
+    pnn = tf.concat([ipnn, opnn], axis=1)
+    return pnn
+
+class Model_PNN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=True):
+        super(Model_PNN, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+
+        fea_list = [self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_mean(self.mid_his_batch_embedded, axis=1), tf.reduce_mean(self.cate_his_batch_embedded, axis=1)]
+        pnn = ProductLayer(fea_list, EMBEDDING_DIM)
+        inp = tf.concat([self.uid_batch_embedded[:, :18], self.item_eb[:, :36], self.item_his_eb_sum[:, :36], pnn], 1)
+        logit = self.build_fcn_net(inp, use_dice=False)
+        self.build_loss(logit)
+
+def FMLayer(feas, output_dim=1):
+    feas = tf.stack(feas, axis=1)
+    square_of_sum = tf.reduce_sum(feas, axis=1, keep_dims=True) ** 2
+    sum_of_square = tf.reduce_sum(feas ** 2, axis=1, keep_dims=True)
+    fm_term = 0.5 * tf.reduce_sum(square_of_sum - sum_of_square, axis=2, keep_dims=False)
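+    # standard FM trick: sum_{i<j} x_i . x_j = 0.5 * ((sum_i x_i)^2 - sum_i x_i^2),
+    # evaluated per embedding dimension and then reduced over axis 2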
+    if output_dim==2:
+        fm_term = tf.concat([fm_term, tf.zeros_like(fm_term)], axis=1)
+    return fm_term
+
+class Model_FM(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False):
+        super(Model_FM, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+
+        w_item_var = tf.get_variable("w_item_var", [n_mid, 1], trainable=True)
+        w_cate_var = tf.get_variable("w_cate_var", [n_cate, 1], trainable=True)
+        wx = []
+        wx.append(tf.nn.embedding_lookup(w_item_var, self.mid_batch_ph))
+        wx.append(tf.nn.embedding_lookup(w_cate_var, self.cate_batch_ph))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_item_var, self.mid_his_batch_ph), axis=1))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_cate_var, self.cate_his_batch_ph), axis=1))
+        b = tf.get_variable("b_var", [1], initializer=tf.zeros_initializer(), trainable=True)
+
+        wx = tf.concat(wx, axis=1)
+        lr_term = tf.reduce_sum(wx, axis=1) + b
+
+        fea_list = [self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_sum(self.mid_his_batch_embedded, axis=1), tf.reduce_sum(self.cate_his_batch_embedded, axis=1)]
+        logit = tf.reduce_sum(wx, axis=1) + b + FMLayer(fea_list, 1)
+
+        #self.l2_loss = 2e-5 * tf.add_n([tf.nn.l2_loss(v) for v in [wx, self.item_eb, self.item_his_eb_sum]])
+        self.build_loss(logit, L2=False)
+
+class Model_FFM(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False):
+        super(Model_FFM, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+
+        w_item_var = tf.get_variable("w_item_var", [n_mid, 1], trainable=True)
+        w_cate_var = tf.get_variable("w_cate_var", [n_cate, 1], trainable=True)
+        wx = []
+        wx.append(tf.nn.embedding_lookup(w_item_var, self.mid_batch_ph))
+        wx.append(tf.nn.embedding_lookup(w_cate_var, self.cate_batch_ph))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_item_var, self.mid_his_batch_ph), axis=1))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_cate_var, self.cate_his_batch_ph), axis=1))
+        b = tf.get_variable("b_var", [1], initializer=tf.zeros_initializer(), trainable=True)
+
+        wx = tf.concat(wx, axis=1)
+        lr_term = tf.reduce_sum(wx, axis=1, keep_dims=True) + b
+
+        with tf.name_scope('FFM_embedding'):
+
+            FFM_item_embedding_var = tf.get_variable("FFM_item_embedding_var", [n_mid, 3, EMBEDDING_DIM], trainable=True)
+            FFM_cate_embedding_var = tf.get_variable("FFM_cate_embedding_var", [n_cate, 3, EMBEDDING_DIM], trainable=True)
+            item_emb = tf.nn.embedding_lookup(FFM_item_embedding_var, self.mid_batch_ph)
+            item_his_emb = tf.nn.embedding_lookup(FFM_item_embedding_var, self.mid_his_batch_ph)
+            item_his_sum = tf.reduce_sum(item_his_emb, axis=1)
+
+            cate_emb = tf.nn.embedding_lookup(FFM_cate_embedding_var, self.cate_batch_ph)
+            cate_his_emb = tf.nn.embedding_lookup(FFM_cate_embedding_var, self.cate_his_batch_ph)
+            cate_his_sum = tf.reduce_sum(cate_his_emb, axis=1)
+
+        fea_list = [item_emb, item_his_sum, cate_emb, cate_his_sum]
+        feas = tf.stack(fea_list, axis=1)
+        num = len(fea_list)
+        rows, cols = [], []
+        for i in range(num-1):
+            for j in range(i+1, num):
+                rows.append([i, j-1])
+                cols.append([j, i])
+        p = tf.transpose(tf.gather_nd(tf.transpose(feas, [1,2,0,3]), rows), [1,0,2])
+        q = tf.transpose(tf.gather_nd(tf.transpose(feas, [1,2,0,3]), cols), [1,0,2])
+        ffm_term = tf.reduce_sum(p * q, axis=2)
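+        # p/q above pair each feature's field-specific embedding with its partner's
+        # field, so p * q realizes the field-aware (FFM) interaction terms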
+        ffm_term = tf.reduce_sum(ffm_term, axis=1, keep_dims=True)
+        logit = lr_term + ffm_term
+        self.build_loss(logit)
+
+
+class Model_DeepFFM(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False):
+        super(Model_DeepFFM, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+
+        w_item_var = tf.get_variable("w_item_var", [n_mid, 1], trainable=True)
+        w_cate_var = tf.get_variable("w_cate_var", [n_cate, 1], trainable=True)
+        wx = []
+        wx.append(tf.nn.embedding_lookup(w_item_var, self.mid_batch_ph))
+        wx.append(tf.nn.embedding_lookup(w_cate_var, self.cate_batch_ph))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_item_var, self.mid_his_batch_ph), axis=1))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_cate_var, self.cate_his_batch_ph), axis=1))
+        b = tf.get_variable("b_var", [1], initializer=tf.zeros_initializer(), trainable=True)
+
+        wx = tf.concat(wx, axis=1)
+        lr_term = tf.reduce_sum(wx, axis=1, keep_dims=True) + b
+
+        with tf.name_scope('FFM_embedding'):
+
+            FFM_item_embedding_var = tf.get_variable("FFM_item_embedding_var", [n_mid, 3, EMBEDDING_DIM], trainable=True)
+            FFM_cate_embedding_var = tf.get_variable("FFM_cate_embedding_var", [n_cate, 3, EMBEDDING_DIM], trainable=True)
+            item_emb = tf.nn.embedding_lookup(FFM_item_embedding_var, self.mid_batch_ph)
+            item_his_emb = tf.nn.embedding_lookup(FFM_item_embedding_var, self.mid_his_batch_ph)
+            item_his_sum = tf.reduce_sum(item_his_emb, axis=1)
+
+            cate_emb = tf.nn.embedding_lookup(FFM_cate_embedding_var, self.cate_batch_ph)
+            cate_his_emb = tf.nn.embedding_lookup(FFM_cate_embedding_var, self.cate_his_batch_ph)
+            cate_his_sum = tf.reduce_sum(cate_his_emb, axis=1)
+
+        fea_list = [item_emb, item_his_sum, cate_emb, cate_his_sum]
+        feas = tf.stack(fea_list, axis=1)
+        num = len(fea_list)
+        rows, cols = [], []
+        for i in range(num-1):
+            for j in range(i+1, num):
+                rows.append([i, j-1])
+                cols.append([j, i])
+        p = tf.transpose(tf.gather_nd(tf.transpose(feas, [1,2,0,3]), rows), [1,0,2])
+        q = tf.transpose(tf.gather_nd(tf.transpose(feas, [1,2,0,3]), cols), [1,0,2])
+        ffm_term = tf.reduce_sum(p * q, axis=2)
+        ffm_term = tf.reduce_sum(ffm_term, axis=1, keep_dims=True)
+
+        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum], 1)
+        dnn_term = self.build_fcn_net(inp, use_dice=False)
+
+        logit = dnn_term + lr_term + ffm_term
+        self.build_loss(logit)
+
+class Model_DeepFM(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False):
+        super(Model_DeepFM, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+        w_item_var = tf.get_variable("w_item_var", [n_mid, 1], trainable=True)
+        w_cate_var = tf.get_variable("w_cate_var", [n_cate, 1], trainable=True)
+        wx = []
+        wx.append(tf.nn.embedding_lookup(w_item_var, self.mid_batch_ph))
+        wx.append(tf.nn.embedding_lookup(w_cate_var, self.cate_batch_ph))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_item_var, self.mid_his_batch_ph), axis=1))
+        wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_cate_var, self.cate_his_batch_ph), axis=1))
+        b = tf.get_variable("b_var", [1], initializer=tf.zeros_initializer(), trainable=True)
+
+        wx = tf.concat(wx, axis=1)
+        lr_term = tf.reduce_sum(wx, axis=1, keep_dims=True) + b
+
+        inp = 
tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum], 1) + logit = self.build_fcn_net(inp, use_dice=False) + + fea_list = [self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_sum(self.mid_his_batch_embedded, axis=1), tf.reduce_sum(self.cate_his_batch_embedded, axis=1)] + fm_term = FMLayer(fea_list) + logit = tf.layers.dense(tf.concat([logit, fm_term, lr_term], axis=1), 1, activation=None, name='fm_fc') + #self.l2_loss = 0.01 * tf.add_n([tf.nn.l2_loss(v) for v in [wx, self.item_eb, self.item_his_eb_sum]]) + self.build_loss(logit, L2=False) + +def ExtremeFMLayer(feas, dim, output_dim=1): + num = len(feas) + feas = tf.stack(feas, axis=1) # batch, field_num, emb_dim + hidden_nn_layers = [] + field_nums = [num] + final_len = 0 + hidden_nn_layers.append(feas) + final_result = [] + cross_layers = [256, 256, 256] + + split_tensor0 = tf.split(hidden_nn_layers[0], dim * [1], 2) + + with tf.variable_scope("xfm", initializer=tf.contrib.layers.xavier_initializer(uniform=True)) as scope: + for idx, layer_size in enumerate(cross_layers): + split_tensor = tf.split(hidden_nn_layers[-1], dim * [1], 2) + dot_result_m = tf.matmul(split_tensor0, split_tensor, transpose_b=True) + dot_result_o = tf.reshape(dot_result_m, shape=[dim, -1, field_nums[0] * field_nums[-1]]) + dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) + + filters = tf.get_variable(name="f_" + str(idx), + shape=[1, field_nums[-1] * field_nums[0], layer_size], + dtype=tf.float32) + + curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID') + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + if idx != len(cross_layers) - 1: + next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1) + final_len += int(layer_size / 2) + else: + direct_connect = curr_out + next_hidden = 0 + final_len += layer_size + field_nums.append(int(layer_size / 2)) + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + + + result = tf.concat(final_result, axis=1) + result = tf.reduce_sum(result, -1) + + w_nn_output = tf.get_variable(name='w_nn_output', + shape=[final_len, 1], + dtype=tf.float32) + b_nn_output = tf.get_variable(name='b_nn_output', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + xfm_term = tf.matmul(result, w_nn_output) + b_nn_output + + if output_dim==2: + xfm_term = tf.concat([xfm_term, tf.zeros_like(xfm_term)], axis=1) + return xfm_term + +class Model_xDeepFM(Model): + def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False): + super(Model_xDeepFM, self).__init__(n_uid, n_mid, n_cate, n_carte,EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax) + + w_item_var = tf.get_variable("w_item_var", [n_mid, 1], trainable=True) + w_cate_var = tf.get_variable("w_cate_var", [n_cate, 1], trainable=True) + wx = [] + wx.append(tf.nn.embedding_lookup(w_item_var, self.mid_batch_ph)) + wx.append(tf.nn.embedding_lookup(w_cate_var, self.cate_batch_ph)) + wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_item_var, self.mid_his_batch_ph), axis=1)) + wx.append(tf.reduce_sum(tf.nn.embedding_lookup(w_cate_var, self.cate_his_batch_ph), axis=1)) + b = tf.get_variable("b_var", [1], initializer=tf.zeros_initializer(), trainable=True) + + wx = tf.concat(wx, axis=1) + lr_term = tf.reduce_sum(wx, axis=1, keep_dims=True) + b + + inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum], 1) + mlp_term = self.build_fcn_net(inp, 
use_dice=False)
+
+        fea_list = [self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_sum(self.mid_his_batch_embedded, axis=1), tf.reduce_sum(self.cate_his_batch_embedded, axis=1)]
+        fm_term = ExtremeFMLayer(fea_list, EMBEDDING_DIM)
+        self.build_loss(mlp_term + fm_term)
+
+class Model_PIN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_PIN, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="PIN")
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum], 1)
+        logit = self.build_fcn_net(inp, use_dice=False)
+
+        feas = [self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_sum(self.mid_his_batch_embedded * tf.reshape(self.mask,(BATCH_SIZE, SEQ_LEN, 1)), axis=1), tf.reduce_sum(self.cate_his_batch_embedded * tf.reshape(self.mask,(BATCH_SIZE, SEQ_LEN, 1)), axis=1)]
+
+        self.feas = feas
+        row, col = [], []
+        num = len(feas)
+        for i in range(num - 1):
+            for j in range(i+1, num):
+                row.append(i)
+                col.append(j)
+        pairs = len(row)
+        p = tf.concat([feas[i] for i in row], axis=1)
+        q = tf.concat([feas[i] for i in col], axis=1)
+        pq = p * q
+        inp = tf.concat([p,q,pq], axis=2) #batch, pair, 3*dim
+        logit = self.pin(inp)
+        self.build_loss(logit)
+
+    def pin(self, inp):
+        batch, pair, dim = inp.shape.as_list()
+        with tf.variable_scope('product_network'):
+            inp = tf.transpose(inp, [1,0,2])
+            x = tf.layers.dense(inp, 20, activation=None, name='fc1')
+            x = tf.layers.batch_normalization(x, name='bn1')
+            x = tf.nn.relu(x)
+            x = tf.layers.dense(x, 1, activation=None, name='fc2')
+            x = tf.layers.batch_normalization(x, name='bn2')
+            x = tf.transpose(x, [1,0,2])
+            sub_out = tf.reshape(x, [-1, pair * dim])
+
+        with tf.variable_scope('network'):
+            new_inp = tf.concat(self.feas+[sub_out], axis=1)
+            x = tf.layers.dense(sub_out, 400, activation=tf.nn.relu, name='fc1')
+            x = tf.layers.dense(x, 400, activation=tf.nn.relu, name='fc2')
+            x = tf.layers.dense(x, 400, activation=tf.nn.relu, name='fc3')
+            x = tf.layers.dense(x, 1, activation=None, name='fc4')
+        return x
+
+class Model_ONN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=False):
+        super(Model_ONN, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling, use_softmax=use_softmax)
+
+        dim = 5
+        self.item_embedding_var = tf.get_variable("item_embedding_var_onn", [n_mid, dim * 3], trainable=True)
+        self.item_emb = tf.nn.embedding_lookup(self.item_embedding_var, self.mid_batch_ph)
+        self.item_his_emb = tf.nn.embedding_lookup(self.item_embedding_var, self.mid_his_batch_ph)
+        self.item_his_emb_sum = tf.reduce_mean(self.item_his_emb, axis=1)
+
+        self.cate_embedding_var = tf.get_variable("cate_embedding_var_onn", [n_cate, dim * 3], trainable=True)
+        self.cate_emb = tf.nn.embedding_lookup(self.cate_embedding_var, self.cate_batch_ph)
+        self.cate_his_emb = tf.nn.embedding_lookup(self.cate_embedding_var, self.cate_his_batch_ph)
+        self.cate_his_emb_sum = tf.reduce_mean(self.cate_his_emb, axis=1)
+
+        fea_list = [self.item_emb, self.cate_emb, self.item_his_emb_sum, self.cate_his_emb_sum]
+        onn = ProductLayer(fea_list, dim, False)
+
+        inp = tf.concat([self.uid_batch_embedded, self.mid_batch_embedded, self.cate_batch_embedded, tf.reduce_mean(self.mid_his_batch_embedded, axis=1), tf.reduce_mean(self.cate_his_batch_embedded, axis=1), onn], 1)
+        logit = self.build_fcn_net(inp, use_dice=False)
+        self.build_loss(logit)
+
+class Model_WideDeep(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False):
+        super(Model_WideDeep, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE,
+                                             ATTENTION_SIZE,
+                                             use_negsampling)
+
+        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum], 1)
+        # Fully connected layer
+        bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
+        dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
+        dnn1 = prelu(dnn1, 'p1')
+        dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
+        dnn2 = prelu(dnn2, 'p2')
+        dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3')
+        d_layer_wide = tf.concat([tf.concat([self.item_eb,self.item_his_eb_sum], axis=-1),
+                                  self.item_eb * self.item_his_eb_sum], axis=-1)
+        d_layer_wide = tf.layers.dense(d_layer_wide, 2, activation=None, name='f_fm')
+        self.y_hat = tf.nn.softmax(dnn3 + d_layer_wide)
+
+        with tf.name_scope('Metrics'):
+            # Cross-entropy loss and optimizer initialization
+            self.loss = - tf.reduce_mean(tf.log(self.y_hat) * self.target_ph)
+            tf.summary.scalar('loss', self.loss)
+            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
+
+            # Accuracy metric
+            self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32))
+            tf.summary.scalar('accuracy', self.accuracy)
+        self.merged = tf.summary.merge_all()
+
+class Model_DNN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=True, use_coaction=False, use_cartes=False):
+        #EMBEDDING_DIM = 4
+        super(Model_DNN, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        ATTENTION_SIZE,
+                                        use_negsampling, use_softmax=use_softmax, use_coaction=use_coaction, use_cartes=use_cartes)
+
+        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum]+self.cross, 1)
+        logit = self.build_fcn_net(inp, use_dice=False)
+        self.build_loss(logit)
+
+
+class Model_DIN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=True):
+        super(Model_DIN, self).__init__(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        ATTENTION_SIZE,
+                                        use_negsampling, use_softmax=use_softmax)
+
+        # Attention layer
+        with tf.name_scope('Attention_layer'):
+            attention_output = din_attention(self.item_eb, self.item_his_eb, ATTENTION_SIZE, self.mask)
+            att_fea = tf.reduce_sum(attention_output, 1)
+            tf.summary.histogram('att_fea', att_fea)
+        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, att_fea], -1)
+        # Fully connected layer
+        logit = self.build_fcn_net(inp, use_dice=True)
+        self.build_loss(logit)
+
+
+class Model_DIEN(Model):
+    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=True, use_coaction=False):
+        super(Model_DIEN, self).__init__(n_uid, n_mid, n_cate, n_carte,
+                                         EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
+                                         use_negsampling, use_coaction=use_coaction)
+
+        # RNN layer(-s)
+        with tf.name_scope('rnn_1'):
+            rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
+                                         sequence_length=self.seq_len_ph, dtype=tf.float32,
+                                         scope="gru1")
+            tf.summary.histogram('GRU_outputs', rnn_outputs)
+
+        aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
+                                         self.noclk_item_his_eb[:, 1:, :],
+                                         self.mask[:, 1:], 
stag="gru") + self.aux_loss = aux_loss_1 + + # Attention layer + with tf.name_scope('Attention_layer_1'): + att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask, + softmax_stag=1, stag='1_1', mode='LIST', return_alphas=True) + tf.summary.histogram('alpha_outputs', alphas) + + with tf.name_scope('rnn_2'): + rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs, + att_scores = tf.expand_dims(alphas, -1), + sequence_length=self.seq_len_ph, dtype=tf.float32, + scope="gru2") + tf.summary.histogram('GRU2_Final_State', final_state2) + + inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, final_state2]+self.cross, 1) + prop = self.build_fcn_net(inp, use_dice=True) + self.build_loss(prop) diff --git a/modelzoo/CAN/script/model_avazu.py b/modelzoo/CAN/script/model_avazu.py new file mode 100644 index 00000000000..cfaaedf815e --- /dev/null +++ b/modelzoo/CAN/script/model_avazu.py @@ -0,0 +1,973 @@ +#coding:utf-8 +import tensorflow as tf +from utils import * +from tensorflow.python.ops.rnn_cell import GRUCell +import mimn as mimn +import rum as rum +from rnn import dynamic_rnn +# import mann_simple_cell as mann_cell +import random + +### Exp config ### + +feature_num = [ + 264,7,7,4842,7912,26,9136,580,36, + 7338655,8303,5,4,2885,8,9,474,4,69,172,62 +] +# id starts with 1 +id_offset = [0] + [sum(feature_num[:i]) for i in range(1, len(feature_num))] + +emb_as_weight = True #False #True +use_new_seq_emb = True #False # True +#edge_type = "item" +edge_type = "3-9" +use_cartes = ["item-his_item"] +use_cartes = ["cate-his_cate"] +use_cartes = [ + "3-9", "3-10", "4-9", "4-10", "6-9", "6-10", "7-9", "7-10", + "16-9", "16-10", "19-9", "19-10", "13-16-19", "13-16-19-9", "13-16-19-10", + "16-3", "16-6", "19-3", "19-6", "13-16-19-3", "13-16-19-6" +] +use_cartes = [] + +WEIGHT_EMB_NUM = 1 +orders = 5 +CALC_MODE = "poly_x_x4" +weight_emb_w, weight_emb_b = [], [] +alpha = 1 +if CALC_MODE in ["seq_sum", "seq", "emb"]: + weight_emb_w = [[4, 3], [3,4]] + #weight_emb_w = [[16, 3], [3,4]] + #weight_emb_w = [[16, 3], [3,4], [4,5],[5,5]] + weight_emb_b = [3, 0] + #weight_emb_b = [3, 4, 5, 0] + WEIGHT_EMB_DIM = sum([w[0]*w[1] for w in weight_emb_w]) + sum(weight_emb_b) +elif CALC_MODE.startswith("poly"): + WEIGHT_EMB_DIM = 16 + if "vec" in CALC_MODE: + WEIGHT_EMB_DIM = int(WEIGHT_EMB_DIM ** 0.5) + elif "wx_ind" in CALC_MODE: + WEIGHT_EMB_DIM *= 2 + elif "x_ind" in CALC_MODE: + WEIGHT_EMB_DIM *= orders + elif "x4" in CALC_MODE: + alpha = 4 + WEIGHT_EMB_DIM *= alpha**2 + +keep_fake_carte_seq = False # True +carte_with_gru = True #False + +carte_num_dict = { + "3-6": 8315+1, + "6-9": 1849306+1, + "4-7": 4547+1, + "3-9": 2102068+1, + "3-10": 161045+1, + "4-9": 2073680+1, + "4-10": 146645+1, + "6-9": 1851115+1, + "6-10": 93771+1, + "7-9": 1765776+1, + "7-10": 23738+1, + "16-9": 2135855+1, + "16-10": 128321+1, + "19-9": 1637771+1, + "19-10": 57099+1, + "13-16-19": 16905+1, + "13-16-19-9": 2579867+1, + "13-16-19-10": 447410+1, + "16-3": 33287+1, + "16-6": 25011+1, + "19-3": 24748+1, + "19-6": 22125+1, + "13-16-19-3": 142791+1, + "13-16-19-6": 86211+1, +} +if use_cartes: + n_cid = sum([carte_num_dict[c] for c in use_cartes]) - (len(use_cartes) - 1) +#n_cid = 59201 #6689210 #8586832 #6689210 #6630010 + +def eb_as_weight(ad, his_items, dim, mode="seq"): + ad = tf.reshape(ad, [-1, WEIGHT_EMB_DIM]) + weight, bias = [], [] + idx = 0 + for w, b in zip(weight_emb_w, weight_emb_b): + 
weight.append(tf.reshape(ad[:, idx:idx+w[0]*w[1]], [-1, w[0], w[1]])) + idx += w[0] * w[1] + if b == 0: + bias.append(None) + else: + bias.append(tf.reshape(ad[:, idx:idx+b], [-1, 1, b])) + idx += b + + if mode == "seq_sum": + his_items_sum = tf.reduce_sum(his_items, 1) + his_items_sum = tf.reshape(his_items_sum, [-1, 1, dim]) + out_seq = tf.nn.selu(tf.matmul(his_items_sum, w_1) + b) + out_seq = tf.matmul(out_seq, w_2) + out = tf.reduce_sum(out_seq, 1) + elif mode == "seq": + his_items_ = tf.unstack(his_items, axis=1) + out_seq = [] + for item in his_items_: + item = tf.reshape(item, [-1, 1, dim]) + #out.append(tf.nn.selu(tf.matmul(item, w) + b)) + h = item + for w, b in zip(weight, bias): + h = tf.matmul(h, w) + if b is not None: + h = tf.nn.selu(h + b) + out_seq.append(h) + #h = tf.nn.selu(tf.matmul(item, w_1) + b) + #out_seq.append(tf.matmul(h, w_2)) + out_seq = tf.concat(out_seq, 1) + out = tf.reduce_sum(out_seq, 1) + elif mode == "emb": + inp = his_items + h = tf.reshape(inp, [-1, 1, dim]) + for w, b in zip(weight, bias): + h = tf.matmul(h, w) + if b is not None: + h = tf.nn.selu(h + b) + out = h + out = tf.reduce_sum(out, 1) + elif mode == "poly": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, dim, dim]) + ww = [w**(i+1) for i in range(orders)] + for i in range(orders): + h = tf.matmul(h, ww[i]) + #if i < 2: + h = tf.nn.tanh(h) + out = h + out = tf.reduce_sum(out, 1) + elif mode == "poly_w": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, dim, dim]) + ww = [w**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(tf.matmul(h, ww[i]))) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_x": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, dim, dim]) + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + #out.append(tf.nn.tanh(tf.matmul(hh[i], w))) + out.append(tf.matmul(hh[i], w)) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_x_x4": + h = tf.reshape(his_items, [-1, 1, dim * alpha]) + w = tf.reshape(ad, [-1, dim*alpha, dim*alpha]) + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(tf.matmul(hh[i], w))) + #out.append(tf.matmul(hh[i], w)) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_x_ind": + h = tf.reshape(his_items, [-1, 1, dim]) + ww = tf.split(ad, num_or_size_splits=orders, axis=1) + ww = [tf.reshape(w, [-1, dim, dim]) for w in ww] + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(tf.matmul(hh[i], ww[i]))) + #out.append(tf.matmul(hh[i], ww[i])) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_wx": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, dim, dim]) + ww = [w**(i+1) for i in range(orders)] + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(tf.matmul(hh[i], w))) + out.append(tf.nn.tanh(tf.matmul(h, ww[i]))) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_wx_ind": + h = tf.reshape(his_items, [-1, 1, dim]) + ww = tf.split(ad, num_or_size_splits=2, axis=1) + ww = [tf.reshape(w, [-1, dim, dim]) for w in ww] + ww1 = [ww[1]**(i+1) for i in range(orders)] + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(tf.matmul(hh[i], ww[0]))) + out.append(tf.nn.tanh(tf.matmul(h, ww1[i]))) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode 
== "poly_x_vec": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, 1, dim]) + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + out.append(tf.nn.tanh(hh[i] * w)) + #out.append(hh[i] * w) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + elif mode == "poly_pure": + h = tf.reshape(his_items, [-1, 1, dim]) + w = tf.reshape(ad, [-1, dim, dim]) + ww = [w**(i+1) for i in range(orders)] + hh = [h**(i+1) for i in range(orders)] + out = [] + for i in range(orders): + for j in range(orders): + out.append(tf.nn.tanh(tf.matmul(hh[i], ww[j]))) + out = tf.reduce_sum(tf.concat(out, axis=1), 1) + + #out = tf.nn.selu(out) + if keep_fake_carte_seq and mode=="seq": + return out, out_seq + return out, None + +def FM(feas): + feas = tf.stack(feas, aixs=1) + square_of_sum = tf.reduce_sum(feas, axis=1) ** 2 + sum_of_square = tf.reduce_sum(feas ** 2, axis=1) + return 0.5 * (square_of_sum - sum_of_square) + +class Model(object): + def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN, use_negsample=False, Flag="DNN"): + self.model_flag = Flag + self.reg = False + self.use_negsample= use_negsample + with tf.name_scope('Inputs'): + self.user_batch_ph = tf.placeholder(tf.int32, [None, None], name='user_batch_ph') + self.ad_batch_ph = tf.placeholder(tf.int32, [None, None], name='ad_batch_ph') + self.scene_batch_ph = tf.placeholder(tf.int32, [None, None], name='scene_batch_ph') + self.time_batch_ph = tf.placeholder(tf.int32, [None, ], name='time_batch_ph') + self.clk_seq_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='clk_seq_batch_ph') + self.carte_batch_ph = tf.placeholder(tf.int32, [None, None], name='carte_batch_ph') + #self.noclk_seq_batch_ph = tf.placeholder(tf.int32, [None, None], name='noclk_seq_batch_ph') + ''' + self.item_carte_batch_ph = tf.placeholder(tf.int32, [None, None], name='item_carte_batch_ph') + self.cate_carte_batch_ph = tf.placeholder(tf.int32, [None, None], name='cate_carte_batch_ph') + self.item_cate_carte_batch_ph = tf.placeholder(tf.int32, [None, None], name='item_cate_carte_batch_ph') + self.cate_item_carte_batch_ph = tf.placeholder(tf.int32, [None, None], name='cate_item_carte_batch_ph') + ''' + self.clk_mask = tf.placeholder(tf.float32, [None, None], name='clk_mask_batch_ph') + self.target_ph = tf.placeholder(tf.float32, [None, 2], name='target_ph') + self.lr = tf.placeholder(tf.float64, []) + + # Embedding layer + with tf.name_scope('Embedding_layer'): + + ad_ph = tf.split(self.ad_batch_ph, num_or_size_splits=10, axis=1) + scene_ph = tf.split(self.scene_batch_ph, num_or_size_splits=6, axis=1) + user_ph = tf.split(self.user_batch_ph, num_or_size_splits=4, axis=1) + feature_ph = [self.time_batch_ph] + ad_ph[:2] + scene_ph + user_ph + ad_ph[2:] + + self.embedding_vars = [] + features = [] + for i, num in enumerate(feature_num): + self.embedding_vars.append(tf.get_variable("embedding_var_fea{}".format(i), [num, EMBEDDING_DIM], trainable=True)) + features.append(tf.nn.embedding_lookup(self.embedding_vars[i], feature_ph[i] - id_offset[i])) + + self.user_batch_embedded = tf.concat(features[9:13], axis=1) + self.ad_batch_embedded = tf.concat(features[1:3]+features[13:], axis=1) + self.scene_batch_embedded = tf.concat(features[3:9], axis=1) + self.time_batch_embedded = features[0] + self.clk_seq_batch_embedded = tf.nn.embedding_lookup(self.embedding_vars[0], self.clk_seq_batch_ph) + + if use_cartes: + self.carte_embeddings_var = [] + self.carte_batch_embedded = 
+                self.carte_batch_embedded = []
+                for i, c in enumerate(use_cartes):
+                    self.carte_embeddings_var.append(tf.get_variable("carte_embedding_var_{}".format(c), [carte_num_dict[c], EMBEDDING_DIM], trainable=True))
+                    self.carte_batch_embedded.append(tf.nn.embedding_lookup(self.carte_embeddings_var[i], self.carte_batch_ph[:, i]))
+
+            ### fake carte ###
+            if emb_as_weight:
+                '''
+                TODO: support multi-group cartesian feature, e.g., 13-16-19
+                '''
+                idx_w, idx_x = map(int, edge_type.split('-'))
+
+                self.weight_embeddings_var = tf.get_variable("weight_embedding_var", [feature_num[idx_w] + 1, WEIGHT_EMB_NUM * WEIGHT_EMB_DIM], trainable=True)
+                self.weight_batch_embedded = tf.nn.embedding_lookup(self.weight_embeddings_var, feature_ph[idx_w])
+                if use_new_seq_emb:
+                    self.seq_embeddings_var = tf.get_variable("seq_embedding_var", [feature_num[idx_x], EMBEDDING_DIM * alpha], trainable=True)
+                    self.seq_his_batch_embedded = tf.nn.embedding_lookup(self.seq_embeddings_var, feature_ph[idx_x])
+
+        with tf.name_scope('init_operation'):
+            for i, num in enumerate(feature_num):
+                embedding_placeholder = tf.placeholder(tf.float32,[num, EMBEDDING_DIM], name="emb_ph_{}".format(i))
+                self.embedding_vars[i].assign(embedding_placeholder)
+
+            if use_cartes:
+                self.carte_embedding_placeholder = []
+                self.carte_embedding_init = []
+                for i, c in enumerate(use_cartes):
+                    self.carte_embedding_placeholder.append(tf.placeholder(tf.float32,[carte_num_dict[c], EMBEDDING_DIM], name="cid_emb_ph"))
+                    self.carte_embedding_init.append(self.carte_embeddings_var[i].assign(self.carte_embedding_placeholder[i]))
+
+        if self.use_negsample:
+            self.noclk_seq_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='noclk_seq_batch_ph')
+            self.noclk_seq_batch_embedded = tf.nn.embedding_lookup(self.embedding_vars[0], self.noclk_seq_batch_ph)
+            self.noclk_mask = tf.placeholder(tf.float32, [None, None], name='noclk_mask_batch_ph')
+            #self.mid_neg_batch_ph = tf.placeholder(tf.int32, [None, None], name='neg_his_batch_ph')
+            #self.cate_neg_batch_ph = tf.placeholder(tf.int32, [None, None], name='neg_cate_his_batch_ph')
+
+            #self.neg_item_his_eb = tf.nn.embedding_lookup(self.mid_embeddings_var, self.mid_neg_batch_ph)
+            #self.neg_cate_his_eb = tf.nn.embedding_lookup(self.mid_embeddings_var, self.cate_neg_batch_ph)
+            #self.neg_his_eb = tf.concat([self.neg_item_his_eb,self.neg_cate_his_eb], axis=2) * tf.reshape(self.mask,(BATCH_SIZE, SEQ_LEN, 1))
+            self.noclk_seq_eb = tf.concat(tf.unstack(tf.reshape(self.noclk_seq_batch_embedded,(BATCH_SIZE, 10, SEQ_LEN, EMBEDDING_DIM)), axis=1), axis=-1) * tf.reshape(self.noclk_mask,(BATCH_SIZE, SEQ_LEN, 1))
+
+        self.user_eb = tf.reshape(self.user_batch_embedded, [-1, EMBEDDING_DIM * 4]) # [batch, 4, dim] -> [batch, 4*dim]
+        self.ad_eb = tf.reshape(self.ad_batch_embedded, [-1, EMBEDDING_DIM * 10])
+        self.scene_eb = tf.reshape(self.scene_batch_embedded, [-1, EMBEDDING_DIM * 6])
+        self.time_eb = self.time_batch_embedded
+
+        self.clk_seq_eb = tf.concat(tf.unstack(tf.reshape(self.clk_seq_batch_embedded,(BATCH_SIZE, 10, SEQ_LEN, EMBEDDING_DIM)), axis=1), axis=-1) * tf.reshape(self.clk_mask, (BATCH_SIZE, SEQ_LEN, 1))
+        self.clk_seq_eb_sum = tf.reduce_sum(self.clk_seq_eb, 1)
+
+
+        self.carte_embs = []
+        if use_cartes:
+            self.carte_embs += self.carte_batch_embedded
+
+        if emb_as_weight:
+            if use_new_seq_emb:
+                seq_his_batch = self.seq_his_batch_embedded
+            else:
+                seq_his_batch = features[int(edge_type.split('-')[1])]
+            tmp_sum, tmp_seq = [], []
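+            # Added sketch of the step below: each WEIGHT_EMB_DIM-wide slice of the
+            # weight embedding is handed to eb_as_weight(), which interprets it as
+            # the parameters of a micro-MLP applied to the (reshaped) history
+            # embeddings; the per-slice co-action outputs are then concatenated.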
+            if CALC_MODE.startswith("seq"):
+                shape = (BATCH_SIZE, SEQ_LEN, EMBEDDING_DIM)
+            else:
+                shape = (BATCH_SIZE, EMBEDDING_DIM * alpha)
+            for i in range(WEIGHT_EMB_NUM):
+                fake_carte_sum, fake_carte_seq = eb_as_weight(self.weight_batch_embedded[:, i * WEIGHT_EMB_DIM: (i+1) * WEIGHT_EMB_DIM], tf.reshape(seq_his_batch, shape), EMBEDDING_DIM, mode=CALC_MODE)
+                tmp_sum.append(fake_carte_sum)
+                tmp_seq.append(fake_carte_seq)
+            self.fake_carte_sum = tf.concat(tmp_sum, axis=1)
+            if keep_fake_carte_seq:
+                self.fake_carte_seq = tmp_seq
+
+
+    def build_fcn_net(self, inp, use_dice = False):
+        bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
+        dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
+        if use_dice:
+            dnn1 = dice(dnn1, name='dice_1')
+        else:
+            dnn1 = prelu(dnn1, scope='prelu_1')
+
+        dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
+        if use_dice:
+            dnn2 = dice(dnn2, name='dice_2')
+        else:
+            dnn2 = prelu(dnn2, scope='prelu_2')
+
+        dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3')
+        self.y_hat = tf.nn.softmax(dnn3) + 0.00000001
+
+        with tf.name_scope('Metrics'):
+            # Cross-entropy loss and optimizer initialization
+            ctr_loss = - tf.reduce_mean(tf.log(self.y_hat) * self.target_ph)
+            self.loss = ctr_loss
+            if self.use_negsample:
+                self.loss += self.aux_loss
+            if self.reg:
+                self.loss += self.reg_loss
+
+            tf.summary.scalar('loss', self.loss)
+            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
+            # Accuracy metric
+            self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32))
+            tf.summary.scalar('accuracy', self.accuracy)
+
+        self.merged = tf.summary.merge_all()
+
+    def auxiliary_loss(self, h_states, click_seq, noclick_seq, clk_mask=None, noclk_mask = None, stag = None):
+        #mask = tf.cast(mask, tf.float32)
+        if noclk_mask is None:
+            noclk_mask = clk_mask
+        click_input_ = tf.concat([h_states, click_seq], -1)
+        noclick_input_ = tf.concat([h_states, noclick_seq], -1)
+        click_prop_ = self.auxiliary_net(click_input_, stag = stag)[:, :, 0]
+        noclick_prop_ = self.auxiliary_net(noclick_input_, stag = stag)[:, :, 0]
+
+        click_loss_ = - tf.reshape(tf.log(click_prop_), [-1, tf.shape(click_seq)[1]]) * clk_mask
+        noclick_loss_ = - tf.reshape(tf.log(1.0 - noclick_prop_), [-1, tf.shape(noclick_seq)[1]]) * noclk_mask
+
+        loss_ = tf.reduce_mean(click_loss_ + noclick_loss_)
+        return loss_
+
+    def auxiliary_net(self, in_, stag='auxiliary_net'):
+        bn1 = tf.layers.batch_normalization(inputs=in_, name='bn1' + stag, reuse=tf.AUTO_REUSE)
+        dnn1 = tf.layers.dense(bn1, 100, activation=None, name='f1' + stag, reuse=tf.AUTO_REUSE)
+        dnn1 = tf.nn.sigmoid(dnn1)
+        dnn2 = tf.layers.dense(dnn1, 50, activation=None, name='f2' + stag, reuse=tf.AUTO_REUSE)
+        dnn2 = tf.nn.sigmoid(dnn2)
+        dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3' + stag, reuse=tf.AUTO_REUSE)
+        y_hat = tf.nn.softmax(dnn3) + 0.000001
+        return y_hat
+
+    def init_uid_weight(self, sess, uid_weight):
+        sess.run(self.uid_embedding_init,feed_dict={self.uid_embedding_placeholder: uid_weight})
+
+    def init_mid_weight(self, sess, mid_weight):
+        sess.run([self.mid_embedding_init],feed_dict={self.mid_embedding_placeholder: mid_weight})
+
+    def save_mid_embedding_weight(self, sess):
+        embedding = sess.run(self.mid_embeddings_var)
+        return embedding
+
+    def save_uid_embedding_weight(self, sess):
+        embedding = sess.run(self.uid_bp_memory)
+        return embedding
+
+    def train(self, sess, inps):
+        input_dict = {
+            self.user_batch_ph: inps[0],
+            self.ad_batch_ph: inps[1],
+            self.scene_batch_ph: inps[2],
+            self.time_batch_ph: inps[3],
+            self.clk_seq_batch_ph: inps[4],
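+            # Added note: inps[5] is reserved for the no-click sequence and is only
+            # fed below when use_negsample is enabled (inps[7] carries its mask).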
+            self.clk_mask: inps[6],
+            self.target_ph: inps[-2],
+            self.lr: inps[-1],
+        }
+        if use_cartes:
+            input_dict[self.carte_batch_ph] = inps[-3]
+            if "item-his_item" in use_cartes:
+                input_dict[self.item_carte_batch_ph] = inps[10]
+            if "cate-his_cate" in use_cartes:
+                input_dict[self.cate_carte_batch_ph] = inps[11]
+            if "item-his_cate" in use_cartes:
+                input_dict[self.item_cate_carte_batch_ph] = inps[12]
+            if "cate-his_item" in use_cartes:
+                input_dict[self.cate_item_carte_batch_ph] = inps[13]
+
+        if self.use_negsample:
+            input_dict[self.noclk_seq_batch_ph] = inps[5]
+            input_dict[self.noclk_mask] = inps[7]
+            loss, aux_loss, accuracy, _ = sess.run([self.loss, self.aux_loss, self.accuracy, self.optimizer], feed_dict=input_dict)
+        else:
+            loss, accuracy, _ = sess.run([self.loss, self.accuracy, self.optimizer], feed_dict=input_dict)
+            aux_loss = 0
+        return loss, accuracy, aux_loss
+
+    def calculate(self, sess, inps):
+        input_dict = {
+            self.user_batch_ph: inps[0],
+            self.ad_batch_ph: inps[1],
+            self.scene_batch_ph: inps[2],
+            self.time_batch_ph: inps[3],
+            self.clk_seq_batch_ph: inps[4],
+            self.clk_mask: inps[6],
+            self.target_ph: inps[-1],
+        }
+        if use_cartes:
+            input_dict[self.carte_batch_ph] = inps[-2]
+
+            if "item-his_item" in use_cartes:
+                input_dict[self.item_carte_batch_ph] = inps[9]
+            if "cate-his_cate" in use_cartes:
+                input_dict[self.cate_carte_batch_ph] = inps[10]
+            if "item-his_cate" in use_cartes:
+                input_dict[self.item_cate_carte_batch_ph] = inps[11]
+            if "cate-his_item" in use_cartes:
+                input_dict[self.cate_item_carte_batch_ph] = inps[12]
+
+        if self.use_negsample:
+            input_dict[self.noclk_seq_batch_ph] = inps[5]
+            input_dict[self.noclk_mask] = inps[7]
+            probs, loss, accuracy, aux_loss = sess.run([self.y_hat, self.loss, self.accuracy, self.aux_loss], feed_dict=input_dict)
+        else:
+            probs, loss, accuracy = sess.run([self.y_hat, self.loss, self.accuracy], feed_dict=input_dict)
+            aux_loss = 0
+        return probs, loss, accuracy, aux_loss
+
+    def save(self, sess, path):
+        saver = tf.train.Saver()
+        saver.save(sess, save_path=path)
+
+    def restore(self, sess, path):
+        saver = tf.train.Saver()
+        saver.restore(sess, save_path=path)
+        print('model restored from %s' % path)
+
+class Model_DNN(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_DNN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="DNN")
+
+        #inp = tf.concat([self.item_eb, self.item_his_eb_sum], 1)
+        if emb_as_weight:
+            self.carte_embs.append(self.fake_carte_sum)
+        inp = tf.concat([self.user_eb, self.ad_eb, self.scene_eb, self.time_eb] + self.carte_embs, 1)
+        self.build_fcn_net(inp, use_dice=False)
+
+
+class Model_FFM(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_FFM, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="FFM")
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum], 1)
+        self.build_fcn_net(inp, use_dice=False)
+
+
+
+class Model_PNN(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_PNN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="PNN")
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum], 1)
+        self.build_fcn_net(inp, use_dice=False)
+
+
+class Model_GRU4REC(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
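+        # Added note: Model_GRU4REC, Model_DIN, Model_ARNN, Model_RUM and
+        # Model_MIMN are baselines carried over from the DIEN/MIMN code base;
+        # they still reference item_eb/item_his_eb/mask attributes that this
+        # Model variant does not build, so they are kept for reference rather
+        # than wired to the CAN feature layout above.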
+        super(Model_GRU4REC, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                            BATCH_SIZE, SEQ_LEN, Flag="GRU4REC")
+        with tf.name_scope('rnn_1'):
+            self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
+            rnn_outputs, final_state1 = dynamic_rnn(GRUCell(2*EMBEDDING_DIM), inputs=self.item_his_eb,
+                                                    sequence_length=self.sequence_length, dtype=tf.float32,
+                                                    scope="gru1")
+            tf.summary.histogram('GRU_outputs', rnn_outputs)
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum, final_state1], 1)
+        self.build_fcn_net(inp, use_dice=False)
+
+
+class Model_DIN(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_DIN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="DIN")
+        with tf.name_scope('Attention_layer'):
+            attention_output = din_attention(self.item_eb, self.item_his_eb, HIDDEN_SIZE, self.mask)
+            att_fea = tf.reduce_sum(attention_output, 1)
+            tf.summary.histogram('att_fea', att_fea)
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum, att_fea], -1)
+        self.build_fcn_net(inp, use_dice=False)
+
+
+class Model_ARNN(Model):
+    def __init__(self,n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
+        super(Model_ARNN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                         BATCH_SIZE, SEQ_LEN, Flag="ARNN")
+        with tf.name_scope('rnn_1'):
+            self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
+            rnn_outputs, final_state1 = dynamic_rnn(GRUCell(2*EMBEDDING_DIM), inputs=self.item_his_eb,
+                                                    sequence_length=self.sequence_length, dtype=tf.float32,
+                                                    scope="gru1")
+            tf.summary.histogram('GRU_outputs', rnn_outputs)
+        # Attention layer
+        with tf.name_scope('Attention_layer_1'):
+            att_gru = din_attention(self.item_eb, rnn_outputs, HIDDEN_SIZE, self.mask)
+            att_gru = tf.reduce_sum(att_gru, 1)
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum, final_state1, att_gru], -1)
+        self.build_fcn_net(inp, use_dice=False)
+
+class Model_RUM(Model):
+    def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, MEMORY_SIZE, SEQ_LEN=400, mask_flag=True):
+        super(Model_RUM, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                        BATCH_SIZE, SEQ_LEN, Flag="RUM")
+
+        def clear_mask_state(state, begin_state, mask, t):
+            state["controller_state"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1))) * begin_state["controller_state"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1)) * state["controller_state"]
+            state["M"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1))) * begin_state["M"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1)) * state["M"]
+            return state
+
+        cell = rum.RUMCell(controller_units=HIDDEN_SIZE, memory_size=MEMORY_SIZE, memory_vector_dim=2*EMBEDDING_DIM,read_head_num=1, write_head_num=1,
+                           reuse=False, output_dim=HIDDEN_SIZE, clip_value=20, batch_size=BATCH_SIZE)
+
+        state = cell.zero_state(BATCH_SIZE, tf.float32)
+        begin_state = state
+        for t in range(SEQ_LEN):
+            output, state = cell(self.item_his_eb[:, t, :], state)
+            if mask_flag:
+                state = clear_mask_state(state, begin_state, self.mask, t)
+
+        final_state = output
+        before_memory = state['M']
+        rum_att_hist = din_attention(self.item_eb, before_memory, HIDDEN_SIZE, None)
+
+        inp = tf.concat([self.item_eb, self.item_his_eb_sum, final_state, tf.squeeze(rum_att_hist)], 1)
+
+        self.build_fcn_net(inp, use_dice=False)
+
+class Model_DIEN(Model):
+    def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=400, use_negsample=False, use_mi_cons=False):
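+        # Added summary: DIEN here = GRU over the click sequence (rnn_1), an
+        # optional auxiliary / NCE-style loss between adjacent steps, DIN
+        # attention between the ad and the GRU states, then an attention-weighted
+        # AUGRU (rnn_2 with att_scores) whose final state feeds the MLP tower.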
+        super(Model_DIEN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                         BATCH_SIZE, SEQ_LEN, use_negsample, Flag="DIEN")
+
+        with tf.name_scope('rnn_1'):
+            self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
+            rnn_outputs, _ = dynamic_rnn(GRUCell(10*EMBEDDING_DIM), inputs=self.clk_seq_eb,
+                                         sequence_length=self.sequence_length, dtype=tf.float32,
+                                         scope="gru1")
+            tf.summary.histogram('GRU_outputs', rnn_outputs)
+
+        if use_negsample:
+            if use_mi_cons:
+                #aux_loss_1 = self.info_NCE(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :], self.mask[:, 1:])
+                #aux_loss_1 = self.info_NCE_aux(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :], self.neg_his_eb[:, 1:, :], self.mask[:, 1:])
+                aux_loss_1 = self.mi_loss(rnn_outputs[:, :-1, :], self.clk_seq_eb[:, 1:, :],
+                                          self.noclk_seq_eb[:, 1:, :], self.clk_mask[:, 1:], stag = "mi_0")
+            else:
+                aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.clk_seq_eb[:, 1:, :],
+                                                 self.noclk_seq_eb[:, 1:, :], self.clk_mask[:, 1:], self.noclk_mask[:, 1:], stag = "bigru_0")
+            self.aux_loss = aux_loss_1
+
+        # Attention layer
+        with tf.name_scope('Attention_layer_1'):
+            att_outputs, alphas = din_attention(self.ad_eb, rnn_outputs, HIDDEN_SIZE, mask=self.clk_mask, mode="LIST", return_alphas=True)
+            tf.summary.histogram('alpha_outputs', alphas)
+
+        with tf.name_scope('rnn_2'):
+            rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
+                                                     att_scores = tf.expand_dims(alphas, -1),
+                                                     sequence_length=self.sequence_length, dtype=tf.float32,
+                                                     scope="gru2")
+            tf.summary.histogram('GRU2_Final_State', final_state2)
+
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum], 1)
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum, self.item_carte_eb_sum], 1)
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum, self.cate_carte_eb_sum], 1)
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum, self.item_cate_carte_eb_sum], 1)
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum, self.cate_carte_eb_sum], 1)
+        #inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum, self.item_eb*self.item_his_eb_sum, self.item_carte_eb_sum, self.cate_carte_eb_sum], 1)
+
+
+        #if attention
+
+        if emb_as_weight:
+            if keep_fake_carte_seq:
+                if carte_with_gru:
+                    with tf.name_scope('rnn_3'):
+                        self.fake_carte_seq, _ = dynamic_rnn(GRUCell(EMBEDDING_DIM), inputs=self.fake_carte_seq,
+                                                             sequence_length=self.sequence_length, dtype=tf.float32,
+                                                             scope="gru3")
+
+                with tf.name_scope('Attention_layer_2'):
+                    carte_att_outputs, _ = din_attention(self.mid_batch_embedded, self.fake_carte_seq, HIDDEN_SIZE, mask=self.clk_mask, stag="carte", mode="SUM", return_alphas=True)
+                    self.carte_embs.append(tf.reduce_sum(carte_att_outputs, 1))
+                    #self.carte_embs.append(self.fake_carte_sum)
+            else:
+                self.carte_embs.append(self.fake_carte_sum)
+        inp = tf.concat([self.user_eb, self.ad_eb, self.scene_eb, self.time_eb, final_state2, self.clk_seq_eb_sum, self.ad_eb*self.clk_seq_eb_sum] + self.carte_embs, 1)
+        self.build_fcn_net(inp, use_dice=False)
+
+    def neg_sample(self, neg_his_emb, K=10, mode="random"):
+        shape = tf.shape(neg_his_emb)
+        batch, seq, dim = shape[0], shape[1], shape[2]
+
+        if mode == "random":
+            neg = tf.expand_dims(neg_his_emb, 1) #[batch, 1, seq, dim]
+            neg = tf.tile(neg, [1,seq, 1,1]) #[batch, seq, seq, dim]
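+            # Added note: the slice below just keeps the first K tiled negatives
+            # per step; the commented random_uniform/batch_gather pair sketches a
+            # random per-step sampling alternative that is disabled here.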
+            # index = tf.random_uniform((batch, seq, K), minval=0, maxval=seq, dtype=tf.int32)
+            # neg = tf.batch_gather(neg, index) #[batch, seq, K, dim]
+            neg = neg[:, :, :K, :]
+            return neg
+        elif mode == "aux":
+            neg = tf.expand_dims(neg_his_emb, 1)
+            return neg
+
+    def mi_loss_(self, h_states, click_seq, noclick_seq, mask = None, stag = None):
+        #mask = tf.cast(mask, tf.float32)
+        '''
+        h = self.mlp(h_states, stag = stag)
+        pos = self.mlp(click_seq, stag = stag)
+        neg = self.mlp(noclick_seq, stag = stag)
+
+        scores_pos = tf.matmul(h, pos)
+        scores_neg = tf.matmul(h, neg)
+        joint = tf.linalg.diag_part(score_pos)
+        '''
+        pos = tf.concat([h_states, click_seq], axis=2)
+        f_pos = self.mlp(pos) # [batch, seq, 1]
+
+        K = 99
+        neg = self.neg_sample(noclick_seq, K)
+        h_states_tiled = tf.tile(tf.expand_dims(h_states, 2), [1,1,K,1]) # [batch, seq, K, dim]
+        total = tf.concat([h_states_tiled, neg], axis=3)
+        f_neg = self.mlp(total) #[batch, seq, K, 1]
+        f_neg = tf.reduce_sum(f_neg, axis=2)
+        f_total = f_pos + f_neg
+
+        loss_ = tf.reshape(tf.log(f_pos / f_total), [-1, tf.shape(click_seq)[1]]) * mask
+        loss_ = - tf.reduce_mean(loss_)
+
+        return loss_
+
+    def mi_loss(self, h_states, click_seq, noclick_seq, mask, stag='NCE'):
+        exp = 'random_1'
+        if exp == 'random_1':
+            shape = tf.shape(h_states)
+            batch, len_seq, dim = shape[0], shape[1], shape[2]
+            Wk_ct = []
+            x = tf.layers.dense(click_seq, 256, activation=None, name='pos_enc')
+            x = tf.unstack(x, axis=1)
+            neg = tf.layers.dense(noclick_seq, 256, activation=None, name='neg_enc')
+            neg = tf.unstack(neg, axis=1)
+            c_t = tf.unstack(h_states, axis=1)
+            with tf.name_scope(stag):
+                for i in range(len(c_t)):
+                    Wk_ct.append(tf.layers.dense(c_t[i], 256, activation=None, name='W{}'.format(i)))
+            #nce = 0
+            nce = []
+            for i in range(len(c_t)):
+                s_p = tf.reduce_sum(x[i] * Wk_ct[i], axis=1, keep_dims=True) # shape=[batch,1]
+                s_n = tf.reduce_sum(neg[i] * Wk_ct[i], axis=1, keep_dims=True)
+                score = tf.concat([s_p, s_n], axis=1)
+                score = tf.nn.log_softmax(tf.exp(score), dim=1)
+                score = tf.reshape(score[:, 0], [-1])
+                nce.append(score)
+            nce = tf.stack(nce, axis=1) * mask
+            nce = tf.reduce_sum(nce)
+            nce /= -1.0 * tf.cast(batch*len_seq, tf.float32)
+            return nce
+        elif exp == 'random_all':
+            shape = tf.shape(h_states)
+            batch, len_seq, dim = shape[0], shape[1], shape[2]
+            Wk_ct = []
+            x = tf.layers.dense(click_seq, 256, activation=None, name='pos_enc')
+            x = tf.unstack(x, axis=1)
+            neg = tf.layers.dense(noclick_seq, 256, activation=None, name='neg_enc')
+            neg = tf.unstack(neg, axis=1)
+            c_t = tf.unstack(h_states, axis=1)
+            with tf.name_scope(stag):
+                for i in range(len(c_t)):
+                    Wk_ct.append(tf.layers.dense(c_t[i], 256, activation=None, name='W{}'.format(i)))
+            nce = []
+            for i in range(len(c_t)):
+                s_p = tf.reduce_sum(x[i] * Wk_ct[i], axis=1, keep_dims=True) # shape=[batch,1]
+                s_n = []
+                for j in range(len(neg)):
+                    s_n.append(tf.reduce_sum(neg[j] * Wk_ct[i], axis=1, keep_dims=True))
+                score = tf.concat([s_p] + s_n, axis=1)
+                score = tf.nn.log_softmax(tf.exp(score), dim=1)
+                score = tf.reshape(score[:, 0], [-1])
+                nce.append(score)
+            nce = tf.stack(nce, axis=1) * mask
+            nce = tf.reduce_sum(nce)
+            nce /= -1.0 * tf.cast(batch*len_seq, tf.float32)
+            return nce
+
+        elif exp == 'batch_1':
+            shape = tf.shape(click_seq)
+            batch, len_seq, dim = shape[0], shape[1], shape[2]
+            x = tf.layers.dense(click_seq, 256, activation=None, name='pos_enc')
+            x = tf.unstack(x, axis=1)
+            c_t = tf.unstack(h_states, axis=1)
+            # different W for every step
+            rand_idx = 12
+            Wk_ct = []
+            with tf.name_scope(stag):
+                for i in range(len(c_t)):
+                    Wk_ct.append(tf.layers.dense(c_t[i], 256, activation=None, name='W{}'.format(i)))
+            nce = []
+            for i in range(len(c_t)):
+                x_i = tf.tile(x[i], [2,1])
+                s_p = tf.reduce_sum(x_i[0:128, :] * Wk_ct[i], axis=1, keep_dims=True) # shape=[batch,1]
+                s_n = tf.reduce_sum(x_i[rand_idx:rand_idx+128] * Wk_ct[i], axis=1, keep_dims=True) # shape=[batch,1]
+                score = tf.concat([s_p, s_n], axis=1)
+                score = tf.nn.log_softmax(tf.exp(score), dim=1) # softmax over batch
+                score = tf.reshape(score[:, 0], [-1])
+                nce.append(score)
+            nce = tf.stack(nce, axis=1) * mask
+            nce = tf.reduce_sum(nce)
+            nce /= -1.0*tf.cast(batch*len_seq, tf.float32)
+            return nce
+
+        elif exp == 'batch_all':
+            shape = tf.shape(click_seq)
+            batch, len_seq, dim = shape[0], shape[1], shape[2]
+            x = tf.layers.dense(click_seq, 256, activation=None, name='pos_enc')
+            x = tf.unstack(x, axis=1)
+            c_t = tf.unstack(h_states, axis=1)
+            # different W for every step
+            Wk_ct = []
+            with tf.name_scope(stag):
+                for i in range(len(c_t)):
+                    Wk_ct.append(tf.layers.dense(c_t[i], 256, activation=None, name='W{}'.format(i)))
+            nce = []
+            for i in range(len(c_t)):
+                score = tf.exp(tf.matmul(x[i], tf.transpose(Wk_ct[i])))
+                score = tf.nn.log_softmax(score, dim=0) # softmax over batch
+                nce.append(tf.linalg.diag_part(score))
+                #nce += tf.reduce_sum(tf.linalg.diag_part(score))
+            nce = tf.stack(nce, axis=1) * mask
+            nce = tf.reduce_sum(nce)
+            nce /= -1.0*tf.cast(batch*len_seq, tf.float32)
+            return nce
+
+
+    def mlp(self, in_, stag='mlp'):
+        bn1 = tf.layers.batch_normalization(inputs=in_, name='bn1' + stag, reuse=tf.AUTO_REUSE)
+        dnn1 = tf.layers.dense(bn1, 1024, activation=None, name='f1' + stag, reuse=tf.AUTO_REUSE)
+        dnn2 = tf.layers.dense(dnn1, 512, activation=None, name='f2' + stag, reuse=tf.AUTO_REUSE)
+        dnn3 = tf.layers.dense(dnn2, 256, activation=None, name='f3' + stag, reuse=tf.AUTO_REUSE)
+        return dnn3
+        '''
+        dnn4 = tf.layers.dense(dnn3, 1, activation=None, name='f4' + stag, reuse=tf.AUTO_REUSE)
+        dnn4 = tf.nn.sigmoid(dnn4)
+        return dnn4
+        y_hat = tf.nn.softmax(dnn3) + 0.000001
+        return y_hat
+        '''
+
+    def auxiliary_loss(self, h_states, click_seq, noclick_seq, clk_mask=None, noclk_mask=None, stag=None):
+        if noclk_mask is None:
+            noclk_mask = clk_mask
+        mask = noclk_mask  # local mask used by the negative branches below
+        # positive
+        click_input = tf.concat([h_states, click_seq], -1)
+        click_prop = self.auxiliary_net(click_input, stag = stag)[:, :, 0]
+        click_loss = - tf.reshape(tf.log(click_prop), [-1, tf.shape(click_seq)[1]]) * clk_mask
+
+        # negative
+        exp = 'random_1'
+        if exp =='random_1':
+            return super(Model_DIEN, self).auxiliary_loss(h_states, click_seq, noclick_seq, clk_mask, noclk_mask, stag)
+        elif exp == 'random_all':
+            batch = 99
+            noclick_seq_ = tf.tile(noclick_seq, [1,2,1]) # shape = [batch, 2 * seq, dim] for sliding window
+            noclick_input = []
+            for i in range(99):
+                noclick_input.append(tf.concat([h_states, noclick_seq_[:, i:i+batch, :]], axis=-1))
+            noclick_input = tf.concat(noclick_input, axis=0)
+            mask = tf.tile(mask, [batch, 1])
+        elif exp == 'batch_1':
+            batch = 128
+            h_states = tf.unstack(h_states, axis=1)
+            click_seq = tf.unstack(click_seq, axis=1)
+            noclick_input = []
+            rand_idx = 12
+            for i in range(len(click_seq)):
+                h = h_states[i] # seq i of the batch, shape = [batch, dim]
+                c = click_seq[i]
+                c = tf.tile(c, [2, 1]) # sliding window
+                noclick_input.append(tf.concat([h, c[rand_idx:rand_idx+batch,:]], axis=1))
+            noclick_input = tf.stack(noclick_input, axis=1)
+        elif exp == 'batch_all':
+            batch = 128
+            h_states = tf.unstack(h_states, axis=1)
+            click_seq = tf.unstack(click_seq, axis=1)
+            noclick_input = []
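+            # Added note: 'batch_all' uses every other example of the duplicated
+            # batch as a negative for each step, which is why the mask is tiled
+            # (batch - 1) times below.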
+            for i in range(len(click_seq)):
+                h = h_states[i] # seq i of the batch, shape = [batch, dim]
+                c = click_seq[i]
+                c = tf.tile(c, [2, 1]) # sliding window
+                neg = []
+                for i in range(1, batch):
+                    neg.append(tf.concat([h, c[i:i+batch,:]], axis=1))
+                noclick_input.append(tf.concat(neg, axis=0))
+            noclick_input = tf.stack(noclick_input, axis=1)
+            mask = tf.tile(mask, [batch-1, 1])
+
+        noclick_prop = self.auxiliary_net(noclick_input, stag = stag)[:, :, 0]
+        noclick_loss = - tf.reshape(tf.log(1.0 - noclick_prop), [-1, tf.shape(noclick_seq)[1]]) * mask
+        loss_ = tf.reduce_mean(click_loss) + tf.reduce_mean(noclick_loss)
+        return loss_
+
+    def aux_batch(self, h_states, click_seq, noclick_seq, mask = None, stag = None):
+        #mask = tf.cast(mask, tf.float32)
+        # batch = tf.shape(h_states)[0]
+        batch = 128
+        click_input_ = tf.concat([h_states, click_seq], -1)
+        h_states_ = tf.unstack(h_states, axis=1)
+        click_seq_ = tf.unstack(click_seq, axis=1)
+        neg_input_total = []
+        for i in range(len(click_seq_)):
+            h = h_states_[i] # seq i of the batch [batch, dim]
+            c = click_seq_[i]
+            c = tf.tile(c, [2, 1]) # sliding window
+            neg = []
+            for i in range(1, batch):
+                neg.append(tf.concat([h, c[i:i+batch,:]], axis=1))
+            neg_input_total.append(tf.concat(neg, axis=0))
+        noclick_input_ = tf.stack(neg_input_total, axis=1)
+        #noclick_input_ = tf.concat([h_states, noclick_seq], -1)
+        click_prop_ = self.auxiliary_net(click_input_, stag = stag)[:, :, 0]
+        noclick_prop_ = self.auxiliary_net(noclick_input_, stag = stag)[:, :, 0]
+
+        click_loss_ = - tf.reshape(tf.log(click_prop_), [-1, tf.shape(click_seq)[1]]) * mask
+        mask = tf.tile(mask, [batch-1, 1])
+        noclick_loss_ = - tf.reshape(tf.log(1.0 - noclick_prop_), [-1, tf.shape(noclick_seq)[1]]) * mask
+
+        #loss_ = tf.reduce_mean(click_loss_ + noclick_loss_)
+        loss_ = tf.reduce_mean(click_loss_) + tf.reduce_mean(noclick_loss_)
+        return loss_
+
+
+
+
+class Model_MIMN(Model):
+    def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, MEMORY_SIZE, SEQ_LEN=400, Mem_Induction=0, Util_Reg=0, use_negsample=False, mask_flag=False):
+        super(Model_MIMN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
+                                         BATCH_SIZE, SEQ_LEN, use_negsample, Flag="MIMN")
+        self.reg = Util_Reg
+
+        def clear_mask_state(state, begin_state, begin_channel_rnn_state, mask, cell, t):
+            state["controller_state"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1))) * begin_state["controller_state"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1)) * state["controller_state"]
+            state["M"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1))) * begin_state["M"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1)) * state["M"]
+            state["key_M"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1))) * begin_state["key_M"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1)) * state["key_M"]
+            state["sum_aggre"] = (1-tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1))) * begin_state["sum_aggre"] + tf.reshape(mask[:,t], (BATCH_SIZE, 1, 1)) * state["sum_aggre"]
+            if Mem_Induction > 0:
+                temp_channel_rnn_state = []
+                for i in range(MEMORY_SIZE):
+                    temp_channel_rnn_state.append(cell.channel_rnn_state[i] * tf.expand_dims(mask[:,t], axis=1) + begin_channel_rnn_state[i]*(1- tf.expand_dims(mask[:,t], axis=1)))
+                cell.channel_rnn_state = temp_channel_rnn_state
+                temp_channel_rnn_output = []
+                for i in range(MEMORY_SIZE):
+                    temp_output = cell.channel_rnn_output[i] * tf.expand_dims(mask[:,t], axis=1) + begin_channel_rnn_output[i]*(1- tf.expand_dims(mask[:,t], axis=1))
+                    temp_channel_rnn_output.append(temp_output)
+                cell.channel_rnn_output = temp_channel_rnn_output
+
+            return state
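+        # Added note: the MIMN cell configuration below follows the MIMN code
+        # base; memory_vector_dim=2*EMBEDDING_DIM appears to match the
+        # item+category embedding concatenation that code base uses for
+        # self.item_his_eb.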
+        cell = mimn.MIMNCell(controller_units=HIDDEN_SIZE, memory_size=MEMORY_SIZE, memory_vector_dim=2*EMBEDDING_DIM,read_head_num=1, write_head_num=1,
+                             reuse=False, output_dim=HIDDEN_SIZE, clip_value=20, batch_size=BATCH_SIZE, mem_induction=Mem_Induction, util_reg=Util_Reg)
+
+        state = cell.zero_state(BATCH_SIZE, tf.float32)
+        if Mem_Induction > 0:
+            begin_channel_rnn_output = cell.channel_rnn_output
+        else:
+            begin_channel_rnn_output = 0.0
+
+        begin_state = state
+        self.state_list = [state]
+        self.mimn_o = []
+        for t in range(SEQ_LEN):
+            output, state, temp_output_list = cell(self.item_his_eb[:, t, :], state)
+            if mask_flag:
+                state = clear_mask_state(state, begin_state, begin_channel_rnn_output, self.mask, cell, t)
+            self.mimn_o.append(output)
+            self.state_list.append(state)
+
+        self.mimn_o = tf.stack(self.mimn_o, axis=1)
+        self.state_list.append(state)
+        mean_memory = tf.reduce_mean(state['sum_aggre'], axis=-2)
+
+        before_aggre = state['w_aggre']
+        read_out, _, _ = cell(self.item_eb, state)
+
+        if use_negsample:
+            aux_loss_1 = self.auxiliary_loss(self.mimn_o[:, :-1, :], self.item_his_eb[:, 1:, :],
+                                             self.neg_his_eb[:, 1:, :], self.mask[:, 1:], stag = "bigru_0")
+            self.aux_loss = aux_loss_1
+
+        if self.reg:
+            self.reg_loss = cell.capacity_loss(before_aggre)
+        else:
+            self.reg_loss = tf.zeros(1)
+
+        if Mem_Induction == 1:
+            channel_memory_tensor = tf.concat(temp_output_list, 1)
+            multi_channel_hist = din_attention(self.item_eb, channel_memory_tensor, HIDDEN_SIZE, None, stag='pal')
+            inp = tf.concat([self.item_eb, self.item_his_eb_sum, read_out, tf.squeeze(multi_channel_hist), mean_memory*self.item_eb], 1)
+        else:
+            inp = tf.concat([self.item_eb, self.item_his_eb_sum, read_out, mean_memory*self.item_eb], 1)
+
+        self.build_fcn_net(inp, use_dice=False)
diff --git a/modelzoo/CAN/script/process_data.py b/modelzoo/CAN/script/process_data.py
new file mode 100644
index 00000000000..18bf7ebdd7d
--- /dev/null
+++ b/modelzoo/CAN/script/process_data.py
@@ -0,0 +1,101 @@
+import sys
+import random
+import time
+
+def process_meta(file):
+    fi = open(file, "r")
+    fo = open("item-info", "w")
+    for line in fi:
+        obj = eval(line)
+        cat = obj["categories"][0][-1]
+        print(obj["asin"] + "\t" + cat,file=fo)
+
+def process_reviews(file):
+    fi = open(file, "r")
+    user_map = {}
+    fo = open("reviews-info", "w")
+    for line in fi:
+        obj = eval(line)
+        userID = obj["reviewerID"]
+        itemID = obj["asin"]
+        rating = obj["overall"]
+        time = obj["unixReviewTime"]
+        print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(time),file=fo)
+
+def manual_join():
+    f_rev = open("reviews-info", "r")
+    user_map = {}
+    item_list = []
+    for line in f_rev:
+        line = line.strip()
+        items = line.split("\t")
+        #loctime = time.localtime(float(items[-1]))
+        #items[-1] = time.strftime('%Y-%m-%d', loctime)
+        if items[0] not in user_map:
+            user_map[items[0]]= []
+        user_map[items[0]].append(("\t".join(items), float(items[-1])))
+        item_list.append(items[1])
+    f_meta = open("item-info", "r")
+    meta_map = {}
+    for line in f_meta:
+        arr = line.strip().split("\t")
+        if arr[0] not in meta_map:
+            meta_map[arr[0]] = arr[1]
+            arr = line.strip().split("\t")
+    fo = open("jointed-new", "w")
+    for key in user_map:
+        sorted_user_bh = sorted(user_map[key], key=lambda x:x[1])
+        for line, t in sorted_user_bh:
+            items = line.split("\t")
+            asin = items[1]
+            j = 0
+            while True:
+                asin_neg_index = random.randint(0, len(item_list) - 1)
+                asin_neg = item_list[asin_neg_index]
+                if asin_neg == asin:
+                    continue
+                items[1] = asin_neg
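+                # Added comment: one random negative per positive (j breaks at 1
+                # below); the sampled item replaces the true asin and is written
+                # out with label "0".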
"\t" + meta_map[asin_neg],file=fo) + j += 1 + if j == 1: #negative sampling frequency + break + if asin in meta_map: + print("1" + "\t" + line + "\t" + meta_map[asin],file=fo) + else: + print("1" + "\t" + line + "\t" + "default_cat",file=fo) + + +def split_test(): + fi = open("jointed-new", "r") + fo = open("jointed-new-split-info", "w") + user_count = {} + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user not in user_count: + user_count[user] = 0 + user_count[user] += 1 + fi.seek(0) + i = 0 + last_user = "A26ZDKC53OP6JD" + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user == last_user: + if i < user_count[user] - 2: # 1 + negative samples + print("20180118" + "\t" + line,file=fo) + else: + print("20190119" + "\t" + line,file=fo) + else: + last_user = user + i = 0 + if i < user_count[user] - 2: + print("20180118" + "\t" + line,file=fo) + else: + print("20190119" + "\t" + line,file=fo) + i += 1 + +process_meta(sys.argv[1]) +process_reviews(sys.argv[2]) +manual_join() +split_test() diff --git a/modelzoo/CAN/script/rnn.py b/modelzoo/CAN/script/rnn.py new file mode 100644 index 00000000000..da2351b2dbb --- /dev/null +++ b/modelzoo/CAN/script/rnn.py @@ -0,0 +1,1454 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""RNN helpers for TensorFlow models. + + +@@bidirectional_dynamic_rnn +@@dynamic_rnn +@@raw_rnn +@@static_rnn +@@static_state_saving_rnn +@@static_bidirectional_rnn +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest + + +# pylint: disable=protected-access +_concat = rnn_cell_impl._concat +#_like_rnncell = rnn_cell_impl._like_rnncell +_like_rnncell = rnn_cell_impl.assert_like_rnncell +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + + Retains as much of the static shape information as possible. + + Args: + x: A tensor of rank 2 or higher. + + Returns: + x transposed along the first two dimensions. + + Raises: + ValueError: if `x` is rank 1 or lower. 
+ """ + x_static_shape = x.get_shape() + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + (x, x_static_shape)) + x_rank = array_ops.rank(x) + x_t = array_ops.transpose( + x, array_ops.concat( + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + x_t.set_shape( + tensor_shape.TensorShape([ + x_static_shape[1].value, x_static_shape[0].value + ]).concatenate(x_static_shape[2:])) + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + + Args: + flat_input: An iterable of time major input Tensors of shape [max_time, + batch_size, ...]. All inputs should have compatible batch sizes. + + Returns: + The batch size in Python integer if available, or a scalar Tensor otherwise. + + Raises: + ValueError: if there is any input with an invalid shape. + """ + for input_ in flat_input: + shape = input_.shape + if shape.ndims is None: + continue + if shape.ndims < 2: + raise ValueError( + "Expected input tensor %s to have rank at least 2" % input_) + batch_size = shape[1].value + if batch_size is not None: + return batch_size + # Fallback to the dynamic batch size of the first input. + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + + Args: + explicit_dtype: explicitly declared dtype or None. + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + Tensors. + + Returns: + dtype: inferred dtype of hidden state. + + Raises: + ValueError: if `state` has heterogeneous dtypes or is empty. + """ + if explicit_dtype is not None: + return explicit_dtype + elif nest.is_sequence(state): + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + if not all_same: + raise ValueError( + "State has tensors of different inferred_dtypes. Unable to infer a " + "single representative dtype.") + return inferred_dtypes[0] + else: + return state.dtype + + +# pylint: disable=unused-argument +def _rnn_step( + time, sequence_length, min_sequence_length, max_sequence_length, + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + + Returns an (output, state) pair conditioned on the sequence_lengths. + When skip_conditionals=False, the pseudocode is something like: + + if t >= max_sequence_length: + return (zero_output, state) + if t < min_sequence_length: + return call_cell() + + # Selectively output zeros or output, old state or new state depending + # on if we've finished calculating each row. 
+  new_output, new_state = call_cell()
+  final_output = np.vstack([
+    zero_output if time >= sequence_lengths[r] else new_output_r
+    for r, new_output_r in enumerate(new_output)
+  ])
+  final_state = np.vstack([
+    state[r] if time >= sequence_lengths[r] else new_state_r
+    for r, new_state_r in enumerate(new_state)
+  ])
+  return (final_output, final_state)
+
+  Args:
+    time: Python int, the current time step
+    sequence_length: int32 `Tensor` vector of size [batch_size]
+    min_sequence_length: int32 `Tensor` scalar, min of sequence_length
+    max_sequence_length: int32 `Tensor` scalar, max of sequence_length
+    zero_output: `Tensor` vector of shape [output_size]
+    state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`,
+      or a list/tuple of such tensors.
+    call_cell: lambda returning tuple of (new_output, new_state) where
+      new_output is a `Tensor` matrix of shape `[batch_size, output_size]`.
+      new_state is a `Tensor` matrix of shape `[batch_size, state_size]`.
+    state_size: The `cell.state_size` associated with the state.
+    skip_conditionals: Python bool, whether to skip using the conditional
+      calculations.  This is useful for `dynamic_rnn`, where the input tensor
+      matches `max_sequence_length`, and using conditionals just slows
+      everything down.
+
+  Returns:
+    A tuple of (`final_output`, `final_state`) as given by the pseudocode above:
+      final_output is a `Tensor` matrix of shape [batch_size, output_size]
+      final_state is either a single `Tensor` matrix, or a tuple of such
+        matrices (matching length and shapes of input `state`).
+
+  Raises:
+    ValueError: If the cell returns a state tuple whose length does not match
+      that returned by `state_size`.
+  """
+
+  # Convert state to a list for ease of use
+  flat_state = nest.flatten(state)
+  flat_zero_output = nest.flatten(zero_output)
+
+  def _copy_one_through(output, new_output):
+    # If the state contains a scalar value we simply pass it through.
+    if output.shape.ndims == 0:
+      return new_output
+    copy_cond = (time >= sequence_length)
+    with ops.colocate_with(new_output):
+      return array_ops.where(copy_cond, output, new_output)
+
+  def _copy_some_through(flat_new_output, flat_new_state):
+    # Use broadcasting select to determine which values should get
+    # the previous state & zero output, and which values should get
+    # a calculated state & output.
+    flat_new_output = [
+        _copy_one_through(zero_output, new_output)
+        for zero_output, new_output in zip(flat_zero_output, flat_new_output)]
+    flat_new_state = [
+        _copy_one_through(state, new_state)
+        for state, new_state in zip(flat_state, flat_new_state)]
+    return flat_new_output + flat_new_state
+
+  def _maybe_copy_some_through():
+    """Run RNN step.  Pass through either no or some past state."""
+    new_output, new_state = call_cell()
+
+    nest.assert_same_structure(state, new_state)
+
+    flat_new_state = nest.flatten(new_state)
+    flat_new_output = nest.flatten(new_output)
+    return control_flow_ops.cond(
+        # if t < min_seq_len: calculate and return everything
+        time < min_sequence_length, lambda: flat_new_output + flat_new_state,
+        # else copy some of it through
+        lambda: _copy_some_through(flat_new_output, flat_new_state))
+
+  # TODO(ebrevdo): skipping these conditionals may cause a slowdown,
+  # but benefits from removing cond() and its gradient.  We should
+  # profile with and without this switch here.
+  if skip_conditionals:
+    # Instead of using conditionals, perform the selective copy at all time
+    # steps.  This is faster when max_seq_len is equal to the number of unrolls
+    # (which is typical for dynamic_rnn).
+    new_output, new_state = call_cell()
+    nest.assert_same_structure(state, new_state)
+    new_state = nest.flatten(new_state)
+    new_output = nest.flatten(new_output)
+    final_output_and_state = _copy_some_through(new_output, new_state)
+  else:
+    empty_update = lambda: flat_zero_output + flat_state
+    final_output_and_state = control_flow_ops.cond(
+        # if t >= max_seq_len: copy all state through, output zeros
+        time >= max_sequence_length, empty_update,
+        # otherwise calculation is required: copy some or all of it through
+        _maybe_copy_some_through)
+
+  if len(final_output_and_state) != len(flat_zero_output) + len(flat_state):
+    raise ValueError("Internal error: state and output were not concatenated "
+                     "correctly.")
+  final_output = final_output_and_state[:len(flat_zero_output)]
+  final_state = final_output_and_state[len(flat_zero_output):]
+
+  for output, flat_output in zip(final_output, flat_zero_output):
+    output.set_shape(flat_output.get_shape())
+  for substate, flat_substate in zip(final_state, flat_state):
+    substate.set_shape(flat_substate.get_shape())
+
+  final_output = nest.pack_sequence_as(
+      structure=zero_output, flat_sequence=final_output)
+  final_state = nest.pack_sequence_as(
+      structure=state, flat_sequence=final_state)
+
+  return final_output, final_state
+
+
+def _reverse_seq(input_seq, lengths):
+  """Reverse a list of Tensors up to specified lengths.
+
+  Args:
+    input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features)
+      or nested tuples of tensors.
+    lengths: A `Tensor` of dimension batch_size, containing lengths for each
+      sequence in the batch. If "None" is specified, simply reverses
+      the list.
+
+  Returns:
+    time-reversed sequence
+  """
+  if lengths is None:
+    return list(reversed(input_seq))
+
+  flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq)
+
+  flat_results = [[] for _ in range(len(input_seq))]
+  for sequence in zip(*flat_input_seq):
+    input_shape = tensor_shape.unknown_shape(
+        ndims=sequence[0].get_shape().ndims)
+    for input_ in sequence:
+      input_shape.merge_with(input_.get_shape())
+      input_.set_shape(input_shape)
+
+    # Join into (time, batch_size, depth)
+    s_joined = array_ops.stack(sequence)
+
+    # Reverse along dimension 0
+    s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1)
+    # Split again into list
+    result = array_ops.unstack(s_reversed)
+    for r, flat_result in zip(result, flat_results):
+      r.set_shape(input_shape)
+      flat_result.append(r)
+
+  results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result)
+             for input_, flat_result in zip(input_seq, flat_results)]
+  return results
+
+
+def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
+                              initial_state_fw=None, initial_state_bw=None,
+                              dtype=None, parallel_iterations=None,
+                              swap_memory=False, time_major=False, scope=None):
+  """Creates a dynamic version of bidirectional recurrent neural network.
+
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
+  length(s) of the sequence(s) or completely unrolled if length(s) is not
+  given.
+
+  Args:
+    cell_fw: An instance of RNNCell, to be used for forward direction.
+    cell_bw: An instance of RNNCell, to be used for backward direction.
+    inputs: The RNN inputs.
+      If time_major == False (default), this must be a tensor of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If time_major == True, this must be a tensor of shape:
+        `[max_time, batch_size, ...]`, or a nested tuple of such elements.
+    sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
+      containing the actual lengths for each of the sequences in the batch.
+      If not provided, all batch entries are assumed to be full sequences; and
+      time reversal is applied from time `0` to `max_time` for each sequence.
+    initial_state_fw: (optional) An initial state for the forward RNN.
+      This must be a tensor of appropriate type and shape
+      `[batch_size, cell_fw.state_size]`.
+      If `cell_fw.state_size` is a tuple, this should be a tuple of
+      tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using
+      the corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial states and expected output.
+      Required if initial_states are not provided or RNN states have a
+      heterogeneous dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency
+      and can be run in parallel, will be.  This parameter trades off
+      time for space.  Values >> 1 use more memory but take less time,
+      while smaller values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs
+      which would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors.
+      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
+      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
+      Using `time_major = True` is a bit more efficient because it avoids
+      transposes at the beginning and end of the RNN calculation.  However,
+      most TensorFlow data is batch-major, so by default this function
+      accepts input and emits output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to
+      "bidirectional_rnn"
+
+  Returns:
+    A tuple (outputs, output_states) where:
+      outputs: A tuple (output_fw, output_bw) containing the forward and
+        the backward rnn output `Tensor`.
+        If time_major == False (default),
+          output_fw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_bw.output_size]`.
+        If time_major == True,
+          output_fw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_bw.output_size]`.
+        It returns a tuple instead of a single concatenated `Tensor`, unlike
+        in the `bidirectional_rnn`. If the concatenated one is preferred,
+        the forward and backward outputs can be concatenated as
+        `tf.concat(outputs, 2)`.
+      output_states: A tuple (output_state_fw, output_state_bw) containing
+        the forward and the backward final states of bidirectional rnn.
+
+  Raises:
+    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
+ """ + + if not _like_rnncell(cell_fw): + raise TypeError("cell_fw must be an instance of RNNCell") + if not _like_rnncell(cell_bw): + raise TypeError("cell_bw must be an instance of RNNCell") + + with vs.variable_scope(scope or "bidirectional_rnn"): + # Forward direction + with vs.variable_scope("fw") as fw_scope: + output_fw, output_state_fw = dynamic_rnn( + cell=cell_fw, inputs=inputs, sequence_length=sequence_length, + initial_state=initial_state_fw, dtype=dtype, + parallel_iterations=parallel_iterations, swap_memory=swap_memory, + time_major=time_major, scope=fw_scope) + + # Backward direction + if not time_major: + time_dim = 1 + batch_dim = 0 + else: + time_dim = 0 + batch_dim = 1 + + def _reverse(input_, seq_lengths, seq_dim, batch_dim): + if seq_lengths is not None: + return array_ops.reverse_sequence( + input=input_, seq_lengths=seq_lengths, + seq_dim=seq_dim, batch_dim=batch_dim) + else: + return array_ops.reverse(input_, axis=[seq_dim]) + + with vs.variable_scope("bw") as bw_scope: + inputs_reverse = _reverse( + inputs, seq_lengths=sequence_length, + seq_dim=time_dim, batch_dim=batch_dim) + tmp, output_state_bw = dynamic_rnn( + cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, + initial_state=initial_state_bw, dtype=dtype, + parallel_iterations=parallel_iterations, swap_memory=swap_memory, + time_major=time_major, scope=bw_scope) + + output_bw = _reverse( + tmp, seq_lengths=sequence_length, + seq_dim=time_dim, batch_dim=batch_dim) + + outputs = (output_fw, output_bw) + output_states = (output_state_fw, output_state_bw) + + return (outputs, output_states) + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + dtype=None, parallel_iterations=None, swap_memory=False, + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + + Performs fully dynamic unrolling of `inputs`. + + Example: + + ```python + # create a BasicRNNCell + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + + # defining initial state + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + + # 'state' is a tensor of shape [batch_size, cell_state_size] + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + initial_state=initial_state, + dtype=tf.float32) + ``` + + ```python + # create 2 LSTMCells + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + + # create a RNN cell composed sequentially of a number of RNNCells + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + # 'state' is a N-tuple where N is the number of LSTMCells containing a + # tf.contrib.rnn.LSTMStateTuple for each cell + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + inputs=data, + dtype=tf.float32) + ``` + + + Args: + cell: An instance of RNNCell. + inputs: The RNN inputs. + If `time_major == False` (default), this must be a `Tensor` of shape: + `[batch_size, max_time, ...]`, or a nested tuple of such + elements. + If `time_major == True`, this must be a `Tensor` of shape: + `[max_time, batch_size, ...]`, or a nested tuple of such + elements. + This may also be a (possibly nested) tuple of Tensors satisfying + this property. The first two dimensions must match across all the inputs, + but otherwise the ranks and other shape components may differ. 
+      In this case, input to `cell` at each time-step will replicate the
+      structure of these tuples, except for the time dimension (from which the
+      time is taken).
+      The input to `cell` at each time step will be a `Tensor` or (possibly
+      nested) tuple of Tensors each with dimensions `[batch_size, ...]`.
+    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
+      Used to copy-through state and zero-out outputs when past a batch
+      element's sequence length.  So it's more for correctness than performance.
+    initial_state: (optional) An initial state for the RNN.
+      If `cell.state_size` is an integer, this must be
+      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
+      If `cell.state_size` is a tuple, this should be a tuple of
+      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_state is not provided or RNN state has a heterogeneous
+      dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency
+      and can be run in parallel, will be.  This parameter trades off
+      time for space.  Values >> 1 use more memory but take less time,
+      while smaller values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs
+      which would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors.
+      If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
+      If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
+      Using `time_major = True` is a bit more efficient because it avoids
+      transposes at the beginning and end of the RNN calculation.  However,
+      most TensorFlow data is batch-major, so by default this function
+      accepts input and emits output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to "rnn".
+
+  Returns:
+    A pair (outputs, state) where:
+
+    outputs: The RNN output `Tensor`.
+
+      If time_major == False (default), this will be a `Tensor` shaped:
+        `[batch_size, max_time, cell.output_size]`.
+
+      If time_major == True, this will be a `Tensor` shaped:
+        `[max_time, batch_size, cell.output_size]`.
+
+      Note, if `cell.output_size` is a (possibly nested) tuple of integers
+      or `TensorShape` objects, then `outputs` will be a tuple having the
+      same structure as `cell.output_size`, containing Tensors having shapes
+      corresponding to the shape data in `cell.output_size`.
+
+    state: The final state.  If `cell.state_size` is an int, this
+      will be shaped `[batch_size, cell.state_size]`.  If it is a
+      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
+      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
+      be a tuple having the corresponding shapes. If cells are `LSTMCells`
+      `state` will be a tuple containing a `LSTMStateTuple` for each cell.
+
+  Raises:
+    TypeError: If `cell` is not an instance of RNNCell.
+    ValueError: If inputs is None or an empty list.
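+  Note:
+    `att_scores` is an addition over stock TensorFlow rnn.py (kept for the
+    attention-updated GRU in this model zoo): when provided, shaped
+    `[batch_size, max_time, 1]` for batch-major inputs, it is forwarded through
+    `_dynamic_rnn_loop` to the cell at every time step.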
+ """ + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + + # By default, time_major==False and inputs are batch-major: shaped + # [batch, time, depth] + # For internal calculations, we transpose to [time, batch, depth] + flat_input = nest.flatten(inputs) + + if not time_major: + # (B,T,D) => (T,B,D) + flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] + flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input) + + parallel_iterations = parallel_iterations or 32 + if sequence_length is not None: + sequence_length = math_ops.to_int32(sequence_length) + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + "sequence_length must be a vector of length batch_size, " + "but saw shape: %s" % sequence_length.get_shape()) + sequence_length = array_ops.identity( # Just to find it in the graph. + sequence_length, name="sequence_length") + + # Create a new scope in which the caching device is either + # determined by the parent scope, or is set to place the cached + # Variable using the same placement as for the rest of the RNN. + with vs.variable_scope(scope or "rnn") as varscope: + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + batch_size = _best_effort_input_batch_size(flat_input) + + if initial_state is not None: + state = initial_state + else: + if not dtype: + raise ValueError("If there is no initial_state, you must give a dtype.") + state = cell.zero_state(batch_size, dtype) + + def _assert_has_shape(x, shape): + x_shape = array_ops.shape(x) + packed_shape = array_ops.stack(shape) + return control_flow_ops.Assert( + math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), + ["Expected shape for Tensor %s is " % x.name, + packed_shape, " but saw shape: ", x_shape]) + + if sequence_length is not None: + # Perform some shape validation + with ops.control_dependencies( + [_assert_has_shape(sequence_length, [batch_size])]): + sequence_length = array_ops.identity( + sequence_length, name="CheckSeqLen") + + inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + + (outputs, final_state) = _dynamic_rnn_loop( + cell, + inputs, + state, + parallel_iterations=parallel_iterations, + swap_memory=swap_memory, + att_scores = att_scores, + sequence_length=sequence_length, + dtype=dtype) + + # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. + # If we are performing batch-major calculations, transpose output back + # to shape [batch, time, depth] + if not time_major: + # (T,B,D) => (B,T,D) + outputs = nest.map_structure(_transpose_batch_time, outputs) + + return (outputs, final_state) + + +def _dynamic_rnn_loop(cell, + inputs, + initial_state, + parallel_iterations, + swap_memory, + att_scores = None, + sequence_length=None, + dtype=None): + """Internal implementation of Dynamic RNN. + + Args: + cell: An instance of RNNCell. + inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested + tuple of such elements. + initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if + `cell.state_size` is a tuple, then this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + parallel_iterations: Positive Python int. + swap_memory: A Python boolean + sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. + dtype: (optional) Expected dtype of output. If not specified, inferred from + initial_state. + + Returns: + Tuple `(final_outputs, final_state)`. 
+ final_outputs: + A `Tensor` of shape `[time, batch_size, cell.output_size]`. If + `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape` + objects, then this returns a (possibly nsted) tuple of Tensors matching + the corresponding shapes. + final_state: + A `Tensor`, or possibly nested tuple of Tensors, matching in length + and shapes to `initial_state`. + + Raises: + ValueError: If the input depth cannot be inferred via shape inference + from the inputs. + """ + state = initial_state + assert isinstance(parallel_iterations, int), "parallel_iterations must be int" + + state_size = cell.state_size + + flat_input = nest.flatten(inputs) + flat_output_size = nest.flatten(cell.output_size) + + # Construct an initial output + input_shape = array_ops.shape(flat_input[0]) + time_steps = input_shape[0] + batch_size = _best_effort_input_batch_size(flat_input) + + inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3) + for input_ in flat_input) + + const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2] + + for shape in inputs_got_shape: + if not shape[2:].is_fully_defined(): + raise ValueError( + "Input size (depth of inputs) must be accessible via shape inference," + " but saw value None.") + got_time_steps = shape[0].value + got_batch_size = shape[1].value + if const_time_steps != got_time_steps: + raise ValueError( + "Time steps is not the same for all the elements in the input in a " + "batch.") + if const_batch_size != got_batch_size: + raise ValueError( + "Batch_size is not the same for all the elements in the input.") + + # Prepare dynamic conditional copying of state & output + def _create_zero_arrays(size): + size = _concat(batch_size, size) + return array_ops.zeros( + array_ops.stack(size), _infer_state_dtype(dtype, state)) + + flat_zero_output = tuple(_create_zero_arrays(output) + for output in flat_output_size) + zero_output = nest.pack_sequence_as(structure=cell.output_size, + flat_sequence=flat_zero_output) + + if sequence_length is not None: + min_sequence_length = math_ops.reduce_min(sequence_length) + max_sequence_length = math_ops.reduce_max(sequence_length) + + time = array_ops.constant(0, dtype=dtypes.int32, name="time") + + with ops.name_scope("dynamic_rnn") as scope: + base_name = scope + + def _create_ta(name, dtype): + return tensor_array_ops.TensorArray(dtype=dtype, + size=time_steps, + tensor_array_name=base_name + name) + + output_ta = tuple(_create_ta("output_%d" % i, + _infer_state_dtype(dtype, state)) + for i in range(len(flat_output_size))) + input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype) + for i in range(len(flat_input))) + + input_ta = tuple(ta.unstack(input_) + for ta, input_ in zip(input_ta, flat_input)) + + def _time_step(time, output_ta_t, state, att_scores=None): + """Take a time step of the dynamic RNN. + + Args: + time: int32 scalar Tensor. + output_ta_t: List of `TensorArray`s that represent the output. + state: nested tuple of vector tensors that represent the state. + + Returns: + The tuple (time + 1, output_ta_t with updated flow, new_state). 
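+
+    Note: when `att_scores` is given, the slice `att_scores[:, time, :]` for
+    the current step is passed to the cell as a third positional argument,
+    i.e. `cell(input_t, state, att_score)`.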
+ """ + + input_t = tuple(ta.read(time) for ta in input_ta) + # Restore some shape information + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + if att_scores is not None: + att_score = att_scores[:, time, :] + call_cell = lambda: cell(input_t, state, att_score) + else: + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + (output, new_state) = _rnn_step( + time=time, + sequence_length=sequence_length, + min_sequence_length=min_sequence_length, + max_sequence_length=max_sequence_length, + zero_output=zero_output, + state=state, + call_cell=call_cell, + state_size=state_size, + skip_conditionals=True) + else: + (output, new_state) = call_cell() + + # Pack state if using state tuples + output = nest.flatten(output) + + output_ta_t = tuple( + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + if att_scores is not None: + return (time + 1, output_ta_t, new_state, att_scores) + else: + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + cond=lambda time, *_: time < time_steps, + body=_time_step, + loop_vars=(time, output_ta, state, att_scores), + parallel_iterations=parallel_iterations, + swap_memory=swap_memory) + else: + _, output_final_ta, final_state = control_flow_ops.while_loop( + cond=lambda time, *_: time < time_steps, + body=_time_step, + loop_vars=(time, output_ta, state), + parallel_iterations=parallel_iterations, + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + [const_time_steps, const_batch_size], output_size, static=True) + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) + + +def raw_rnn(cell, loop_fn, + parallel_iterations=None, swap_memory=False, scope=None): + """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`. + + **NOTE: This method is still in testing, and the API may change.** + + This function is a more primitive version of `dynamic_rnn` that provides + more direct access to the inputs each iteration. It also provides more + control over when to start and finish reading the sequence, and + what to emit for the output. + + For example, it can be used to implement the dynamic decoder of a seq2seq + model. + + Instead of working with `Tensor` objects, most operations work with + `TensorArray` objects directly. + + The operation of `raw_rnn`, in pseudo-code, is basically the following: + + ```python + time = tf.constant(0, dtype=tf.int32) + (finished, next_input, initial_state, _, loop_state) = loop_fn( + time=time, cell_output=None, cell_state=None, loop_state=None) + emit_ta = TensorArray(dynamic_size=True, dtype=initial_state.dtype) + state = initial_state + while not all(finished): + (output, cell_state) = cell(next_input, state) + (next_finished, next_input, next_state, emit, loop_state) = loop_fn( + time=time + 1, cell_output=output, cell_state=cell_state, + loop_state=loop_state) + # Emit zeros and copy forward state for minibatch entries that are finished. 
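+    # (tf.where selects row-wise: entries already finished keep their old
+    # state and emit zeros; the rest advance to the new values.)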
+ state = tf.where(finished, state, next_state) + emit = tf.where(finished, tf.zeros_like(emit), emit) + emit_ta = emit_ta.write(time, emit) + # If any new minibatch entries are marked as finished, mark these. + finished = tf.logical_or(finished, next_finished) + time += 1 + return (emit_ta, state, loop_state) + ``` + + with the additional properties that output and state may be (possibly nested) + tuples, as determined by `cell.output_size` and `cell.state_size`, and + as a result the final `state` and `emit_ta` may themselves be tuples. + + A simple implementation of `dynamic_rnn` via `raw_rnn` looks like this: + + ```python + inputs = tf.placeholder(shape=(max_time, batch_size, input_depth), + dtype=tf.float32) + sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32) + inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) + inputs_ta = inputs_ta.unstack(inputs) + + cell = tf.contrib.rnn.LSTMCell(num_units) + + def loop_fn(time, cell_output, cell_state, loop_state): + emit_output = cell_output # == None for time == 0 + if cell_output is None: # time == 0 + next_cell_state = cell.zero_state(batch_size, tf.float32) + else: + next_cell_state = cell_state + elements_finished = (time >= sequence_length) + finished = tf.reduce_all(elements_finished) + next_input = tf.cond( + finished, + lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32), + lambda: inputs_ta.read(time)) + next_loop_state = None + return (elements_finished, next_input, next_cell_state, + emit_output, next_loop_state) + + outputs_ta, final_state, _ = raw_rnn(cell, loop_fn) + outputs = outputs_ta.stack() + ``` + + Args: + cell: An instance of RNNCell. + loop_fn: A callable that takes inputs + `(time, cell_output, cell_state, loop_state)` + and returns the tuple + `(finished, next_input, next_cell_state, emit_output, next_loop_state)`. + Here `time` is an int32 scalar `Tensor`, `cell_output` is a + `Tensor` or (possibly nested) tuple of tensors as determined by + `cell.output_size`, and `cell_state` is a `Tensor` + or (possibly nested) tuple of tensors, as determined by the `loop_fn` + on its first call (and should match `cell.state_size`). + The outputs are: `finished`, a boolean `Tensor` of + shape `[batch_size]`, `next_input`: the next input to feed to `cell`, + `next_cell_state`: the next state to feed to `cell`, + and `emit_output`: the output to store for this iteration. + + Note that `emit_output` should be a `Tensor` or (possibly nested) + tuple of tensors with shapes and structure matching `cell.output_size` + and `cell_output` above. The parameter `cell_state` and output + `next_cell_state` may be either a single or (possibly nested) tuple + of tensors. The parameter `loop_state` and + output `next_loop_state` may be either a single or (possibly nested) tuple + of `Tensor` and `TensorArray` objects. This last parameter + may be ignored by `loop_fn` and the return value may be `None`. If it + is not `None`, then the `loop_state` will be propagated through the RNN + loop, for use purely by `loop_fn` to keep track of its own state. + The `next_loop_state` parameter returned may be `None`. + + The first call to `loop_fn` will be `time = 0`, `cell_output = None`, + `cell_state = None`, and `loop_state = None`. For this call: + The `next_cell_state` value should be the value with which to initialize + the cell's state. It may be a final state from a previous RNN or it + may be the output of `cell.zero_state()`. It should be a + (possibly nested) tuple structure of tensors. 
+ If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a `TensorShape`, this must be a `Tensor` of + appropriate type and shape `[batch_size] + cell.state_size`. + If `cell.state_size` is a (possibly nested) tuple of ints or + `TensorShape`, this will be a tuple having the corresponding shapes. + The `emit_output` value may be either `None` or a (possibly nested) + tuple structure of tensors, e.g., + `(tf.zeros(shape_0, dtype=dtype_0), tf.zeros(shape_1, dtype=dtype_1))`. + If this first `emit_output` return value is `None`, + then the `emit_ta` result of `raw_rnn` will have the same structure and + dtypes as `cell.output_size`. Otherwise `emit_ta` will have the same + structure, shapes (prepended with a `batch_size` dimension), and dtypes + as `emit_output`. The actual values returned for `emit_output` at this + initializing call are ignored. Note, this emit structure must be + consistent across all time steps. + + parallel_iterations: (Default: 32). The number of iterations to run in + parallel. Those operations which do not have any temporal dependency + and can be run in parallel, will be. This parameter trades off + time for space. Values >> 1 use more memory but take less time, + while smaller values use less memory but computations take longer. + swap_memory: Transparently swap the tensors produced in forward inference + but needed for back prop from GPU to CPU. This allows training RNNs + which would typically not fit on a single GPU, with very minimal (or no) + performance penalty. + scope: VariableScope for the created subgraph; defaults to "rnn". + + Returns: + A tuple `(emit_ta, final_state, final_loop_state)` where: + + `emit_ta`: The RNN output `TensorArray`. + If `loop_fn` returns a (possibly nested) set of Tensors for + `emit_output` during initialization, (inputs `time = 0`, + `cell_output = None`, and `loop_state = None`), then `emit_ta` will + have the same structure, dtypes, and shapes as `emit_output` instead. + If `loop_fn` returns `emit_output = None` during this call, + the structure of `cell.output_size` is used: + If `cell.output_size` is a (possibly nested) tuple of integers + or `TensorShape` objects, then `emit_ta` will be a tuple having the + same structure as `cell.output_size`, containing TensorArrays whose + elements' shapes correspond to the shape data in `cell.output_size`. + + `final_state`: The final cell state. If `cell.state_size` is an int, this + will be shaped `[batch_size, cell.state_size]`. If it is a + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + be a tuple having the corresponding shapes. + + `final_loop_state`: The final loop state as returned by `loop_fn`. + + Raises: + TypeError: If `cell` is not an instance of RNNCell, or `loop_fn` is not + a `callable`. + """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + if not callable(loop_fn): + raise TypeError("loop_fn must be a callable") + + parallel_iterations = parallel_iterations or 32 + + # Create a new scope in which the caching device is either + # determined by the parent scope, or is set to place the cached + # Variable using the same placement as for the rest of the RNN. 
+ with vs.variable_scope(scope or "rnn") as varscope: + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + time = constant_op.constant(0, dtype=dtypes.int32) + (elements_finished, next_input, initial_state, emit_structure, + init_loop_state) = loop_fn( + time, None, None, None) # time, cell_output, cell_state, loop_state + flat_input = nest.flatten(next_input) + + # Need a surrogate loop state for the while_loop if none is available. + loop_state = (init_loop_state if init_loop_state is not None + else constant_op.constant(0, dtype=dtypes.int32)) + + input_shape = [input_.get_shape() for input_ in flat_input] + static_batch_size = input_shape[0][0] + + for input_shape_i in input_shape: + # Static verification that batch sizes all match + static_batch_size.merge_with(input_shape_i[0]) + + batch_size = static_batch_size.value + if batch_size is None: + batch_size = array_ops.shape(flat_input[0])[0] + + nest.assert_same_structure(initial_state, cell.state_size) + state = initial_state + flat_state = nest.flatten(state) + flat_state = [ops.convert_to_tensor(s) for s in flat_state] + state = nest.pack_sequence_as(structure=state, + flat_sequence=flat_state) + + if emit_structure is not None: + flat_emit_structure = nest.flatten(emit_structure) + flat_emit_size = [emit.shape if emit.shape.is_fully_defined() else + array_ops.shape(emit) for emit in flat_emit_structure] + flat_emit_dtypes = [emit.dtype for emit in flat_emit_structure] + else: + emit_structure = cell.output_size + flat_emit_size = nest.flatten(emit_structure) + flat_emit_dtypes = [flat_state[0].dtype] * len(flat_emit_size) + + flat_emit_ta = [ + tensor_array_ops.TensorArray( + dtype=dtype_i, dynamic_size=True, size=0, name="rnn_output_%d" % i) + for i, dtype_i in enumerate(flat_emit_dtypes)] + emit_ta = nest.pack_sequence_as(structure=emit_structure, + flat_sequence=flat_emit_ta) + flat_zero_emit = [ + array_ops.zeros(_concat(batch_size, size_i), dtype_i) + for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)] + zero_emit = nest.pack_sequence_as(structure=emit_structure, + flat_sequence=flat_zero_emit) + + def condition(unused_time, elements_finished, *_): + return math_ops.logical_not(math_ops.reduce_all(elements_finished)) + + def body(time, elements_finished, current_input, + emit_ta, state, loop_state): + """Internal while loop body for raw_rnn. + + Args: + time: time scalar. + elements_finished: batch-size vector. + current_input: possibly nested tuple of input tensors. + emit_ta: possibly nested tuple of output TensorArrays. + state: possibly nested tuple of state tensors. + loop_state: possibly nested tuple of loop state tensors. + + Returns: + Tuple having the same size as Args but with updated values. + """ + (next_output, cell_state) = cell(current_input, state) + + nest.assert_same_structure(state, cell_state) + nest.assert_same_structure(cell.output_size, next_output) + + next_time = time + 1 + (next_finished, next_input, next_state, emit_output, + next_loop_state) = loop_fn( + next_time, next_output, cell_state, loop_state) + + nest.assert_same_structure(state, next_state) + nest.assert_same_structure(current_input, next_input) + nest.assert_same_structure(emit_ta, emit_output) + + # If loop_fn returns None for next_loop_state, just reuse the + # previous one. 
+ loop_state = loop_state if next_loop_state is None else next_loop_state + + def _copy_some_through(current, candidate): + """Copy some tensors through via array_ops.where.""" + def copy_fn(cur_i, cand_i): + with ops.colocate_with(cand_i): + return array_ops.where(elements_finished, cur_i, cand_i) + return nest.map_structure(copy_fn, current, candidate) + + emit_output = _copy_some_through(zero_emit, emit_output) + next_state = _copy_some_through(state, next_state) + + emit_ta = nest.map_structure( + lambda ta, emit: ta.write(time, emit), emit_ta, emit_output) + + elements_finished = math_ops.logical_or(elements_finished, next_finished) + + return (next_time, elements_finished, next_input, + emit_ta, next_state, loop_state) + + returned = control_flow_ops.while_loop( + condition, body, loop_vars=[ + time, elements_finished, next_input, + emit_ta, state, loop_state], + parallel_iterations=parallel_iterations, + swap_memory=swap_memory) + + (emit_ta, final_state, final_loop_state) = returned[-3:] + + if init_loop_state is None: + final_loop_state = None + + return (emit_ta, final_state, final_loop_state) + + +def static_rnn(cell, + inputs, + initial_state=None, + dtype=None, + sequence_length=None, + scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + + The simplest form of RNN network generated is: + + ```python + state = cell.zero_state(...) + outputs = [] + for input_ in inputs: + output, state = cell(input_, state) + outputs.append(output) + return (outputs, state) + ``` + However, a few other options are available: + + An initial state can be provided. + If the sequence_length vector is provided, dynamic calculation is performed. + This method of calculation does not compute the RNN steps past the maximum + sequence length of the minibatch (thus saving computational time), + and properly propagates the state at an example's sequence length + to the final state output. + + The dynamic calculation performed is, at time `t` for batch row `b`, + + ```python + (output, state)(b, t) = + (t >= sequence_length(b)) + ? (zeros(cell.output_size), states(b, sequence_length(b) - 1)) + : cell(input(b, t), state(b, t - 1)) + ``` + + Args: + cell: An instance of RNNCell. + inputs: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`, or a nested tuple of such elements. + initial_state: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + dtype: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. + sequence_length: Specifies the length of each sequence in inputs. + An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`. + scope: VariableScope for the created subgraph; defaults to "rnn". + + Returns: + A pair (outputs, state) where: + + - outputs is a length T list of outputs (one for each input), or a nested + tuple of such elements. + - state is the final state + + Raises: + TypeError: If `cell` is not an instance of RNNCell. + ValueError: If `inputs` is `None` or an empty list, or if the input depth + (column size) cannot be inferred from inputs via shape inference. 
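+
+  Example (a minimal sketch; `input_data` is assumed to be a batch-major
+  `[batch_size, max_time, depth]` tensor and `num_units` a Python int):
+
+  ```python
+  # split the batch-major tensor along time into a length-T list of
+  # [batch_size, depth] tensors
+  inputs = tf.unstack(input_data, axis=1)
+  cell = tf.nn.rnn_cell.GRUCell(num_units)
+  outputs, state = static_rnn(cell, inputs, dtype=tf.float32)
+  ```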
+ """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + if not nest.is_sequence(inputs): + raise TypeError("inputs must be a sequence") + if not inputs: + raise ValueError("inputs must not be empty") + + outputs = [] + # Create a new scope in which the caching device is either + # determined by the parent scope, or is set to place the cached + # Variable using the same placement as for the rest of the RNN. + with vs.variable_scope(scope or "rnn") as varscope: + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + # Obtain the first sequence of the input + first_input = inputs + while nest.is_sequence(first_input): + first_input = first_input[0] + + # Temporarily avoid EmbeddingWrapper and seq2seq badness + # TODO(lukaszkaiser): remove EmbeddingWrapper + if first_input.get_shape().ndims != 1: + + input_shape = first_input.get_shape().with_rank_at_least(2) + fixed_batch_size = input_shape[0] + + flat_inputs = nest.flatten(inputs) + for flat_input in flat_inputs: + input_shape = flat_input.get_shape().with_rank_at_least(2) + batch_size, input_size = input_shape[0], input_shape[1:] + fixed_batch_size.merge_with(batch_size) + for i, size in enumerate(input_size): + if size.value is None: + raise ValueError( + "Input size (dimension %d of inputs) must be accessible via " + "shape inference, but saw value None." % i) + else: + fixed_batch_size = first_input.get_shape().with_rank_at_least(1)[0] + + if fixed_batch_size.value: + batch_size = fixed_batch_size.value + else: + batch_size = array_ops.shape(first_input)[0] + if initial_state is not None: + state = initial_state + else: + if not dtype: + raise ValueError("If no initial_state is provided, " + "dtype must be specified") + state = cell.zero_state(batch_size, dtype) + + if sequence_length is not None: # Prepare variables + sequence_length = ops.convert_to_tensor( + sequence_length, name="sequence_length") + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + "sequence_length must be a vector of length batch_size") + + def _create_zero_output(output_size): + # convert int to TensorShape if necessary + size = _concat(batch_size, output_size) + output = array_ops.zeros( + array_ops.stack(size), _infer_state_dtype(dtype, state)) + shape = _concat(fixed_batch_size.value, output_size, static=True) + output.set_shape(tensor_shape.TensorShape(shape)) + return output + + output_size = cell.output_size + flat_output_size = nest.flatten(output_size) + flat_zero_output = tuple( + _create_zero_output(size) for size in flat_output_size) + zero_output = nest.pack_sequence_as( + structure=output_size, flat_sequence=flat_zero_output) + + sequence_length = math_ops.to_int32(sequence_length) + min_sequence_length = math_ops.reduce_min(sequence_length) + max_sequence_length = math_ops.reduce_max(sequence_length) + + for time, input_ in enumerate(inputs): + if time > 0: + varscope.reuse_variables() + # pylint: disable=cell-var-from-loop + call_cell = lambda: cell(input_, state) + # pylint: enable=cell-var-from-loop + if sequence_length is not None: + (output, state) = _rnn_step( + time=time, + sequence_length=sequence_length, + min_sequence_length=min_sequence_length, + max_sequence_length=max_sequence_length, + zero_output=zero_output, + state=state, + call_cell=call_cell, + state_size=cell.state_size) + else: + (output, state) = call_cell() + + outputs.append(output) + + return (outputs, state) + + +def static_state_saving_rnn(cell, + inputs, + state_saver, 
+ state_name, + sequence_length=None, + scope=None): + """RNN that accepts a state saver for time-truncated RNN calculation. + + Args: + cell: An instance of `RNNCell`. + inputs: A length T list of inputs, each a `Tensor` of shape + `[batch_size, input_size]`. + state_saver: A state saver object with methods `state` and `save_state`. + state_name: Python string or tuple of strings. The name to use with the + state_saver. If the cell returns tuples of states (i.e., + `cell.state_size` is a tuple) then `state_name` should be a tuple of + strings having the same length as `cell.state_size`. Otherwise it should + be a single string. + sequence_length: (optional) An int32/int64 vector size [batch_size]. + See the documentation for rnn() for more details about sequence_length. + scope: VariableScope for the created subgraph; defaults to "rnn". + + Returns: + A pair (outputs, state) where: + outputs is a length T list of outputs (one for each input) + states is the final state + + Raises: + TypeError: If `cell` is not an instance of RNNCell. + ValueError: If `inputs` is `None` or an empty list, or if the arity and + type of `state_name` does not match that of `cell.state_size`. + """ + state_size = cell.state_size + state_is_tuple = nest.is_sequence(state_size) + state_name_tuple = nest.is_sequence(state_name) + + if state_is_tuple != state_name_tuple: + raise ValueError("state_name should be the same type as cell.state_size. " + "state_name: %s, cell.state_size: %s" % (str(state_name), + str(state_size))) + + if state_is_tuple: + state_name_flat = nest.flatten(state_name) + state_size_flat = nest.flatten(state_size) + + if len(state_name_flat) != len(state_size_flat): + raise ValueError("#elems(state_name) != #elems(state_size): %d vs. %d" % + (len(state_name_flat), len(state_size_flat))) + + initial_state = nest.pack_sequence_as( + structure=state_size, + flat_sequence=[state_saver.state(s) for s in state_name_flat]) + else: + initial_state = state_saver.state(state_name) + + (outputs, state) = static_rnn( + cell, + inputs, + initial_state=initial_state, + sequence_length=sequence_length, + scope=scope) + + if state_is_tuple: + flat_state = nest.flatten(state) + state_name = nest.flatten(state_name) + save_state = [ + state_saver.save_state(name, substate) + for name, substate in zip(state_name, flat_state) + ] + else: + save_state = [state_saver.save_state(state_name, state)] + + with ops.control_dependencies(save_state): + last_output = outputs[-1] + flat_last_output = nest.flatten(last_output) + flat_last_output = [ + array_ops.identity(output) for output in flat_last_output + ] + outputs[-1] = nest.pack_sequence_as( + structure=last_output, flat_sequence=flat_last_output) + + return (outputs, state) + + +def static_bidirectional_rnn(cell_fw, + cell_bw, + inputs, + initial_state_fw=None, + initial_state_bw=None, + dtype=None, + sequence_length=None, + scope=None): + """Creates a bidirectional recurrent neural network. + + Similar to the unidirectional case above (rnn) but takes input and builds + independent forward and backward RNNs with the final forward and backward + outputs depth-concatenated, such that the output will have the format + [time][batch][cell_fw.output_size + cell_bw.output_size]. The input_size of + forward and backward cell must match. 
The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
+  length(s) of the sequence(s) or completely unrolled if length(s) is not given.
+
+  Args:
+    cell_fw: An instance of RNNCell, to be used for forward direction.
+    cell_bw: An instance of RNNCell, to be used for backward direction.
+    inputs: A length T list of inputs, each a tensor of shape
+      [batch_size, input_size], or a nested tuple of such elements.
+    initial_state_fw: (optional) An initial state for the forward RNN.
+      This must be a tensor of appropriate type and shape
+      `[batch_size, cell_fw.state_size]`.
+      If `cell_fw.state_size` is a tuple, this should be a tuple of
+      tensors having shapes `[batch_size, s] for s in cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using
+      the corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial state. Required if
+      either of the initial states are not provided.
+    sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
+      containing the actual lengths for each of the sequences.
+    scope: VariableScope for the created subgraph; defaults to
+      "bidirectional_rnn"
+
+  Returns:
+    A tuple (outputs, output_state_fw, output_state_bw) where:
+      outputs is a length `T` list of outputs (one for each input), which
+        are depth-concatenated forward and backward outputs.
+      output_state_fw is the final state of the forward rnn.
+      output_state_bw is the final state of the backward rnn.
+
+  Raises:
+    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
+    ValueError: If inputs is None or an empty list.
+  """
+
+  if not _like_rnncell(cell_fw):
+    raise TypeError("cell_fw must be an instance of RNNCell")
+  if not _like_rnncell(cell_bw):
+    raise TypeError("cell_bw must be an instance of RNNCell")
+  if not nest.is_sequence(inputs):
+    raise TypeError("inputs must be a sequence")
+  if not inputs:
+    raise ValueError("inputs must not be empty")
+
+  with vs.variable_scope(scope or "bidirectional_rnn"):
+    # Forward direction
+    with vs.variable_scope("fw") as fw_scope:
+      output_fw, output_state_fw = static_rnn(
+          cell_fw,
+          inputs,
+          initial_state_fw,
+          dtype,
+          sequence_length,
+          scope=fw_scope)
+
+    # Backward direction
+    with vs.variable_scope("bw") as bw_scope:
+      reversed_inputs = _reverse_seq(inputs, sequence_length)
+      tmp, output_state_bw = static_rnn(
+          cell_bw,
+          reversed_inputs,
+          initial_state_bw,
+          dtype,
+          sequence_length,
+          scope=bw_scope)
+
+  output_bw = _reverse_seq(tmp, sequence_length)
+  # Concat each of the forward/backward outputs
+  flat_output_fw = nest.flatten(output_fw)
+  flat_output_bw = nest.flatten(output_bw)
+
+  flat_outputs = tuple(
+      array_ops.concat([fw, bw], 1)
+      for fw, bw in zip(flat_output_fw, flat_output_bw))
+
+  outputs = nest.pack_sequence_as(
+      structure=output_fw, flat_sequence=flat_outputs)
+
+  return (outputs, output_state_fw, output_state_bw)
diff --git a/modelzoo/CAN/script/shuffle.py b/modelzoo/CAN/script/shuffle.py
new file mode 100644
index 00000000000..3bc0b86d750
--- /dev/null
+++ b/modelzoo/CAN/script/shuffle.py
@@ -0,0 +1,42 @@
+import os
+import sys
+import random
+
+import tempfile
+from subprocess import call
+
+
+def main(file, temporary=False):
+    tf_os, tpath = tempfile.mkstemp(dir=os.path.expanduser('~/DIN-V2-CODE'))
+    tf = open(tpath, 'w')
+
+    fd = open(file, "r")
+    for l in fd:
+        print(l.strip("\n"), file=tf)
+    tf.close()
+
+    lines = open(tpath, 'r').readlines()
+    random.shuffle(lines)
+    if temporary:
+        path, filename = os.path.split(os.path.realpath(file))
+        fd = tempfile.TemporaryFile(mode='w+', prefix=filename + '.shuf', dir=path)
+    else:
+        fd = open(file + '.shuf', 'w')
+
+    for l in lines:
+        s = l.strip("\n")
+        print(s, file=fd)
+
+    if temporary:
+        fd.seek(0)
+    else:
+        fd.close()
+
+    os.remove(tpath)
+
+    return fd
+
+
+if __name__ == '__main__':
+    main(sys.argv[1])
+
diff --git a/modelzoo/CAN/script/split_by_user.py b/modelzoo/CAN/script/split_by_user.py
new file mode 100644
index 00000000000..9f570d97819
--- /dev/null
+++ b/modelzoo/CAN/script/split_by_user.py
@@ -0,0 +1,20 @@
+import random
+
+fi = open("/home/test/modelzoo/DIEN/data/local_test", "r")
+ftrain = open("/home/test/modelzoo/DIEN/data/local_train_splitByUser", "w")
+ftest = open("/home/test/modelzoo/DIEN/data/local_test_splitByUser", "w")
+
+while True:
+    rand_int = random.randint(1, 10)
+    noclk_line = fi.readline().strip()
+    clk_line = fi.readline().strip()
+    if noclk_line == "" or clk_line == "":
+        break
+    if rand_int == 2:
+        print(noclk_line, file=ftest)
+        print(clk_line, file=ftest)
+    else:
+        print(noclk_line, file=ftrain)
+        print(clk_line, file=ftrain)
+
+
diff --git a/modelzoo/CAN/script/test.py b/modelzoo/CAN/script/test.py
new file mode 100644
index 00000000000..64b9a7f3337
--- /dev/null
+++ b/modelzoo/CAN/script/test.py
@@ -0,0 +1,10 @@
+import os
+import pandas as pd
+
+file = '/home/test/modelzoo/DIEN/data/local_train_splitByUser'
+# if os.path.exists(file+'_neg') is True:
+#     print('YES')
+# else:
+#     print('NOT')
+data = pd.read_csv(file)
+print(data.head())
\ No newline at end of file
diff --git a/modelzoo/CAN/script/train.py b/modelzoo/CAN/script/train.py
new file mode 100644
index 00000000000..bc1c8a8d97d
--- /dev/null
+++ b/modelzoo/CAN/script/train.py
@@ -0,0 +1,293 @@
+import numpy
+from data_iterator import DataIterator
+import tensorflow as tf
+from model import *
+import time
+import random
+import sys
+from utils import *
+from tqdm import tqdm
+
+EMBEDDING_DIM = 18
+HIDDEN_SIZE = 18 * 2
+ATTENTION_SIZE = 18 * 2
+best_auc = 0.0
+
+def prepare_data(input, target, maxlen = None, return_neg = False):
+    # x: a list of sentences
+    lengths_x = [len(s[4]) for s in input]
+    seqs_mid = [inp[3] for inp in input]
+    seqs_cat = [inp[4] for inp in input]
+    noclk_seqs_mid = [inp[5] for inp in input]
+    noclk_seqs_cat = [inp[6] for inp in input]
+    seqs_item_carte = [inp[7][0] for inp in input]
+    seqs_cate_carte = [inp[7][1] for inp in input]
+
+    if maxlen is not None:
+        new_seqs_mid = []
+        new_seqs_cat = []
+        new_noclk_seqs_mid = []
+        new_noclk_seqs_cat = []
+        new_lengths_x = []
+        new_seqs_item_carte = []
+        new_seqs_cate_carte = []
+        for l_x, inp in zip(lengths_x, input):
+            if l_x > maxlen:
+                new_seqs_mid.append(inp[3][l_x - maxlen:])
+                new_seqs_cat.append(inp[4][l_x - maxlen:])
+                new_noclk_seqs_mid.append(inp[5][l_x - maxlen:])
+                new_noclk_seqs_cat.append(inp[6][l_x - maxlen:])
+                new_seqs_item_carte.append(inp[7][0][l_x - maxlen:])
+                new_seqs_cate_carte.append(inp[7][1][l_x - maxlen:])
+                new_lengths_x.append(maxlen)
+            else:
+                new_seqs_mid.append(inp[3])
+                new_seqs_cat.append(inp[4])
+                new_noclk_seqs_mid.append(inp[5])
+                new_noclk_seqs_cat.append(inp[6])
+                new_seqs_item_carte.append(inp[7][0])
+                new_seqs_cate_carte.append(inp[7][1])
+                new_lengths_x.append(l_x)
+        lengths_x = new_lengths_x
+        seqs_mid = new_seqs_mid
+        seqs_cat = new_seqs_cat
+        noclk_seqs_mid = new_noclk_seqs_mid
+        noclk_seqs_cat = new_noclk_seqs_cat
+        seqs_item_carte = new_seqs_item_carte
+        seqs_cate_carte = new_seqs_cate_carte
+
+    if len(lengths_x) < 1:
+        return None, None, None, None
+
+    n_samples = len(seqs_mid)
+    maxlen_x = numpy.max(lengths_x)
+    neg_samples = len(noclk_seqs_mid[0][0])
+
+    mid_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')
+    cat_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')
+    noclk_mid_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')
+    noclk_cat_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')
+    item_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64')
+    cate_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64')
+    mid_mask = numpy.zeros((n_samples, maxlen_x)).astype('float32')
+    for idx, [s_x, s_y, no_sx, no_sy, i_c, c_c] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat, seqs_item_carte, seqs_cate_carte)):
+        mid_mask[idx, :lengths_x[idx]] = 1.
+        mid_his[idx, :lengths_x[idx]] = s_x
+        cat_his[idx, :lengths_x[idx]] = s_y
+        noclk_mid_his[idx, :lengths_x[idx], :] = no_sx
+        noclk_cat_his[idx, :lengths_x[idx], :] = no_sy
+        item_carte[idx, :lengths_x[idx]] = i_c
+        cate_carte[idx, :lengths_x[idx]] = c_c
+
+    uids = numpy.array([inp[0] for inp in input])
+    mids = numpy.array([inp[1] for inp in input])
+    cats = numpy.array([inp[2] for inp in input])
+
+    carte = numpy.stack([item_carte, cate_carte], axis=1)
+
+    if return_neg:
+        return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), noclk_mid_his, noclk_cat_his, carte
+
+    else:
+        return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), carte
+
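+# A minimal usage sketch for prepare_data above (illustrative only; `src` and
+# `tgt` are batches yielded by DataIterator, shapes assume return_neg=True):
+#
+#   uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, \
+#       noclk_mids, noclk_cats, carte = prepare_data(src, tgt, maxlen=100, return_neg=True)
+#
+#   mid_his, cat_his: [n_samples, maxlen_x] int64, zero-padded histories
+#   mid_mask:         [n_samples, maxlen_x] float32, 1. at valid positions
+#   carte:            [n_samples, 2, maxlen_x] stacked item/category cartesian ids
+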
+def eval(sess, test_data, model, model_path):
+
+    loss_sum = 0.
+    accuracy_sum = 0.
+    aux_loss_sum = 0.
+    nums = 0
+    stored_arr = []
+    for src, tgt in test_data:
+        nums += 1
+        uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte = prepare_data(src, tgt, return_neg=True)
+        prob, loss, acc, aux_loss = model.calculate(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte])
+        loss_sum += loss
+        aux_loss_sum += aux_loss
+        accuracy_sum += acc
+        prob_1 = prob[:, 0].tolist()
+        target_1 = target[:, 0].tolist()
+        for p, t in zip(prob_1, target_1):
+            stored_arr.append([p, t])
+    test_auc = calc_auc(stored_arr)
+    accuracy_sum = accuracy_sum / nums
+    loss_sum = loss_sum / nums
+    aux_loss_sum = aux_loss_sum / nums
+    global best_auc
+    if best_auc < test_auc:
+        best_auc = test_auc
+    #model.save(sess, model_path)
+    return test_auc, loss_sum, accuracy_sum, aux_loss_sum
+
+def train(
+        train_file = "/home/test/modelzoo/DIEN/data/local_train_splitByUser",
+        test_file = "/home/test/modelzoo/DIEN/data/local_test_splitByUser",
+        uid_voc = "/home/test/modelzoo/CAN/data/uid_voc.pkl",
+        mid_voc = "/home/test/modelzoo/CAN/data/mid_voc.pkl",
+        cat_voc = "/home/test/modelzoo/CAN/data/cat_voc.pkl",
+        batch_size = 128,
+        maxlen = 100,
+        test_iter = 8400,
+        save_iter = 8400,
+        model_type = 'DNN',
+        seed = 2,
+):
+    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
+    best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
+    gpu_options = tf.GPUOptions(allow_growth=True)
+    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
+        label_type = 1
+        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False, label_type=label_type)
+        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, label_type=label_type)
+        n_uid, n_mid, n_cat, n_carte = train_data.get_n()
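+        # model_type selects one of the branches below; the CAN variants
+        # ('CAN', 'CAN+Cartesion', 'CAN+DIEN') enable the co-action units via
+        # use_coaction=True, while the 'Cartesion' variants consume the
+        # precomputed cartesian-product features via use_cartes=True.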
+        if model_type == 'DNN':
+            model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'Cartesion':
+            model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False, use_cartes=True)
+        elif model_type == 'CAN+Cartesion':
+            model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True, use_cartes=True)
+        elif model_type == 'CAN':
+            model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True)
+        elif model_type == 'PNN':
+            model = Model_PNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'ONN':
+            model = Model_ONN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'Wide':
+            model = Model_WideDeep(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'NCF':
+            model = Model_NCF(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'FM':
+            model = Model_FM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'FFM':
+            model = Model_FFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'DeepFM':
+            model = Model_DeepFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'DeepFFM':
+            model = Model_DeepFFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'xDeepFM':
+            model = Model_xDeepFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False)
+        elif model_type == 'DIN':
+            model = Model_DIN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIEN':
+            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'CAN+DIEN':
+            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True)
+        else:
+            print("Invalid model_type : %s" % model_type)
+            return
+        print("Model: ", model_type)
+        sess.run(tf.global_variables_initializer())
+        sess.run(tf.local_variables_initializer())
+        sys.stdout.flush()
+
+        count()
+        start_time = time.time()
+        iter = 0
+        lr = 0.001
+
+        for itr in range(1):
+            loss_sum = 0.0
+            accuracy_sum = 0.
+            aux_loss_sum = 0.
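+            # One pass over the training data per epoch: the running sums are
+            # reset at each log point and the learning rate is halved after
+            # every epoch.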
+            print('train_data:', train_data)
+            for src, tgt in train_data:
+                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte = prepare_data(src, tgt, maxlen, return_neg=True)
+                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr, noclk_mids, noclk_cats, carte])
+                loss_sum += loss
+                accuracy_sum += acc
+                aux_loss_sum += aux_loss
+                iter += 1
+                sys.stdout.flush()
+                if (iter % 100) == 0:
+                    print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' % (iter, loss_sum / 100, accuracy_sum / 100, aux_loss_sum / 100))
+                    loss_sum = 0.0
+                    accuracy_sum = 0.0
+                    aux_loss_sum = 0.0
+                if (iter % test_iter) == 0:
+                    auc_, loss_, acc_, aux_ = eval(sess, test_data, model, best_model_path)
+                    print('iter: %d --- test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % (iter, auc_, loss_, acc_, aux_))
+                    loss_sum = 0.0
+                    accuracy_sum = 0.0
+                    aux_loss_sum = 0.0
+                if (iter % save_iter) == 0:
+                    print('save model iter: %d' % (iter))
+                    model.save(sess, model_path + "--" + str(iter))
+            print('time: %f' % (time.time() - start_time))
+            lr *= 0.5
+
+def count_flops(graph):
+    flops = tf.profiler.profile(graph, options=tf.profiler.ProfileOptionBuilder.float_operation())
+    print('FLOPs: {}'.format(flops.total_float_ops))
+
+def count():
+    total_parameters = 0
+    for variable in tf.trainable_variables():
+        # shape is an array of tf.Dimension
+        shape = variable.get_shape()
+        variable_parameters = 1
+        for dim in shape:
+            variable_parameters *= dim.value
+        total_parameters += variable_parameters
+    print("Parameter: ", total_parameters)
+
+def test(
+        train_file = "/home/test/modelzoo/DIEN/data/local_train_splitByUser",
+        test_file = "/home/test/modelzoo/DIEN/data/local_test_splitByUser",
+        uid_voc = "/home/test/modelzoo/CAN/data/uid_voc.pkl",
+        mid_voc = "/home/test/modelzoo/CAN/data/mid_voc.pkl",
+        cat_voc = "/home/test/modelzoo/CAN/data/cat_voc.pkl",
+        batch_size = 128,
+        maxlen = 100,
+        model_type = 'DNN',
+        seed = 2
+):
+
+    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
+    gpu_options = tf.GPUOptions(allow_growth=True)
+    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
+        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
+        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
+        n_uid, n_mid, n_cat = train_data.get_n()
+        if model_type == 'DNN':
+            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'PNN':
+            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'Wide':
+            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIN':
+            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIN-V2-gru-att-gru':
+            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIN-V2-gru-gru-att':
+            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIN-V2-gru-qa-attGru':
+            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIN-V2-gru-vec-attGru':
+            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+        elif model_type == 'DIEN':
+            model = 
Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) + else: + print ("Invalid model_type : %s", model_type) + return + model.restore(sess, model_path) + print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path)) + +if __name__ == '__main__': + if len(sys.argv) == 4: + SEED = int(sys.argv[3]) + else: + SEED = 3 + tf.set_random_seed(SEED) + numpy.random.seed(SEED) + random.seed(SEED) + + if sys.argv[1] == 'train': + train(model_type=sys.argv[2], seed=SEED) + elif sys.argv[1] == 'test': + test(model_type=sys.argv[2], seed=SEED) + else: + print('do nothing...') + + diff --git a/modelzoo/CAN/script/utils.py b/modelzoo/CAN/script/utils.py new file mode 100644 index 00000000000..4590754b054 --- /dev/null +++ b/modelzoo/CAN/script/utils.py @@ -0,0 +1,404 @@ +import tensorflow as tf + +from tensorflow.python.ops.rnn_cell import * +from tensorflow.contrib.rnn.python.ops.core_rnn_cell import _linear +#from tensorflow import keras +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import variable_scope as vs +#from keras import backend as K + +class QAAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + Args: + num_units: int, The number of units in the GRU cell. + activation: Nonlinearity to use. Default: `tanh`. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + kernel_initializer: (optional) The initializer to use for the weight and + projection matrices. + bias_initializer: (optional) The initializer to use for the bias. + """ + + def __init__(self, + num_units, + activation=None, + reuse=None, + kernel_initializer=None, + bias_initializer=None): + super(QAAttGRUCell, self).__init__(_reuse=reuse) + self._num_units = num_units + self._activation = activation or math_ops.tanh + self._kernel_initializer = kernel_initializer + self._bias_initializer = bias_initializer + self._gate_linear = None + self._candidate_linear = None + + @property + def state_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + def __call__(self, inputs, state, att_score): + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + if self._gate_linear is None: + bias_ones = self._bias_initializer + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) + with vs.variable_scope("gates"): # Reset gate and update gate. + self._gate_linear = _Linear( + [inputs, state], + 2 * self._num_units, + True, + bias_initializer=bias_ones, + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + [inputs, r_state], + self._num_units, + True, + bias_initializer=self._bias_initializer, + kernel_initializer=self._kernel_initializer) + c = self._activation(self._candidate_linear([inputs, r_state])) + new_h = (1. 
- att_score) * state + att_score * c + return new_h, new_h + +class VecAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + Args: + num_units: int, The number of units in the GRU cell. + activation: Nonlinearity to use. Default: `tanh`. + reuse: (optional) Python boolean describing whether to reuse variables + in an existing scope. If not `True`, and the existing scope already has + the given variables, an error is raised. + kernel_initializer: (optional) The initializer to use for the weight and + projection matrices. + bias_initializer: (optional) The initializer to use for the bias. + """ + + def __init__(self, + num_units, + activation=None, + reuse=None, + kernel_initializer=None, + bias_initializer=None): + super(VecAttGRUCell, self).__init__(_reuse=reuse) + self._num_units = num_units + self._activation = activation or math_ops.tanh + self._kernel_initializer = kernel_initializer + self._bias_initializer = bias_initializer + self._gate_linear = None + self._candidate_linear = None + + @property + def state_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + def __call__(self, inputs, state, att_score): + return self.call(inputs, state, att_score) + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + if self._gate_linear is None: + bias_ones = self._bias_initializer + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) + with vs.variable_scope("gates"): # Reset gate and update gate. + self._gate_linear = _Linear( + [inputs, state], + 2 * self._num_units, + True, + bias_initializer=bias_ones, + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + [inputs, r_state], + self._num_units, + True, + bias_initializer=self._bias_initializer, + kernel_initializer=self._kernel_initializer) + c = self._activation(self._candidate_linear([inputs, r_state])) + u = (1.0 - att_score) * u + new_h = u * state + (1 - u) * c + return new_h, new_h + +def prelu(_x, scope=''): + """parametric ReLU activation""" + with tf.variable_scope(name_or_scope=scope, default_name="prelu"): + _alpha = tf.get_variable("prelu_"+scope, shape=_x.get_shape()[-1], + dtype=_x.dtype, initializer=tf.constant_initializer(0.1)) + return tf.maximum(0.0, _x) + _alpha * tf.minimum(0.0, _x) + +def calc_auc(raw_arr): + """Summary + + Args: + raw_arr (TYPE): Description + + Returns: + TYPE: Description + """ + + arr = sorted(raw_arr, key=lambda d:d[0], reverse=True) + pos, neg = 0., 0. + for record in arr: + if record[1] == 1.: + pos += 1 + else: + neg += 1 + + fp, tp = 0., 0. + xy_arr = [] + for record in arr: + if record[1] == 1.: + tp += 1 + else: + fp += 1 + xy_arr.append([fp/neg, tp/pos]) + + auc = 0. + prev_x = 0. + prev_y = 0. + for x, y in xy_arr: + if x != prev_x: + auc += ((x - prev_x) * (y + prev_y) / 2.) + prev_x = x + prev_y = y + + return auc + +def attention(query, facts, attention_size, mask, stag='null', mode='LIST', softmax_stag=1, time_major=False, return_alphas=False): + if isinstance(facts, tuple): + # In case of Bi-RNN, concatenate the forward and the backward RNN outputs. 
+ facts = tf.concat(facts, 2) + + if time_major: + # (T,B,D) => (B,T,D) + facts = tf.array_ops.transpose(facts, [1, 0, 2]) + + mask = tf.equal(mask, tf.ones_like(mask)) + hidden_size = facts.get_shape().as_list()[-1] # D value - hidden size of the RNN layer + input_size = query.get_shape().as_list()[-1] + + # Trainable parameters + w1 = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1)) + w2 = tf.Variable(tf.random_normal([input_size, attention_size], stddev=0.1)) + b = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) + v = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) + + with tf.name_scope('v'): + # Applying fully connected layer with non-linear activation to each of the B*T timestamps; + # the shape of `tmp` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size + tmp1 = tf.tensordot(facts, w1, axes=1) + tmp2 = tf.tensordot(query, w2, axes=1) + tmp2 = tf.reshape(tmp2, [-1, 1, tf.shape(tmp2)[-1]]) + tmp = tf.tanh((tmp1 + tmp2) + b) + + # For each of the timestamps its vector of size A from `tmp` is reduced with `v` vector + v_dot_tmp = tf.tensordot(tmp, v, axes=1, name='v_dot_tmp') # (B,T) shape + key_masks = mask # [B, 1, T] + # key_masks = tf.expand_dims(mask, 1) # [B, 1, T] + paddings = tf.ones_like(v_dot_tmp) * (-2 ** 32 + 1) + v_dot_tmp = tf.where(key_masks, v_dot_tmp, paddings) # [B, 1, T] + alphas = tf.nn.softmax(v_dot_tmp, name='alphas') # (B,T) shape + + # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape + #output = tf.reduce_sum(facts * tf.expand_dims(alphas, -1), 1) + output = facts * tf.expand_dims(alphas, -1) + output = tf.reshape(output, tf.shape(facts)) + # output = output / (facts.get_shape().as_list()[-1] ** 0.5) + if not return_alphas: + return output + else: + return output, alphas + +def din_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False, return_alphas=False): + if isinstance(facts, tuple): + # In case of Bi-RNN, concatenate the forward and the backward RNN outputs. 
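+        # A Bi-RNN doubles the depth of `facts`, so the query is concatenated
+        # with itself below to keep the attention MLP input sizes aligned.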
+ facts = tf.concat(facts, 2) + print ("querry_size mismatch") + query = tf.concat(values = [ + query, + query, + ], axis=1) + + if time_major: + # (T,B,D) => (B,T,D) + facts = tf.array_ops.transpose(facts, [1, 0, 2]) + mask = tf.equal(mask, tf.ones_like(mask)) + facts_size = facts.get_shape().as_list()[-1] # D value - hidden size of the RNN layer + querry_size = query.get_shape().as_list()[-1] + queries = tf.tile(query, [1, tf.shape(facts)[1]]) + queries = tf.reshape(queries, tf.shape(facts)) + din_all = tf.concat([queries, facts, queries-facts, queries*facts], axis=-1) + d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + stag) + d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + stag) + d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + stag) + d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]]) + scores = d_layer_3_all + # Mask + # key_masks = tf.sequence_mask(facts_length, tf.shape(facts)[1]) # [B, T] + key_masks = tf.expand_dims(mask, 1) # [B, 1, T] + paddings = tf.ones_like(scores) * (-2 ** 32 + 1) + scores = tf.where(key_masks, scores, paddings) # [B, 1, T] + + # Scale + # scores = scores / (facts.get_shape().as_list()[-1] ** 0.5) + + # Activation + if softmax_stag: + scores = tf.nn.softmax(scores) # [B, 1, T] + + # Weighted sum + if mode == 'SUM': + output = tf.matmul(scores, facts) # [B, 1, H] + # output = tf.reshape(output, [-1, tf.shape(facts)[-1]]) + else: + scores = tf.reshape(scores, [-1, tf.shape(facts)[1]]) + output = facts * tf.expand_dims(scores, -1) + output = tf.reshape(output, tf.shape(facts)) + return output + +def din_fcn_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False, return_alphas=False, forCnn=False): + if isinstance(facts, tuple): + # In case of Bi-RNN, concatenate the forward and the backward RNN outputs. 
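+        # Unlike din_attention above, no manual query doubling is needed here:
+        # the query is projected to facts_size with a dense layer below.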
+def din_fcn_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False, return_alphas=False, forCnn=False):
+    if isinstance(facts, tuple):
+        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
+        facts = tf.concat(facts, 2)
+    if len(facts.get_shape().as_list()) == 2:
+        facts = tf.expand_dims(facts, 1)
+
+    if time_major:
+        # (T,B,D) => (B,T,D)
+        facts = tf.transpose(facts, [1, 0, 2])
+    # Trainable parameters
+    mask = tf.equal(mask, tf.ones_like(mask))
+    facts_size = facts.get_shape().as_list()[-1]  # D value - hidden size of the RNN layer
+    query_size = query.get_shape().as_list()[-1]
+    query = tf.layers.dense(query, facts_size, activation=None, name='f1' + stag)
+    query = prelu(query)
+    queries = tf.tile(query, [1, tf.shape(facts)[1]])
+    queries = tf.reshape(queries, tf.shape(facts))
+    din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
+    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + stag)
+    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + stag)
+    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + stag)
+    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
+    scores = d_layer_3_all
+    # Mask
+    # key_masks = tf.sequence_mask(facts_length, tf.shape(facts)[1])  # [B, T]
+    key_masks = tf.expand_dims(mask, 1)  # [B, 1, T]
+    paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
+    if not forCnn:
+        scores = tf.where(key_masks, scores, paddings)  # [B, 1, T]
+
+    # Scale
+    # scores = scores / (facts.get_shape().as_list()[-1] ** 0.5)
+
+    # Activation
+    if softmax_stag:
+        scores = tf.nn.softmax(scores)  # [B, 1, T]
+
+    # Weighted sum
+    if mode == 'SUM':
+        output = tf.matmul(scores, facts)  # [B, 1, H]
+        # output = tf.reshape(output, [-1, tf.shape(facts)[-1]])
+    else:
+        scores = tf.reshape(scores, [-1, tf.shape(facts)[1]])
+        output = facts * tf.expand_dims(scores, -1)
+        output = tf.reshape(output, tf.shape(facts))
+    if return_alphas:
+        return output, scores
+    return output
+
+def self_attention(facts, ATTENTION_SIZE, mask, stag='null'):
+    if len(facts.get_shape().as_list()) == 2:
+        facts = tf.expand_dims(facts, 1)
+
+    def cond(batch, output, i):
+        return tf.less(i, tf.shape(batch)[1])
+
+    def body(batch, output, i):
+        self_attention_tmp = din_fcn_attention(batch[:, i, :], batch[:, 0:i + 1, :],
+                                               ATTENTION_SIZE, mask[:, 0:i + 1], softmax_stag=1, stag=stag,
+                                               mode='LIST')
+        self_attention_tmp = tf.reduce_sum(self_attention_tmp, 1)
+        output = output.write(i, self_attention_tmp)
+        return batch, output, i + 1
+
+    output_ta = tf.TensorArray(dtype=tf.float32,
+                               size=0,
+                               dynamic_size=True,
+                               element_shape=(facts[:, 0, :].get_shape()))
+    _, output_op, _ = tf.while_loop(cond, body, [facts, output_ta, 0])
+    self_attention = output_op.stack()
+    self_attention = tf.transpose(self_attention, perm=[1, 0, 2])
+    return self_attention
+
+def self_all_attention(facts, ATTENTION_SIZE, mask, stag='null'):
+    if len(facts.get_shape().as_list()) == 2:
+        facts = tf.expand_dims(facts, 1)
+
+    def cond(batch, output, i):
+        return tf.less(i, tf.shape(batch)[1])
+
+    def body(batch, output, i):
+        self_attention_tmp = din_fcn_attention(batch[:, i, :], batch,
+                                               ATTENTION_SIZE, mask, softmax_stag=1, stag=stag,
+                                               mode='LIST')
+        self_attention_tmp = tf.reduce_sum(self_attention_tmp, 1)
+        output = output.write(i, self_attention_tmp)
+        return batch, output, i + 1
+
+    output_ta = tf.TensorArray(dtype=tf.float32,
+                               size=0,
+                               dynamic_size=True,
+                               element_shape=(facts[:, 0, :].get_shape()))
+    _, output_op, _ = tf.while_loop(cond, body, [facts, output_ta, 0])
+    self_attention = output_op.stack()
+    self_attention = tf.transpose(self_attention, perm=[1, 0, 2])
+    return self_attention
+
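+# Note on the two while_loop helpers above: self_attention lets position i
+# attend only to positions 0..i (prefix / causal attention), while
+# self_all_attention lets every position attend to the whole sequence. Both
+# stack the per-step results into a [B, T, D] tensor, e.g.:
+#     out = self_attention(facts, ATTENTION_SIZE, mask)  # facts: [B, T, D]
+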
+def din_fcn_shine(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False, return_alphas=False):
+    if isinstance(facts, tuple):
+        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
+        facts = tf.concat(facts, 2)
+
+    if time_major:
+        # (T,B,D) => (B,T,D)
+        facts = tf.transpose(facts, [1, 0, 2])
+    # Trainable parameters
+    mask = tf.equal(mask, tf.ones_like(mask))
+    facts_size = facts.get_shape().as_list()[-1]  # D value - hidden size of the RNN layer
+    query_size = query.get_shape().as_list()[-1]
+    query = tf.layers.dense(query, facts_size, activation=None, name='f1_trans_shine' + stag)
+    query = prelu(query)
+    queries = tf.tile(query, [1, tf.shape(facts)[1]])
+    queries = tf.reshape(queries, tf.shape(facts))
+    din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
+    d_layer_1_all = tf.layers.dense(din_all, facts_size, activation=tf.nn.sigmoid, name='f1_shine_att' + stag)
+    d_layer_2_all = tf.layers.dense(d_layer_1_all, facts_size, activation=tf.nn.sigmoid, name='f2_shine_att' + stag)
+    d_layer_2_all = tf.reshape(d_layer_2_all, tf.shape(facts))
+    output = d_layer_2_all
+    return output
+
From d6d5be591e3f7ad2da16c767d5c12d3268e4c8d8 Mon Sep 17 00:00:00 2001
From: lihangtian <936971274@qq.com>
Date: Wed, 3 Aug 2022 11:38:14 +0800
Subject: [PATCH 2/8] [ModelZoo] Support Co_Action Network

---
 modelzoo/CAN/README.md | 25 +++++++++
 modelzoo/CAN/data/README.md | 25 +++++++++
 modelzoo/CAN/{ => data}/prepare_data.sh | 2 +-
 modelzoo/CAN/{ => data}/script/Dice.py | 0
 modelzoo/CAN/{ => data}/script/calc_ckpt.py | 4 +-
 .../CAN/{ => data}/script/data_iterator.py | 12 ++---
 .../CAN/{ => data}/script/generate_voc.py | 7 ++-
 .../CAN/{ => data}/script/generate_voc.py.bk | 0
 .../CAN/{ => data}/script/local_aggretor.py | 7 ++-
 modelzoo/CAN/{ => data}/script/model.py | 12 ++---
 modelzoo/CAN/{ => data}/script/model_avazu.py | 0
 .../CAN/{ => data}/script/process_data.py | 0
 modelzoo/CAN/{ => data}/script/rnn.py | 0
 modelzoo/CAN/{ => data}/script/shuffle.py | 0
 .../CAN/{ => data}/script/split_by_user.py | 6 +--
 modelzoo/CAN/{ => data}/script/utils.py | 2 -
 modelzoo/CAN/script/test.py | 10 ----
 modelzoo/CAN/{script => }/train.py | 53 +++++++++----------
 18 files changed, 96 insertions(+), 69 deletions(-)
 create mode 100644 modelzoo/CAN/README.md
 create mode 100644 modelzoo/CAN/data/README.md
 rename modelzoo/CAN/{ => data}/prepare_data.sh (84%)
 rename modelzoo/CAN/{ => data}/script/Dice.py (100%)
 rename modelzoo/CAN/{ => data}/script/calc_ckpt.py (82%)
 rename modelzoo/CAN/{ => data}/script/data_iterator.py (92%)
 rename modelzoo/CAN/{ => data}/script/generate_voc.py (95%)
 rename modelzoo/CAN/{ => data}/script/generate_voc.py.bk (100%)
 rename modelzoo/CAN/{ => data}/script/local_aggretor.py (78%)
 rename modelzoo/CAN/{ => data}/script/model.py (99%)
 rename modelzoo/CAN/{ => data}/script/model_avazu.py (100%)
 rename modelzoo/CAN/{ => data}/script/process_data.py (100%)
 rename modelzoo/CAN/{ => data}/script/rnn.py (100%)
 rename modelzoo/CAN/{ => data}/script/shuffle.py (100%)
 rename modelzoo/CAN/{ => data}/script/split_by_user.py (64%)
 rename modelzoo/CAN/{ => data}/script/utils.py (99%)
 delete mode 100644 modelzoo/CAN/script/test.py
 rename modelzoo/CAN/{script => }/train.py (88%)
diff --git a/modelzoo/CAN/README.md b/modelzoo/CAN/README.md
new file mode 100644
index 00000000000..02d8b396649
--- /dev/null
+++ b/modelzoo/CAN/README.md
@@ -0,0 +1,25 @@
+# Co-Action Network
+
+Implementation of paper "CAN: Revisiting Feature Co-Action for Click Through Rate Prediction".
+
+paper: [arxiv (to be released)]()
+
+## Installation
+dependencies:
+
+tensorflow: 1.4.1
+
+python: 2.7
+
+Higher versions of TensorFlow and Python 3 will be supported soon.
+
+## Getting Started
+training:
+
+CUDA_VISIBLE_DEVICES=0 python train.py train {model}
+
+model: CAN,Cartesion,PNN, etc. (check the train.py)
+
+## Citation
+## Contact
+## License
diff --git a/modelzoo/CAN/data/README.md b/modelzoo/CAN/data/README.md
new file mode 100644
index 00000000000..02d8b396649
--- /dev/null
+++ b/modelzoo/CAN/data/README.md
@@ -0,0 +1,25 @@
+# Co-Action Network
+
+Implementation of paper "CAN: Revisiting Feature Co-Action for Click Through Rate Prediction".
+
+paper: [arxiv (to be released)]()
+
+## Installation
+dependencies:
+
+tensorflow: 1.4.1
+
+python: 2.7
+
+Higher versions of TensorFlow and Python 3 will be supported soon.
+
+## Getting Started
+training:
+
+CUDA_VISIBLE_DEVICES=0 python train.py train {model}
+
+model: CAN,Cartesion,PNN, etc. (check the train.py)
+
+## Citation
+## Contact
+## License
diff --git a/modelzoo/CAN/prepare_data.sh b/modelzoo/CAN/data/prepare_data.sh
similarity index 84%
rename from modelzoo/CAN/prepare_data.sh
rename to modelzoo/CAN/data/prepare_data.sh
index 110b9559129..54c9733dd15 100644
--- a/modelzoo/CAN/prepare_data.sh
+++ b/modelzoo/CAN/data/prepare_data.sh
@@ -3,7 +3,7 @@ wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Boo
 wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Books.json.gz
 gunzip reviews_Books.json.gz
 gunzip meta_Books.json.gz
-python script/process_data.py meta_Books.json reviews_Books_5.json
+python script/process_data.py meta_Books.json reviews_Books.json
 python script/local_aggretor.py
 python script/split_by_user.py
 python script/generate_voc.py
diff --git a/modelzoo/CAN/script/Dice.py b/modelzoo/CAN/data/script/Dice.py
similarity index 100%
rename from modelzoo/CAN/script/Dice.py
rename to modelzoo/CAN/data/script/Dice.py
diff --git a/modelzoo/CAN/script/calc_ckpt.py b/modelzoo/CAN/data/script/calc_ckpt.py
similarity index 82%
rename from modelzoo/CAN/script/calc_ckpt.py
rename to modelzoo/CAN/data/script/calc_ckpt.py
index fa5d4bda035..97d30de8a73 100644
--- a/modelzoo/CAN/script/calc_ckpt.py
+++ b/modelzoo/CAN/data/script/calc_ckpt.py
@@ -7,8 +7,6 @@
     shape = variable.get_shape()
     variable_parameters = 1
     for dim in shape:
-        # print(dim)
         variable_parameters *= dim.value
-        # print(variable_parameters)
     total_parameters += variable_parameters
-print(total_parameters)
+
diff --git a/modelzoo/CAN/script/data_iterator.py b/modelzoo/CAN/data/script/data_iterator.py
similarity index 92%
rename from modelzoo/CAN/script/data_iterator.py
rename to modelzoo/CAN/data/script/data_iterator.py
index b5eef5f9e57..b49e5f8c2c9 100644
--- a/modelzoo/CAN/script/data_iterator.py
+++ b/modelzoo/CAN/data/script/data_iterator.py
@@ -1,17 +1,15 @@
 import numpy
 import json
-#import cPickle as pkl
 import _pickle as cPickle
 import random
 import gzip
-import shuffle
+import data.script.shuffle
 
 def unicode_to_utf8(d):
     return dict((key.encode("UTF-8"), value) for (key,value) in d.items())
 def dict_unicode_to_utf8(d):
-    print('d={}'.format(d))
     return dict(((key[0].encode("UTF-8"), key[1].encode("UTF-8")), value) for (key,value) in d.items())
 
 def load_dict(filename):
@@ -53,11 +51,10 @@ def __init__(self, source,
         else:
             self.source = fopen(source, 'r')
         self.source_dicts = []
-        #for source_dict in [uid_voc, mid_voc, cat_voc, cat_voc, cat_voc]:#
'item_carte_voc.pkl', 'cate_carte_voc.pkl']: - for source_dict in [uid_voc, mid_voc, cat_voc, '/home/test/modelzoo/CAN/data/item_carte_voc.pkl', '/home/test/modelzoo/CAN/data/cate_carte_voc.pkl']: + for source_dict in [uid_voc, mid_voc, cat_voc, '../CAN/data/item_carte_voc.pkl', '../CAN/data/cate_carte_voc.pkl']: self.source_dicts.append(load_dict(source_dict)) - f_meta = open("/home/test/modelzoo/CAN/data/item-info", "r") + f_meta = open("../CAN/data/item-info", "r") meta_map = {} for line in f_meta: arr = line.strip().split("\t") @@ -76,7 +73,7 @@ def __init__(self, source, cat_idx = 0 self.meta_id_map[mid_idx] = cat_idx - f_review = open("/home/test/modelzoo/CAN/data/reviews-info", "r") + f_review = open("../CAN/data/reviews-info", "r") self.mid_list_for_random = [] for line in f_review: arr = line.strip().split("\t") @@ -94,7 +91,6 @@ def __init__(self, source, self.n_mid = len(self.source_dicts[1]) self.n_cat = len(self.source_dicts[2]) self.n_carte = [len(self.source_dicts[3]), len(self.source_dicts[4])] - print("n_uid=%d, n_mid=%d, n_cat=%d" % (self.n_uid, self.n_mid, self.n_cat)) self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length diff --git a/modelzoo/CAN/script/generate_voc.py b/modelzoo/CAN/data/script/generate_voc.py similarity index 95% rename from modelzoo/CAN/script/generate_voc.py rename to modelzoo/CAN/data/script/generate_voc.py index 03b6a662d97..b6816d36563 100644 --- a/modelzoo/CAN/script/generate_voc.py +++ b/modelzoo/CAN/data/script/generate_voc.py @@ -1,13 +1,13 @@ import pickle as pk -f_train = open("/home/test/modelzoo/DIEN/data/local_train_splitByUser", "r") +f_train = open("../../DIEN/data/local_train_splitByUser", "r") uid_dict = {} mid_dict = {} cat_dict = {} item_carte_dict = {} cate_carte_dict = {} -iddd = 0 + for line in f_train: arr = line.strip("\n").split("\t") clk = arr[0] @@ -34,8 +34,7 @@ if (mid, m) not in item_carte_dict: item_carte_dict[(mid, m)] = 0 item_carte_dict[(mid, m)] += 1 - #print iddd - iddd+=1 + for c in cat_list.split(""): if c not in cat_dict: cat_dict[c] = 0 diff --git a/modelzoo/CAN/script/generate_voc.py.bk b/modelzoo/CAN/data/script/generate_voc.py.bk similarity index 100% rename from modelzoo/CAN/script/generate_voc.py.bk rename to modelzoo/CAN/data/script/generate_voc.py.bk diff --git a/modelzoo/CAN/script/local_aggretor.py b/modelzoo/CAN/data/script/local_aggretor.py similarity index 78% rename from modelzoo/CAN/script/local_aggretor.py rename to modelzoo/CAN/data/script/local_aggretor.py index e7e23190a1d..e652ff3d543 100644 --- a/modelzoo/CAN/script/local_aggretor.py +++ b/modelzoo/CAN/data/script/local_aggretor.py @@ -2,9 +2,9 @@ import hashlib import random -fin = open("/home/test/modelzoo/DIEN/data/jointed-new-split-info", "r") -ftrain = open("/home/test/modelzoo/DIEN/data/local_train", "w") -ftest = open("/home/test/modelzoo/DIEN/data/local_test", "w") +fin = open("../../DIEN/data/jointed-new-split-info", "r") +ftrain = open("../../DIEN/data/local_train", "w") +ftest = open("../../DIEN/data/local_test", "w") last_user = "0" common_fea = "" @@ -25,7 +25,6 @@ if user != last_user: movie_id_list = [] cate1_list = [] - #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + "" + "\t" + "" else: history_clk_num = len(movie_id_list) cat_str = "" diff --git a/modelzoo/CAN/script/model.py b/modelzoo/CAN/data/script/model.py similarity index 99% rename from modelzoo/CAN/script/model.py rename to modelzoo/CAN/data/script/model.py index 133ded83f09..aba37138e0e 100644 --- 
a/modelzoo/CAN/script/model.py +++ b/modelzoo/CAN/data/script/model.py @@ -1,12 +1,10 @@ -#import tensorflow as tf -import tensorflow.compat.v1 as tf +import tensorflow as tf from tensorflow.python.ops.rnn_cell import GRUCell from tensorflow.python.ops.rnn_cell import LSTMCell from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn -#from tensorflow.python.ops.rnn import dynamic_rnn -from rnn import dynamic_rnn -from utils import * -from Dice import dice +from data.script.rnn import dynamic_rnn +from data.script.utils import * +from data.script.Dice import dice #### CAN config ##### weight_emb_w = [[16, 8], [8,4]] @@ -21,7 +19,7 @@ print("orders: ",orders) CALC_MODE = "can" -device = '/gpu:2' +device = '/gpu:0' #### CAN config ##### def gen_coaction(ad, his_items, dim, mode="can", mask=None,keep_fake_carte_seq=False): diff --git a/modelzoo/CAN/script/model_avazu.py b/modelzoo/CAN/data/script/model_avazu.py similarity index 100% rename from modelzoo/CAN/script/model_avazu.py rename to modelzoo/CAN/data/script/model_avazu.py diff --git a/modelzoo/CAN/script/process_data.py b/modelzoo/CAN/data/script/process_data.py similarity index 100% rename from modelzoo/CAN/script/process_data.py rename to modelzoo/CAN/data/script/process_data.py diff --git a/modelzoo/CAN/script/rnn.py b/modelzoo/CAN/data/script/rnn.py similarity index 100% rename from modelzoo/CAN/script/rnn.py rename to modelzoo/CAN/data/script/rnn.py diff --git a/modelzoo/CAN/script/shuffle.py b/modelzoo/CAN/data/script/shuffle.py similarity index 100% rename from modelzoo/CAN/script/shuffle.py rename to modelzoo/CAN/data/script/shuffle.py diff --git a/modelzoo/CAN/script/split_by_user.py b/modelzoo/CAN/data/script/split_by_user.py similarity index 64% rename from modelzoo/CAN/script/split_by_user.py rename to modelzoo/CAN/data/script/split_by_user.py index 9f570d97819..c2a7600fad5 100644 --- a/modelzoo/CAN/script/split_by_user.py +++ b/modelzoo/CAN/data/script/split_by_user.py @@ -1,8 +1,8 @@ import random -fi = open("/home/test/modelzoo/DIEN/data/local_test", "r") -ftrain = open("/home/test/modelzoo/DIEN/data/local_train_splitByUser", "w") -ftest = open("/home/test/modelzoo/DIEN/data/local_test_splitByUser", "w") +fi = open("../../DIEN/data/local_test", "r") +ftrain = open("../../DIEN/data/local_train_splitByUser", "w") +ftest = open("../../DIEN/data/local_test_splitByUser", "w") while True: rand_int = random.randint(1, 10) diff --git a/modelzoo/CAN/script/utils.py b/modelzoo/CAN/data/script/utils.py similarity index 99% rename from modelzoo/CAN/script/utils.py rename to modelzoo/CAN/data/script/utils.py index 4590754b054..641402b140c 100644 --- a/modelzoo/CAN/script/utils.py +++ b/modelzoo/CAN/data/script/utils.py @@ -2,12 +2,10 @@ from tensorflow.python.ops.rnn_cell import * from tensorflow.contrib.rnn.python.ops.core_rnn_cell import _linear -#from tensorflow import keras from tensorflow.python.ops import math_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import variable_scope as vs -#from keras import backend as K class QAAttGRUCell(RNNCell): """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 
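After this patch every data file is addressed with a relative path (`../CAN/data/...`, `../../DIEN/data/...`), so the scripts are meant to be launched from inside `modelzoo/CAN`. A minimal sketch of driving the iterator under that assumption (`batch_size=128` and `maxlen=100` mirror the defaults in train.py; the exact positional signature of `DataIterator` beyond the vocabulary paths is assumed here):

```python
from data.script.data_iterator import DataIterator

# Paths mirror the defaults in train.py
train_file = "../DIEN/data/local_train_splitByUser"
uid_voc = "../CAN/data/uid_voc.pkl"
mid_voc = "../CAN/data/mid_voc.pkl"
cat_voc = "../CAN/data/cat_voc.pkl"

train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, 128, 100)
for src, tgt in train_data:
    break  # each iteration yields one (source, target) mini-batch
```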
diff --git a/modelzoo/CAN/script/test.py b/modelzoo/CAN/script/test.py deleted file mode 100644 index 64b9a7f3337..00000000000 --- a/modelzoo/CAN/script/test.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -import pandas as pd - -file = '/home/test/modelzoo/DIEN/data/local_train_splitByUser' -# if os.path.exists(file+'_neg') is True: -# print('YES') -# else: -# print('NOT') -data = pd.read_csv(file) -print(data.head()) \ No newline at end of file diff --git a/modelzoo/CAN/script/train.py b/modelzoo/CAN/train.py similarity index 88% rename from modelzoo/CAN/script/train.py rename to modelzoo/CAN/train.py index bc1c8a8d97d..7ef1a6cda18 100644 --- a/modelzoo/CAN/script/train.py +++ b/modelzoo/CAN/train.py @@ -1,11 +1,11 @@ import numpy -from data_iterator import DataIterator +from data.script.data_iterator import DataIterator import tensorflow as tf -from model import * +from data.script.model import * import time import random import sys -from utils import * +from data.script.utils import * from tqdm import tqdm EMBEDDING_DIM = 18 @@ -120,11 +120,11 @@ def eval(sess, test_data, model, model_path): return test_auc, loss_sum, accuracy_sum, aux_loss_sum def train( - train_file = "/home/test/modelzoo/DIEN/data/local_train_splitByUser", - test_file = "/home/test/modelzoo/DIEN/data/local_test_splitByUser", - uid_voc = "/home/test/modelzoo/CAN/data/uid_voc.pkl", - mid_voc = "/home/test/modelzoo/CAN/data/mid_voc.pkl", - cat_voc = "/home/test/modelzoo/CAN/data/cat_voc.pkl", + train_file = "../DIEN/data/local_train_splitByUser", + test_file = "../DIEN/data/local_test_splitByUser", + uid_voc = "../CAN/data/uid_voc.pkl", + mid_voc = "../CAN/data/mid_voc.pkl", + cat_voc = "../CAN/data/cat_voc.pkl", batch_size = 128, maxlen = 100, test_iter = 8400, @@ -183,7 +183,7 @@ def train( sys.stdout.flush() count() - start_time = time.time() + iter = 0 lr = 0.001 @@ -191,7 +191,6 @@ def train( loss_sum = 0.0 accuracy_sum = 0. aux_loss_sum = 0. 
- print('train_data:',train_data) for src, tgt in train_data: uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte = prepare_data(src, tgt, maxlen, return_neg=True) loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr, noclk_mids, noclk_cats, carte]) @@ -200,21 +199,21 @@ def train( aux_loss_sum += aux_loss iter += 1 sys.stdout.flush() - #if (iter % 100) == 0: - print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' % (iter, loss_sum / 100, accuracy_sum / 100, aux_loss_sum / 100)) - loss_sum = 0.0 - accuracy_sum = 0.0 - aux_loss_sum = 0.0 - #if (iter % test_iter) == 0: - auc_, loss_, acc_, aux_ = eval(sess, test_data, model, best_model_path) - print('iter: %d --- test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % (iter, auc_, loss_, acc_, aux_)) - loss_sum = 0.0 - accuracy_sum = 0.0 - aux_loss_sum = 0.0 + if (iter % 100) == 0: + print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' % (iter, loss_sum / 100, accuracy_sum / 100, aux_loss_sum / 100)) + loss_sum = 0.0 + accuracy_sum = 0.0 + aux_loss_sum = 0.0 + if (iter % test_iter) == 0: + auc_, loss_, acc_, aux_ = eval(sess, test_data, model, best_model_path) + print('iter: %d --- test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % (iter, auc_, loss_, acc_, aux_)) + loss_sum = 0.0 + accuracy_sum = 0.0 + aux_loss_sum = 0.0 if (iter % save_iter) == 0: print('save model iter: %d' %(iter)) model.save(sess, model_path+"--"+str(iter)) - print('time:%f',(time.time()-start_time)) + lr *= 0.5 def count_flops(graph): @@ -233,11 +232,11 @@ def count(): print("Prameter: ", total_parameters) def test( - train_file = "/home/test/modelzoo/DIEN/data/local_train_splitByUser", - test_file = "/home/test/modelzoo/DIEN/data/local_test_splitByUser", - uid_voc = "/home/test/modelzoo/CAN/data/uid_voc.pkl", - mid_voc = "/home/test/modelzoo/CAN/data/mid_voc.pkl", - cat_voc = "/home/test/modelzoo/CAN/data/cat_voc.pkl", + train_file = "../DIEN/data/local_train_splitByUser", + test_file = "../DIEN/data/local_test_splitByUser", + uid_voc = "../CAN/data/uid_voc.pkl", + mid_voc = "../CAN/data/mid_voc.pkl", + cat_voc = "../CAN/data/cat_voc.pkl", batch_size = 128, maxlen = 100, model_type = 'DNN', From b438c64ffcb2ca56d2736dc4dec5bb49b465b837 Mon Sep 17 00:00:00 2001 From: lihangtian <936971274@qq.com> Date: Fri, 19 Aug 2022 21:27:43 +0800 Subject: [PATCH 3/8] [ModelZoo] Support Co_Action Net --- modelzoo/CAN/README.md | 83 ++++++++++++++++--- modelzoo/CAN/data/README.md | 27 ++---- modelzoo/CAN/data/script/calc_ckpt.py | 12 --- modelzoo/CAN/data/script/data_iterator.py | 6 +- modelzoo/CAN/{data => }/script/Dice.py | 0 modelzoo/CAN/{data => }/script/model.py | 6 +- modelzoo/CAN/{data => }/script/model_avazu.py | 0 modelzoo/CAN/{data => }/script/rnn.py | 0 modelzoo/CAN/{data => }/script/utils.py | 0 modelzoo/CAN/train.py | 4 +- 10 files changed, 88 insertions(+), 50 deletions(-) delete mode 100644 modelzoo/CAN/data/script/calc_ckpt.py rename modelzoo/CAN/{data => }/script/Dice.py (100%) rename modelzoo/CAN/{data => }/script/model.py (99%) rename modelzoo/CAN/{data => }/script/model_avazu.py (100%) rename modelzoo/CAN/{data => }/script/rnn.py (100%) rename modelzoo/CAN/{data => }/script/utils.py (100%) diff --git a/modelzoo/CAN/README.md b/modelzoo/CAN/README.md index 02d8b396649..c26f3f8eace 100644 --- a/modelzoo/CAN/README.md +++ 
b/modelzoo/CAN/README.md
@@ -1,25 +1,86 @@
 # Co-Action Network
+The following is a brief directory structure and description for this example:
+
+```
+├── data                         # Dataset directory
+│   ├── prepare_data.sh          # Shell script to download and process the dataset
+│   ├── README.md                # Documentation describing how to prepare the dataset
+│   └── script                   # Scripts to process the dataset
+│       ├── data_iterator.py
+│       ├── generate_voc.py
+│       ├── local_aggretor.py
+│       ├── shuffle.py
+│       └── split_by_user.py
+├── script                       # Scripts of the CAN model
+│   ├── Dice.py
+│   ├── model.py
+│   ├── model_avazu.py
+│   ├── rnn.py
+│   └── utils.py
+├── README.md                    # Documentation
+└── train.py                     # Training script
+```
+
+## Content
+
+[TOC]
+
+## Model Structure
+
 Implementation of paper "CAN: Revisiting Feature Co-Action for Click Through Rate Prediction".
 
 paper: [arxiv (to be released)]()
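+
+A minimal sketch of the co-action unit at the heart of the model (the layer
+shapes follow `weight_emb_w = [[16, 8], [8, 4]]` in `script/model.py`; the
+function name and the `tanh` activation are illustrative, see `gen_coaction`
+in `script/model.py` for the actual implementation): the ad-side embedding is
+reshaped into the weights of a tiny MLP, which is then applied to each
+behavior embedding.
+
+```python
+import numpy as np
+
+def co_action(ad_emb, his_emb, layers=((16, 8), (8, 4))):
+    # ad_emb parameterizes a small MLP; his_emb holds behavior embeddings [T, 16]
+    h, offset = his_emb, 0
+    for d_in, d_out in layers:
+        w = ad_emb[offset:offset + d_in * d_out].reshape(d_in, d_out)
+        offset += d_in * d_out
+        h = np.tanh(h @ w)                 # [T, d_out]
+    return h.sum(axis=0)                   # pool over the behavior sequence
+
+ad = np.random.randn(16 * 8 + 8 * 4)       # ad-side embedding (160 dims)
+his = np.random.randn(100, 16)             # 100 behaviors, 16 dims each
+print(co_action(ad, his).shape)            # (4,)
+```
+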
-## Installation
-dependencies:
-
-tensorflow: 1.4.1
-
-python: 2.7
+
+## Usage
+
+### Stand-alone Training
+
+1. Please prepare the dataset and the DeepRec environment.
 
-Higher versions of TensorFlow and Python 3 will be supported soon.
+   1. Manually
 
-## Getting Started
-training:
+      - Follow [dataset preparation](https://github.com/alibaba/DeepRec/tree/main/modelzoo/DIEN#prepare) to prepare the dataset.
+      - Download the code with `git clone https://github.com/alibaba/DeepRec`.
+      - Follow [How to Build](https://github.com/alibaba/DeepRec#how-to-build) to build the DeepRec whl package and install it with `pip install $DEEPREC_WHL`.
 
-CUDA_VISIBLE_DEVICES=0 python train.py train {model}
+   2. Docker (recommended)
+
+      ```
+      docker pull alideeprec/deeprec-release-modelzoo:latest
+      docker run -it alideeprec/deeprec-release-modelzoo:latest /bin/bash
+
+      # In docker container
+      cd /root/modelzoo/CAN
+      ```
+
+2. Train:
+
+```
+CUDA_VISIBLE_DEVICES=0 python train.py train {model}
 
 model: CAN,Cartesion,PNN, etc. (check the train.py)
+```
+
+## Dataset
+
+The Amazon, Taobao and Avazu datasets are used as benchmark datasets.
+
+### Prepare
+
+For details on downloading the data, see `./data`.
+
-## Citation
-## Contact
-## License
diff --git a/modelzoo/CAN/data/README.md b/modelzoo/CAN/data/README.md
index 02d8b396649..1f4b135adae 100644
--- a/modelzoo/CAN/data/README.md
+++ b/modelzoo/CAN/data/README.md
@@ -1,25 +1,14 @@
-# Co-Action Network
+# Dataset
 
-Implementation of paper "CAN: Revisiting Feature Co-Action for Click Through Rate Prediction".
+## Prepare dataset
 
-paper: [arxiv (to be released)]()
+Prepare the DIEN data first.
 
-## Installation
-dependencies:
+Run `prepare_data.sh` to download and process the data:
 
-tensorflow: 1.4.1
+```
+sh prepare_data.sh
+```
 
-python: 2.7
+Then put the generated data into this folder.
 
-Higher versions of TensorFlow and Python 3 will be supported soon.
-
-## Getting Started
-training:
-
-CUDA_VISIBLE_DEVICES=0 python train.py train {model}
-
-model: CAN,Cartesion,PNN, etc. (check the train.py)
-
-## Citation
-## Contact
-## License
diff --git a/modelzoo/CAN/data/script/calc_ckpt.py b/modelzoo/CAN/data/script/calc_ckpt.py
deleted file mode 100644
index 97d30de8a73..00000000000
--- a/modelzoo/CAN/data/script/calc_ckpt.py
+++ /dev/null
@@ -1,12 +0,0 @@
-
-ckpt = tf.train.get_checkpoint_state("./ckpt_path/").model_checkpoint_path
-saver = tf.train.import_meta_graph(ckpt+'.meta')
-variables = tf.trainable_variables()
-total_parameters = 0
-for variable in variables:
-    shape = variable.get_shape()
-    variable_parameters = 1
-    for dim in shape:
-        variable_parameters *= dim.value
-    total_parameters += variable_parameters
-
diff --git a/modelzoo/CAN/data/script/data_iterator.py b/modelzoo/CAN/data/script/data_iterator.py
index b49e5f8c2c9..75c53c46919 100644
--- a/modelzoo/CAN/data/script/data_iterator.py
+++ b/modelzoo/CAN/data/script/data_iterator.py
@@ -51,10 +51,10 @@ def __init__(self, source,
         else:
             self.source = fopen(source, 'r')
         self.source_dicts = []
-        for source_dict in [uid_voc, mid_voc, cat_voc, '../CAN/data/item_carte_voc.pkl', '../CAN/data/cate_carte_voc.pkl']:
+        for source_dict in [uid_voc, mid_voc, cat_voc, './data/item_carte_voc.pkl', './data/cate_carte_voc.pkl']:
             self.source_dicts.append(load_dict(source_dict))
 
-        f_meta = open("../CAN/data/item-info", "r")
+        f_meta = open("./data/item-info", "r")
         meta_map = {}
         for line in f_meta:
             arr = line.strip().split("\t")
@@ -73,7 +73,7 @@ def __init__(self, source,
                 cat_idx = 0
             self.meta_id_map[mid_idx] = cat_idx
 
-        f_review = open("../CAN/data/reviews-info", "r")
+        f_review = open("./data/reviews-info", "r")
         self.mid_list_for_random = []
         for line in f_review:
             arr = line.strip().split("\t")
diff --git a/modelzoo/CAN/data/script/Dice.py b/modelzoo/CAN/script/Dice.py
similarity index 100%
rename from modelzoo/CAN/data/script/Dice.py
rename to modelzoo/CAN/script/Dice.py
diff --git a/modelzoo/CAN/data/script/model.py b/modelzoo/CAN/script/model.py
similarity index 99%
rename from modelzoo/CAN/data/script/model.py
rename to modelzoo/CAN/script/model.py
index aba37138e0e..e968c382f55 100644
--- a/modelzoo/CAN/data/script/model.py
+++ b/modelzoo/CAN/script/model.py
@@ -2,9 +2,9 @@
 from tensorflow.python.ops.rnn_cell import GRUCell
 from tensorflow.python.ops.rnn_cell import LSTMCell
 from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
-from data.script.rnn import dynamic_rnn
-from data.script.utils import *
-from data.script.Dice import dice
+from script.rnn import dynamic_rnn
+from script.utils import *
+from script.Dice import dice
 
 #### CAN config #####
 weight_emb_w = [[16, 8], [8,4]]
diff --git a/modelzoo/CAN/data/script/model_avazu.py b/modelzoo/CAN/script/model_avazu.py
similarity index 100%
rename from modelzoo/CAN/data/script/model_avazu.py
rename to modelzoo/CAN/script/model_avazu.py
diff --git a/modelzoo/CAN/data/script/rnn.py b/modelzoo/CAN/script/rnn.py
similarity index 100%
rename from modelzoo/CAN/data/script/rnn.py
rename to modelzoo/CAN/script/rnn.py
diff --git a/modelzoo/CAN/data/script/utils.py b/modelzoo/CAN/script/utils.py
similarity index 100%
rename from modelzoo/CAN/data/script/utils.py
rename to modelzoo/CAN/script/utils.py
diff --git a/modelzoo/CAN/train.py b/modelzoo/CAN/train.py
index 7ef1a6cda18..dd54677aecc 100644
--- a/modelzoo/CAN/train.py
+++ b/modelzoo/CAN/train.py
@@ -1,11 +1,11 @@
 import numpy
 from data.script.data_iterator import DataIterator
 import tensorflow as tf
-from data.script.model import *
+from script.model import *
 import time
 import random
 import sys
-from data.script.utils import *
+from script.utils import *
 from tqdm import tqdm
 
 EMBEDDING_DIM = 18
From c5df688db708d3a890309b6d5655fbb90aa1fd90 Mon Sep 17 00:00:00 2001
From: lihangtian <936971274@qq.com>
Date: Fri, 19 Aug 2022 21:35:51 +0800
Subject: [PATCH 4/8] [ModelZoo] Support FNN

---
 modelzoo/FNN/README.md | 87 +
 modelzoo/FNN/data/README.md | 4 +
 modelzoo/FNN/result/README.md | 2 +
 modelzoo/FNN/script/__init__.py | 0
 modelzoo/FNN/script/contrib/__init__.py | 0
 modelzoo/FNN/script/contrib/rnn.py | 1153 +++++++++++++
 modelzoo/FNN/script/contrib/rnn_v2.py | 1452 ++++++++++++++++
 modelzoo/FNN/script/contrib/utils.py | 378 +++++
 modelzoo/FNN/script/estimator/__init__.py | 1 +
 .../FNN/script/estimator/feature_column.py | 52 +
 modelzoo/FNN/script/estimator/inputs.py | 52 +
 modelzoo/FNN/script/estimator/utils.py | 217 +++
 modelzoo/FNN/script/feature_column.py | 220 +++
 modelzoo/FNN/script/inputs.py | 155 ++
 modelzoo/FNN/script/layers/__init__.py | 52 +
 modelzoo/FNN/script/layers/activation.py | 85 +
 modelzoo/FNN/script/layers/core.py | 267 +++
 modelzoo/FNN/script/layers/interaction.py | 1492 +++++++++++++++++
 modelzoo/FNN/script/layers/normalization.py | 51 +
 modelzoo/FNN/script/layers/sequence.py | 901 ++++++++++
 modelzoo/FNN/script/layers/utils.py | 302 ++++
 modelzoo/FNN/script/models/__init__.py | 4 +
 modelzoo/FNN/script/models/fnn.py | 53 +
 modelzoo/FNN/script/utils.py | 46 +
 modelzoo/FNN/train.py | 139 ++
 25 files changed, 7165 insertions(+)
 create mode 100644 modelzoo/FNN/README.md
 create mode 100644 modelzoo/FNN/data/README.md
 create mode 100644 modelzoo/FNN/result/README.md
 create mode 100644 modelzoo/FNN/script/__init__.py
 create mode 100644 modelzoo/FNN/script/contrib/__init__.py
 create mode 100644 modelzoo/FNN/script/contrib/rnn.py
 create mode 100644 modelzoo/FNN/script/contrib/rnn_v2.py
 create mode 100644 modelzoo/FNN/script/contrib/utils.py
 create mode 100644 modelzoo/FNN/script/estimator/__init__.py
 create mode 100644 modelzoo/FNN/script/estimator/feature_column.py
 create mode 100644 modelzoo/FNN/script/estimator/inputs.py
 create mode 100644 modelzoo/FNN/script/estimator/utils.py
 create mode 100644 modelzoo/FNN/script/feature_column.py
 create mode 100644 modelzoo/FNN/script/inputs.py
 create mode 100644 modelzoo/FNN/script/layers/__init__.py
 create mode 100644 modelzoo/FNN/script/layers/activation.py
 create mode 100644 modelzoo/FNN/script/layers/core.py
 create mode 100644 modelzoo/FNN/script/layers/interaction.py
 create mode 100644 modelzoo/FNN/script/layers/normalization.py
 create mode 100644 modelzoo/FNN/script/layers/sequence.py
 create mode 100644 modelzoo/FNN/script/layers/utils.py
 create mode 100644 modelzoo/FNN/script/models/__init__.py
 create mode 100644 modelzoo/FNN/script/models/fnn.py
 create mode 100644 modelzoo/FNN/script/utils.py
 create mode 100644 modelzoo/FNN/train.py
diff --git a/modelzoo/FNN/README.md b/modelzoo/FNN/README.md
new file mode 100644
index 00000000000..a2f9e721921
--- /dev/null
+++ b/modelzoo/FNN/README.md
@@ -0,0 +1,87 @@
+# FNN
+
+The following is a brief directory structure and description for this example:
+
+```
+├── data                        # Dataset directory
+│   └── README.md               # Documentation describing how to prepare the dataset
+├── result                      # Evaluation metrics are saved here by default
+├── script                      # Model code directory
+│   ├── contrib                 # RNN implementations
+│   ├── estimator               # Estimator-style feature columns and input helpers
+│   ├── layers                  # Layers of the model
+│   ├── models                  # The FNN model itself
+│   ├── feature_column.py       # Feature column definitions
+│   ├── inputs.py               # Construction of the input layer
+│   └── utils.py
+├── train.py                    # Training script
+└── README.md                   # Documentation
+```
+
+## Content
+
+[TOC]
+
+## Model Structure
+
+Implementation of paper "Deep Learning over Multi-field Categorical Data – A Case Study on User Response Prediction".
+
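+In the paper the field embeddings are pre-trained with a factorization
+machine and the stacked network is then fine-tuned end to end. A minimal
+sketch of the forward pass (sizes and names here are illustrative; the
+actual model lives in `script/models/fnn.py`):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_fields, vocab, emb_dim = 3, 1000, 4
+emb = rng.normal(size=(vocab, emb_dim))      # embedding table (FM-pretrained in the paper)
+W1 = rng.normal(size=(n_fields * emb_dim, 32))
+b1 = np.zeros(32)
+W2 = rng.normal(size=(32, 1))
+b2 = np.zeros(1)
+
+def fnn_forward(feat_ids):
+    # feat_ids: one active feature index per field, shape [n_fields]
+    x = emb[feat_ids].reshape(-1)            # concatenate field embeddings
+    h = np.tanh(x @ W1 + b1)                 # hidden layer
+    logit = h @ W2 + b2
+    return 1.0 / (1.0 + np.exp(-logit))      # predicted CTR
+
+print(fnn_forward(np.array([7, 42, 305])))
+```
+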
+## Usage
+
+### Stand-alone Training
+
+1. Please prepare the dataset and the DeepRec environment.
+
+   1. Manually
+
+      - Follow [dataset preparation](https://github.com/alibaba/DeepRec/tree/main/modelzoo/DIEN#prepare) to prepare the dataset.
+      - Download the code with `git clone https://github.com/alibaba/DeepRec`.
+      - Follow [How to Build](https://github.com/alibaba/DeepRec#how-to-build) to build the DeepRec whl package and install it with `pip install $DEEPREC_WHL`.
+
+   2. Docker (recommended)
+
+      ```
+      docker pull alideeprec/deeprec-release-modelzoo:latest
+      docker run -it alideeprec/deeprec-release-modelzoo:latest /bin/bash
+
+      # In docker container
+      cd /root/modelzoo/FNN
+      ```
+
+2. Train:
+
+```
+python train.py
+```
+
+## Dataset
+
+The iPinYou dataset is used as the benchmark dataset.
+
+### Prepare
+
+For details on downloading the data, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data).
+
+### Campaigns
+
+We use campaign 1458 as an example here.
+
+```
+make-ipinyou-data/1458$ ls
+featindex.txt test.log.txt test.txt train.log.txt train.txt
+```
+
+- `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user clicked the ad or not.
+- `featindex.txt` maps the features to their indexes. For example, `8:1.1.174.* 76` means that the 8th column in `train.log.txt` with the string `1.1.174.*` maps to feature index `76`.
+- `train.txt` and `test.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. Each line holds the click label `y` followed by the active feature indexes `x`. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073).
diff --git a/modelzoo/FNN/data/README.md b/modelzoo/FNN/data/README.md
new file mode 100644
index 00000000000..15a0bc61c8d
--- /dev/null
+++ b/modelzoo/FNN/data/README.md
@@ -0,0 +1,4 @@
+make-ipinyou-data
+=================
+
+For details on downloading the data, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data).
diff --git a/modelzoo/FNN/result/README.md b/modelzoo/FNN/result/README.md
new file mode 100644
index 00000000000..6f962fb1716
--- /dev/null
+++ b/modelzoo/FNN/result/README.md
@@ -0,0 +1,2 @@
+# Result
+Evaluation metrics files are saved in this folder by default.
diff --git a/modelzoo/FNN/script/__init__.py b/modelzoo/FNN/script/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/modelzoo/FNN/script/contrib/__init__.py b/modelzoo/FNN/script/contrib/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/modelzoo/FNN/script/contrib/rnn.py b/modelzoo/FNN/script/contrib/rnn.py
new file mode 100644
index 00000000000..b3554993063
--- /dev/null
+++ b/modelzoo/FNN/script/contrib/rnn.py
@@ -0,0 +1,1153 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+#
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+# you may not use this file except in compliance with the License.
+
+# You may obtain a copy of the License at
+
+#
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+#
+
+# Unless required by applicable law or agreed to in writing, software
+
+# distributed under the License is distributed on an "AS IS" BASIS,
+
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. +@@bidirectional_dynamic_rnn +@@dynamic_rnn +@@raw_rnn +@@static_rnn +@@static_state_saving_rnn +@@static_bidirectional_rnn +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest +import tensorflow as tf + + +def _like_rnncell_(cell): + """Checks that a given object is an RNNCell by using duck typing.""" + + conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"), + + hasattr(cell, "zero_state"), callable(cell)] + + return all(conditions) + + +# pylint: disable=protected-access + +_concat = rnn_cell_impl._concat +try: + _like_rnncell = rnn_cell_impl._like_rnncell +except Exception as e: + _like_rnncell = _like_rnncell_ + + +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + Retains as much of the static shape information as possible. + Args: + x: A tensor of rank 2 or higher. + Returns: + x transposed along the first two dimensions. + Raises: + ValueError: if `x` is rank 1 or lower. + """ + + x_static_shape = x.get_shape() + + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + + (x, x_static_shape)) + + x_rank = array_ops.rank(x) + + x_t = array_ops.transpose( + + x, array_ops.concat( + + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + + x_t.set_shape( + + tensor_shape.TensorShape([ + + x_static_shape[1].value, x_static_shape[0].value + + ]).concatenate(x_static_shape[2:])) + + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + Args: + flat_input: An iterable of time major input Tensors of shape [max_time, + batch_size, ...]. All inputs should have compatible batch sizes. + Returns: + The batch size in Python integer if available, or a scalar Tensor otherwise. + Raises: + ValueError: if there is any input with an invalid shape. + """ + + for input_ in flat_input: + + shape = input_.shape + + if shape.ndims is None: + continue + + if shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2" % input_) + + batch_size = shape[1].value + + if batch_size is not None: + return batch_size + + # Fallback to the dynamic batch size of the first input. + + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + Args: + explicit_dtype: explicitly declared dtype or None. + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + Tensors. + Returns: + dtype: inferred dtype of hidden state. + Raises: + ValueError: if `state` has heterogeneous dtypes or is empty. 
+ """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + Returns an (output, state) pair conditioned on the sequence_lengths. + When skip_conditionals=False, the pseudocode is something like: + if t >= max_sequence_length: + return (zero_output, state) + if t < min_sequence_length: + return call_cell() + # Selectively output zeros or output, old state or new state depending + # on if we've finished calculating each row. + new_output, new_state = call_cell() + final_output = np.vstack([ + zero_output if time >= sequence_lengths[r] else new_output_r + for r, new_output_r in enumerate(new_output) + ]) + final_state = np.vstack([ + state[r] if time >= sequence_lengths[r] else new_state_r + for r, new_state_r in enumerate(new_state) + ]) + return (final_output, final_state) + Args: + time: Python int, the current time step + sequence_length: int32 `Tensor` vector of size [batch_size] + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + zero_output: `Tensor` vector of shape [output_size] + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + or a list/tuple of such tensors. + call_cell: lambda returning tuple of (new_output, new_state) where + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + state_size: The `cell.state_size` associated with the state. + skip_conditionals: Python bool, whether to skip using the conditional + calculations. This is useful for `dynamic_rnn`, where the input tensor + matches `max_sequence_length`, and using conditionals just slows + everything down. + Returns: + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + final_output is a `Tensor` matrix of shape [batch_size, output_size] + final_state is either a single `Tensor` matrix, or a tuple of such + matrices (matching length and shapes of input `state`). + Raises: + ValueError: If the cell returns a state tuple whose length does not match + that returned by `state_size`. + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. + + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. 
+ + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + Args: + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + or nested tuples of tensors. + lengths: A `Tensor` of dimension batch_size, containing lengths for each + sequence in the batch. If "None" is specified, simply reverses + the list. 
+ Returns: + time-reversed sequence + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. +# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. 
+# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. +# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
+# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + Performs fully dynamic unrolling of `inputs`. + Example: + ```python + # create a BasicRNNCell + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + # defining initial state + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + # 'state' is a tensor of shape [batch_size, cell_state_size] + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + initial_state=initial_state, + dtype=tf.float32) + ``` + ```python + # create 2 LSTMCells + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + # create a RNN cell composed sequentially of a number of RNNCells + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + # 'state' is a N-tuple where N is the number of LSTMCells containing a + # tf.contrib.rnn.LSTMStateTuple for each cell + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + inputs=data, + dtype=tf.float32) + ``` + Args: + cell: An instance of RNNCell. + inputs: The RNN inputs. + If `time_major == False` (default), this must be a `Tensor` of shape: + `[batch_size, max_time, ...]`, or a nested tuple of such + elements. + If `time_major == True`, this must be a `Tensor` of shape: + `[max_time, batch_size, ...]`, or a nested tuple of such + elements. + This may also be a (possibly nested) tuple of Tensors satisfying + this property. 
The first two dimensions must match across all the inputs, + but otherwise the ranks and other shape components may differ. + In this case, input to `cell` at each time-step will replicate the + structure of these tuples, except for the time dimension (from which the + time is taken). + The input to `cell` at each time step will be a `Tensor` or (possibly + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + Used to copy-through state and zero-out outputs when past a batch + element's sequence length. So it's more for correctness than performance. + initial_state: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + dtype: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. + parallel_iterations: (Default: 32). The number of iterations to run in + parallel. Those operations which do not have any temporal dependency + and can be run in parallel, will be. This parameter trades off + time for space. Values >> 1 use more memory but take less time, + while smaller values use less memory but computations take longer. + swap_memory: Transparently swap the tensors produced in forward inference + but needed for back prop from GPU to CPU. This allows training RNNs + which would typically not fit on a single GPU, with very minimal (or no) + performance penalty. + time_major: The shape format of the `inputs` and `outputs` Tensors. + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + Using `time_major = True` is a bit more efficient because it avoids + transposes at the beginning and end of the RNN calculation. However, + most TensorFlow data is batch-major, so by default this function + accepts input and emits output in batch-major form. + scope: VariableScope for the created subgraph; defaults to "rnn". + Returns: + A pair (outputs, state) where: + outputs: The RNN output `Tensor`. + If time_major == False (default), this will be a `Tensor` shaped: + `[batch_size, max_time, cell.output_size]`. + If time_major == True, this will be a `Tensor` shaped: + `[max_time, batch_size, cell.output_size]`. + Note, if `cell.output_size` is a (possibly nested) tuple of integers + or `TensorShape` objects, then `outputs` will be a tuple having the + same structure as `cell.output_size`, containing Tensors having shapes + corresponding to the shape data in `cell.output_size`. + state: The final state. If `cell.state_size` is an int, this + will be shaped `[batch_size, cell.state_size]`. If it is a + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + be a tuple having the corresponding shapes. If cells are `LSTMCells` + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + Raises: + TypeError: If `cell` is not an instance of RNNCell. + ValueError: If inputs is None or an empty list. 
+ """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + + # By default, time_major==False and inputs are batch-major: shaped + + # [batch, time, depth] + + # For internal calculations, we transpose to [time, batch, depth] + + flat_input = nest.flatten(inputs) + + if not time_major: + # (B,T,D) => (T,B,D) + + flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] + + flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input) + + parallel_iterations = parallel_iterations or 32 + + if sequence_length is not None: + + sequence_length = math_ops.to_int32(sequence_length) + + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + + "sequence_length must be a vector of length batch_size, " + + "but saw shape: %s" % sequence_length.get_shape()) + + sequence_length = array_ops.identity( # Just to find it in the graph. + + sequence_length, name="sequence_length") + + # Create a new scope in which the caching device is either + + # determined by the parent scope, or is set to place the cached + + # Variable using the same placement as for the rest of the RNN. + + with vs.variable_scope(scope or "rnn",reuse=tf.AUTO_REUSE) as varscope:#TODO:user defined reuse + + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + batch_size = _best_effort_input_batch_size(flat_input) + + if initial_state is not None: + + state = initial_state + + else: + + if not dtype: + raise ValueError("If there is no initial_state, you must give a dtype.") + + state = cell.zero_state(batch_size, dtype) + + def _assert_has_shape(x, shape): + + x_shape = array_ops.shape(x) + + packed_shape = array_ops.stack(shape) + + return control_flow_ops.Assert( + + math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), + + ["Expected shape for Tensor %s is " % x.name, + + packed_shape, " but saw shape: ", x_shape]) + + if sequence_length is not None: + # Perform some shape validation + + with ops.control_dependencies( + + [_assert_has_shape(sequence_length, [batch_size])]): + sequence_length = array_ops.identity( + + sequence_length, name="CheckSeqLen") + + inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + + (outputs, final_state) = _dynamic_rnn_loop( + + cell, + + inputs, + + state, + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory, + + att_scores=att_scores, + + sequence_length=sequence_length, + + dtype=dtype) + + # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. + + # If we are performing batch-major calculations, transpose output back + + # to shape [batch, time, depth] + + if not time_major: + # (T,B,D) => (B,T,D) + + outputs = nest.map_structure(_transpose_batch_time, outputs) + + return (outputs, final_state) + + +def _dynamic_rnn_loop(cell, + + inputs, + + initial_state, + + parallel_iterations, + + swap_memory, + + att_scores=None, + + sequence_length=None, + + dtype=None): + """Internal implementation of Dynamic RNN. + Args: + cell: An instance of RNNCell. + inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested + tuple of such elements. + initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if + `cell.state_size` is a tuple, then this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + parallel_iterations: Positive Python int. + swap_memory: A Python boolean + sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. 
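+      att_scores: (optional) A `Tensor` of shape `[batch_size, time, 1]` with
+        per-step attention scores; sliced along the time axis and passed to
+        `cell` as a third positional argument (non-standard extension).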
+ dtype: (optional) Expected dtype of output. If not specified, inferred from + initial_state. + Returns: + Tuple `(final_outputs, final_state)`. + final_outputs: + A `Tensor` of shape `[time, batch_size, cell.output_size]`. If + `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape` + objects, then this returns a (possibly nsted) tuple of Tensors matching + the corresponding shapes. + final_state: + A `Tensor`, or possibly nested tuple of Tensors, matching in length + and shapes to `initial_state`. + Raises: + ValueError: If the input depth cannot be inferred via shape inference + from the inputs. + """ + + state = initial_state + + assert isinstance(parallel_iterations, int), "parallel_iterations must be int" + + state_size = cell.state_size + + flat_input = nest.flatten(inputs) + + flat_output_size = nest.flatten(cell.output_size) + + # Construct an initial output + + input_shape = array_ops.shape(flat_input[0]) + + time_steps = input_shape[0] + + batch_size = _best_effort_input_batch_size(flat_input) + + inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3) + + for input_ in flat_input) + + const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2] + + for shape in inputs_got_shape: + + if not shape[2:].is_fully_defined(): + raise ValueError( + + "Input size (depth of inputs) must be accessible via shape inference," + + " but saw value None.") + + got_time_steps = shape[0].value + + got_batch_size = shape[1].value + + if const_time_steps != got_time_steps: + raise ValueError( + + "Time steps is not the same for all the elements in the input in a " + + "batch.") + + if const_batch_size != got_batch_size: + raise ValueError( + + "Batch_size is not the same for all the elements in the input.") + + # Prepare dynamic conditional copying of state & output + + def _create_zero_arrays(size): + + size = _concat(batch_size, size) + + return array_ops.zeros( + + array_ops.stack(size), _infer_state_dtype(dtype, state)) + + flat_zero_output = tuple(_create_zero_arrays(output) + + for output in flat_output_size) + + zero_output = nest.pack_sequence_as(structure=cell.output_size, + + flat_sequence=flat_zero_output) + + if sequence_length is not None: + min_sequence_length = math_ops.reduce_min(sequence_length) + + max_sequence_length = math_ops.reduce_max(sequence_length) + + time = array_ops.constant(0, dtype=dtypes.int32, name="time") + + with ops.name_scope("dynamic_rnn") as scope: + + base_name = scope + + def _create_ta(name, dtype): + + return tensor_array_ops.TensorArray(dtype=dtype, + + size=time_steps, + + tensor_array_name=base_name + name) + + output_ta = tuple(_create_ta("output_%d" % i, + + _infer_state_dtype(dtype, state)) + + for i in range(len(flat_output_size))) + + input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype) + + for i in range(len(flat_input))) + + input_ta = tuple(ta.unstack(input_) + + for ta, input_ in zip(input_ta, flat_input)) + + def _time_step(time, output_ta_t, state, att_scores=None): + + """Take a time step of the dynamic RNN. + Args: + time: int32 scalar Tensor. + output_ta_t: List of `TensorArray`s that represent the output. + state: nested tuple of vector tensors that represent the state. + Returns: + The tuple (time + 1, output_ta_t with updated flow, new_state). 
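+        When `att_scores` is not None, the cell is called as
+        `cell(input_t, state, att_score)` and the returned tuple carries
+        `att_scores` through unchanged as a fourth element.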
+ """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) \ No newline at end of file diff --git a/modelzoo/FNN/script/contrib/rnn_v2.py b/modelzoo/FNN/script/contrib/rnn_v2.py new file mode 100644 index 00000000000..a2bd625cd8b --- /dev/null +++ b/modelzoo/FNN/script/contrib/rnn_v2.py @@ -0,0 +1,1452 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +# + +# Licensed under the Apache License, Version 2.0 (the "License"); + +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at + +# + +# http://www.apache.org/licenses/LICENSE-2.0 + +# + +# Unless required by applicable law or agreed to in writing, software + +# distributed under the License is distributed on an "AS IS" BASIS, + +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. 
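+
+This is a vendored copy of the stock TensorFlow RNN helpers, extended with
+an optional `att_scores` argument on `dynamic_rnn` so that attention-aware
+cells (e.g. `QAAttGRUCell` / `VecAttGRUCell` in `contrib/utils.py`) receive
+the current step's attention score.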
+ + + + + +@@bidirectional_dynamic_rnn + +@@dynamic_rnn + +@@raw_rnn + +@@static_rnn + +@@static_state_saving_rnn + +@@static_bidirectional_rnn + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest +import tensorflow as tf + + +def _like_rnncell_(cell): + """Checks that a given object is an RNNCell by using duck typing.""" + + conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"), + + hasattr(cell, "zero_state"), callable(cell)] + + return all(conditions) + + +# pylint: disable=protected-access + +_concat = rnn_cell_impl._concat +try: + _like_rnncell = rnn_cell_impl._like_rnncell +except: + _like_rnncell = _like_rnncell_ + + +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + + + + Retains as much of the static shape information as possible. + + + + Args: + + x: A tensor of rank 2 or higher. + + + + Returns: + + x transposed along the first two dimensions. + + + + Raises: + + ValueError: if `x` is rank 1 or lower. + + """ + + x_static_shape = x.get_shape() + + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + + (x, x_static_shape)) + + x_rank = array_ops.rank(x) + + x_t = array_ops.transpose( + + x, array_ops.concat( + + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + + x_t.set_shape( + + tensor_shape.TensorShape([ + + x_static_shape[1], x_static_shape[0] + + ]).concatenate(x_static_shape[2:])) + + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + + + + Args: + + flat_input: An iterable of time major input Tensors of shape [max_time, + + batch_size, ...]. All inputs should have compatible batch sizes. + + + + Returns: + + The batch size in Python integer if available, or a scalar Tensor otherwise. + + + + Raises: + + ValueError: if there is any input with an invalid shape. + + """ + + for input_ in flat_input: + + shape = input_.shape + + if shape.ndims is None: + continue + + if shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2" % input_) + + batch_size = shape[1] + + if batch_size is not None: + return batch_size + + # Fallback to the dynamic batch size of the first input. + + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + + + + Args: + + explicit_dtype: explicitly declared dtype or None. + + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + + Tensors. + + + + Returns: + + dtype: inferred dtype of hidden state. + + + + Raises: + + ValueError: if `state` has heterogeneous dtypes or is empty. 
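+
+  For example, `_infer_state_dtype(None, (c, h))` returns `c.dtype` provided
+  every tensor in the state shares that dtype.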
+ + """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + + + + Returns an (output, state) pair conditioned on the sequence_lengths. + + When skip_conditionals=False, the pseudocode is something like: + + + + if t >= max_sequence_length: + + return (zero_output, state) + + if t < min_sequence_length: + + return call_cell() + + + + # Selectively output zeros or output, old state or new state depending + + # on if we've finished calculating each row. + + new_output, new_state = call_cell() + + final_output = np.vstack([ + + zero_output if time >= sequence_lengths[r] else new_output_r + + for r, new_output_r in enumerate(new_output) + + ]) + + final_state = np.vstack([ + + state[r] if time >= sequence_lengths[r] else new_state_r + + for r, new_state_r in enumerate(new_state) + + ]) + + return (final_output, final_state) + + + + Args: + + time: Python int, the current time step + + sequence_length: int32 `Tensor` vector of size [batch_size] + + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + + zero_output: `Tensor` vector of shape [output_size] + + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + + or a list/tuple of such tensors. + + call_cell: lambda returning tuple of (new_output, new_state) where + + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + + state_size: The `cell.state_size` associated with the state. + + skip_conditionals: Python bool, whether to skip using the conditional + + calculations. This is useful for `dynamic_rnn`, where the input tensor + + matches `max_sequence_length`, and using conditionals just slows + + everything down. + + + + Returns: + + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + + final_output is a `Tensor` matrix of shape [batch_size, output_size] + + final_state is either a single `Tensor` matrix, or a tuple of such + + matrices (matching length and shapes of input `state`). + + + + Raises: + + ValueError: If the cell returns a state tuple whose length does not match + + that returned by `state_size`. + + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. 
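+
+    # For higher-rank entries, `array_ops.where` below selects per batch
+    # row: once `time >= sequence_length[r]`, row r keeps its previous
+    # value rather than the newly computed one.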
+ + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. + + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + + + + Args: + + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + + or nested tuples of tensors. + + lengths: A `Tensor` of dimension batch_size, containing lengths for each + + sequence in the batch. If "None" is specified, simply reverses + + the list. 
+ + + + Returns: + + time-reversed sequence + + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. +# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. 
+# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. +# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
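+#
+# NOTE: this bidirectional wrapper is intentionally left commented out;
+# only the att_scores-aware `dynamic_rnn` below is active in this module.
+#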
+# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + + + + Performs fully dynamic unrolling of `inputs`. + + + + Example: + + + + ```python + + # create a BasicRNNCell + + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + + + + # defining initial state + + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + + + + # 'state' is a tensor of shape [batch_size, cell_state_size] + + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + + initial_state=initial_state, + + dtype=tf.float32) + + ``` + + + + ```python + + # create 2 LSTMCells + + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + + + + # create a RNN cell composed sequentially of a number of RNNCells + + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + + # 'state' is a N-tuple where N is the number of LSTMCells containing a + + # tf.contrib.rnn.LSTMStateTuple for each cell + + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + + inputs=data, + + dtype=tf.float32) + + ``` + + + + + + Args: + + cell: An instance of RNNCell. + + inputs: The RNN inputs. + + If `time_major == False` (default), this must be a `Tensor` of shape: + + `[batch_size, max_time, ...]`, or a nested tuple of such + + elements. + + If `time_major == True`, this must be a `Tensor` of shape: + + `[max_time, batch_size, ...]`, or a nested tuple of such + + elements. 
+ + This may also be a (possibly nested) tuple of Tensors satisfying + + this property. The first two dimensions must match across all the inputs, + + but otherwise the ranks and other shape components may differ. + + In this case, input to `cell` at each time-step will replicate the + + structure of these tuples, except for the time dimension (from which the + + time is taken). + + The input to `cell` at each time step will be a `Tensor` or (possibly + + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + + Used to copy-through state and zero-out outputs when past a batch + + element's sequence length. So it's more for correctness than performance. + + initial_state: (optional) An initial state for the RNN. + + If `cell.state_size` is an integer, this must be + + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + + If `cell.state_size` is a tuple, this should be a tuple of + + tensors having shapes `[batch_size, s] for s in cell.state_size`. + + dtype: (optional) The data type for the initial state and expected output. + + Required if initial_state is not provided or RNN state has a heterogeneous + + dtype. + + parallel_iterations: (Default: 32). The number of iterations to run in + + parallel. Those operations which do not have any temporal dependency + + and can be run in parallel, will be. This parameter trades off + + time for space. Values >> 1 use more memory but take less time, + + while smaller values use less memory but computations take longer. + + swap_memory: Transparently swap the tensors produced in forward inference + + but needed for back prop from GPU to CPU. This allows training RNNs + + which would typically not fit on a single GPU, with very minimal (or no) + + performance penalty. + + time_major: The shape format of the `inputs` and `outputs` Tensors. + + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + + Using `time_major = True` is a bit more efficient because it avoids + + transposes at the beginning and end of the RNN calculation. However, + + most TensorFlow data is batch-major, so by default this function + + accepts input and emits output in batch-major form. + + scope: VariableScope for the created subgraph; defaults to "rnn". + + + + Returns: + + A pair (outputs, state) where: + + + + outputs: The RNN output `Tensor`. + + + + If time_major == False (default), this will be a `Tensor` shaped: + + `[batch_size, max_time, cell.output_size]`. + + + + If time_major == True, this will be a `Tensor` shaped: + + `[max_time, batch_size, cell.output_size]`. + + + + Note, if `cell.output_size` is a (possibly nested) tuple of integers + + or `TensorShape` objects, then `outputs` will be a tuple having the + + same structure as `cell.output_size`, containing Tensors having shapes + + corresponding to the shape data in `cell.output_size`. + + + + state: The final state. If `cell.state_size` is an int, this + + will be shaped `[batch_size, cell.state_size]`. If it is a + + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + + be a tuple having the corresponding shapes. If cells are `LSTMCells` + + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + + + + Raises: + + TypeError: If `cell` is not an instance of RNNCell. 
+ + ValueError: If inputs is None or an empty list. + + """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + + # By default, time_major==False and inputs are batch-major: shaped + + # [batch, time, depth] + + # For internal calculations, we transpose to [time, batch, depth] + + flat_input = nest.flatten(inputs) + + if not time_major: + # (B,T,D) => (T,B,D) + + flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] + + flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input) + + parallel_iterations = parallel_iterations or 32 + + if sequence_length is not None: + + sequence_length = math_ops.to_int32(sequence_length) + + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + + "sequence_length must be a vector of length batch_size, " + + "but saw shape: %s" % sequence_length.get_shape()) + + sequence_length = array_ops.identity( # Just to find it in the graph. + + sequence_length, name="sequence_length") + + # Create a new scope in which the caching device is either + + # determined by the parent scope, or is set to place the cached + + # Variable using the same placement as for the rest of the RNN. + + try: + resue = tf.AUTO_REUSE + except: + resue = tf.compat.v1.AUTO_REUSE + + with vs.variable_scope(scope or "rnn",reuse=resue) as varscope:#TODO:user defined reuse + + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + batch_size = _best_effort_input_batch_size(flat_input) + + if initial_state is not None: + + state = initial_state + + else: + + if not dtype: + raise ValueError("If there is no initial_state, you must give a dtype.") + + state = cell.zero_state(batch_size, dtype) + + def _assert_has_shape(x, shape): + + x_shape = array_ops.shape(x) + + packed_shape = array_ops.stack(shape) + + return control_flow_ops.Assert( + + math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), + + ["Expected shape for Tensor %s is " % x.name, + + packed_shape, " but saw shape: ", x_shape]) + + if sequence_length is not None: + # Perform some shape validation + + with ops.control_dependencies( + + [_assert_has_shape(sequence_length, [batch_size])]): + sequence_length = array_ops.identity( + + sequence_length, name="CheckSeqLen") + + inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + + (outputs, final_state) = _dynamic_rnn_loop( + + cell, + + inputs, + + state, + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory, + + att_scores=att_scores, + + sequence_length=sequence_length, + + dtype=dtype) + + # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. + + # If we are performing batch-major calculations, transpose output back + + # to shape [batch, time, depth] + + if not time_major: + # (T,B,D) => (B,T,D) + + outputs = nest.map_structure(_transpose_batch_time, outputs) + + return (outputs, final_state) + + +def _dynamic_rnn_loop(cell, + + inputs, + + initial_state, + + parallel_iterations, + + swap_memory, + + att_scores=None, + + sequence_length=None, + + dtype=None): + """Internal implementation of Dynamic RNN. + + + + Args: + + cell: An instance of RNNCell. + + inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested + + tuple of such elements. + + initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if + + `cell.state_size` is a tuple, then this should be a tuple of + + tensors having shapes `[batch_size, s] for s in cell.state_size`. 
+ + parallel_iterations: Positive Python int. + + swap_memory: A Python boolean + + sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. + + dtype: (optional) Expected dtype of output. If not specified, inferred from + + initial_state. + + + + Returns: + + Tuple `(final_outputs, final_state)`. + + final_outputs: + + A `Tensor` of shape `[time, batch_size, cell.output_size]`. If + + `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape` + + objects, then this returns a (possibly nsted) tuple of Tensors matching + + the corresponding shapes. + + final_state: + + A `Tensor`, or possibly nested tuple of Tensors, matching in length + + and shapes to `initial_state`. + + + + Raises: + + ValueError: If the input depth cannot be inferred via shape inference + + from the inputs. + + """ + + state = initial_state + + assert isinstance(parallel_iterations, int), "parallel_iterations must be int" + + state_size = cell.state_size + + flat_input = nest.flatten(inputs) + + flat_output_size = nest.flatten(cell.output_size) + + # Construct an initial output + + input_shape = array_ops.shape(flat_input[0]) + + time_steps = input_shape[0] + + batch_size = _best_effort_input_batch_size(flat_input) + + inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3) + + for input_ in flat_input) + + const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2] + + for shape in inputs_got_shape: + + if not shape[2:].is_fully_defined(): + raise ValueError( + + "Input size (depth of inputs) must be accessible via shape inference," + + " but saw value None.") + + got_time_steps = shape[0] + + got_batch_size = shape[1] + + if const_time_steps != got_time_steps: + raise ValueError( + + "Time steps is not the same for all the elements in the input in a " + + "batch.") + + if const_batch_size != got_batch_size: + raise ValueError( + + "Batch_size is not the same for all the elements in the input.") + + # Prepare dynamic conditional copying of state & output + + def _create_zero_arrays(size): + + size = _concat(batch_size, size) + + return array_ops.zeros( + + array_ops.stack(size), _infer_state_dtype(dtype, state)) + + flat_zero_output = tuple(_create_zero_arrays(output) + + for output in flat_output_size) + + zero_output = nest.pack_sequence_as(structure=cell.output_size, + + flat_sequence=flat_zero_output) + + if sequence_length is not None: + min_sequence_length = math_ops.reduce_min(sequence_length) + + max_sequence_length = math_ops.reduce_max(sequence_length) + + time = array_ops.constant(0, dtype=dtypes.int32, name="time") + + with ops.name_scope("dynamic_rnn") as scope: + + base_name = scope + + def _create_ta(name, dtype): + + return tensor_array_ops.TensorArray(dtype=dtype, + + size=time_steps, + + tensor_array_name=base_name + name) + + output_ta = tuple(_create_ta("output_%d" % i, + + _infer_state_dtype(dtype, state)) + + for i in range(len(flat_output_size))) + + input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype) + + for i in range(len(flat_input))) + + input_ta = tuple(ta.unstack(input_) + + for ta, input_ in zip(input_ta, flat_input)) + + def _time_step(time, output_ta_t, state, att_scores=None): + + """Take a time step of the dynamic RNN. + + + + Args: + + time: int32 scalar Tensor. + + output_ta_t: List of `TensorArray`s that represent the output. + + state: nested tuple of vector tensors that represent the state. + + + + Returns: + + The tuple (time + 1, output_ta_t with updated flow, new_state). 
+ + """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) diff --git a/modelzoo/FNN/script/contrib/utils.py b/modelzoo/FNN/script/contrib/utils.py new file mode 100644 index 00000000000..692f4ef6e89 --- /dev/null +++ b/modelzoo/FNN/script/contrib/utils.py @@ -0,0 +1,378 @@ +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops.rnn_cell import * +from tensorflow.python.util import nest + +_BIAS_VARIABLE_NAME = "bias" + +_WEIGHTS_VARIABLE_NAME = "kernel" + + +class _Linear_(object): + """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. + + + + Args: + + args: a 2D Tensor or a list of 2D, batch x n, Tensors. + + output_size: int, second dimension of weight variable. + + dtype: data type for variables. + + build_bias: boolean, whether to build a bias variable. + + bias_initializer: starting value to initialize the bias + + (default is all zeros). + + kernel_initializer: starting value to initialize the weight. + + + + Raises: + + ValueError: if inputs_shape is wrong. 
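+
+  Usage sketch (illustrative; `x` and `h` stand for any two `batch x n`
+  tensors): map their concatenation onto `output_size` units:
+
+    linear = _Linear([x, h], output_size, build_bias=True)
+    y = linear([x, h])  # [batch, output_size]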
+ + """ + + def __init__(self, + + args, + + output_size, + + build_bias, + + bias_initializer=None, + + kernel_initializer=None): + + self._build_bias = build_bias + + if args is None or (nest.is_sequence(args) and not args): + raise ValueError("`args` must be specified") + + if not nest.is_sequence(args): + + args = [args] + + self._is_sequence = False + + else: + + self._is_sequence = True + + # Calculate the total size of arguments on dimension 1. + + total_arg_size = 0 + + shapes = [a.get_shape() for a in args] + + for shape in shapes: + + if shape.ndims != 2: + raise ValueError( + "linear is expecting 2D arguments: %s" % shapes) + + if shape[1] is None: + + raise ValueError("linear expects shape[1] to be provided for shape %s, " + + "but saw %s" % (shape, shape[1])) + + else: + + total_arg_size += int(shape[1])#.value + + dtype = [a.dtype for a in args][0] + + scope = vs.get_variable_scope() + + with vs.variable_scope(scope) as outer_scope: + + self._weights = vs.get_variable( + + _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], + + dtype=dtype, + + initializer=kernel_initializer) + + if build_bias: + + with vs.variable_scope(outer_scope) as inner_scope: + + inner_scope.set_partitioner(None) + + if bias_initializer is None: + bias_initializer = init_ops.constant_initializer( + 0.0, dtype=dtype) + + self._biases = vs.get_variable( + + _BIAS_VARIABLE_NAME, [output_size], + + dtype=dtype, + + initializer=bias_initializer) + + def __call__(self, args): + + if not self._is_sequence: + args = [args] + + if len(args) == 1: + + res = math_ops.matmul(args[0], self._weights) + + else: + + res = math_ops.matmul(array_ops.concat(args, 1), self._weights) + + if self._build_bias: + res = nn_ops.bias_add(res, self._biases) + + return res + + +try: + from tensorflow.python.ops.rnn_cell_impl import _Linear +except: + _Linear = _Linear_ + + +class QAAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + + Args: + + num_units: int, The number of units in the GRU cell. + + activation: Nonlinearity to use. Default: `tanh`. + + reuse: (optional) Python boolean describing whether to reuse variables + + in an existing scope. If not `True`, and the existing scope already has + + the given variables, an error is raised. + + kernel_initializer: (optional) The initializer to use for the weight and + + projection matrices. + + bias_initializer: (optional) The initializer to use for the bias. + + """ + + def __init__(self, + + num_units, + + activation=None, + + reuse=None, + + kernel_initializer=None, + + bias_initializer=None): + + super(QAAttGRUCell, self).__init__(_reuse=reuse) + + self._num_units = num_units + + self._activation = activation or math_ops.tanh + + self._kernel_initializer = kernel_initializer + + self._bias_initializer = bias_initializer + + self._gate_linear = None + + self._candidate_linear = None + + @property + def state_size(self): + + return self._num_units + + @property + def output_size(self): + + return self._num_units + + def __call__(self, inputs, state, att_score): + + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + + if self._gate_linear is None: + + bias_ones = self._bias_initializer + + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer( + 1.0, dtype=inputs.dtype) + + with vs.variable_scope("gates"): # Reset gate and update gate. 
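+
+                # A single [inputs, state] -> 2*num_units linear map yields
+                # both the reset gate r and the update gate u; QAAttGRU uses
+                # only r below and lets the external attention score replace
+                # u in the final blend:
+                #   new_h = (1 - att_score) * state + att_score * c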
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + new_h = (1. - att_score) * state + att_score * c + + return new_h, new_h + + +class VecAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + + Args: + + num_units: int, The number of units in the GRU cell. + + activation: Nonlinearity to use. Default: `tanh`. + + reuse: (optional) Python boolean describing whether to reuse variables + + in an existing scope. If not `True`, and the existing scope already has + + the given variables, an error is raised. + + kernel_initializer: (optional) The initializer to use for the weight and + + projection matrices. + + bias_initializer: (optional) The initializer to use for the bias. + + """ + + def __init__(self, + + num_units, + + activation=None, + + reuse=None, + + kernel_initializer=None, + + bias_initializer=None): + + super(VecAttGRUCell, self).__init__(_reuse=reuse) + + self._num_units = num_units + + self._activation = activation or math_ops.tanh + + self._kernel_initializer = kernel_initializer + + self._bias_initializer = bias_initializer + + self._gate_linear = None + + self._candidate_linear = None + + @property + def state_size(self): + + return self._num_units + + @property + def output_size(self): + + return self._num_units + + def __call__(self, inputs, state, att_score): + + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + + if self._gate_linear is None: + + bias_ones = self._bias_initializer + + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer( + 1.0, dtype=inputs.dtype) + + with vs.variable_scope("gates"): # Reset gate and update gate. 
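+
+                # Standard GRU gate computation; VecAttGRU keeps the learned
+                # update gate u but rescales it by the attention score,
+                # u = (1 - att_score) * u, before interpolating between the
+                # previous state and the candidate.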
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + u = (1.0 - att_score) * u + + new_h = u * state + (1 - u) * c + + return new_h, new_h diff --git a/modelzoo/FNN/script/estimator/__init__.py b/modelzoo/FNN/script/estimator/__init__.py new file mode 100644 index 00000000000..cf4f59d6c09 --- /dev/null +++ b/modelzoo/FNN/script/estimator/__init__.py @@ -0,0 +1 @@ +from .models import * \ No newline at end of file diff --git a/modelzoo/FNN/script/estimator/feature_column.py b/modelzoo/FNN/script/estimator/feature_column.py new file mode 100644 index 00000000000..c8d7a6cd013 --- /dev/null +++ b/modelzoo/FNN/script/estimator/feature_column.py @@ -0,0 +1,52 @@ +import tensorflow as tf +from tensorflow.python.feature_column.feature_column import _EmbeddingColumn + +from .utils import LINEAR_SCOPE_NAME, variable_scope, get_collection, get_GraphKeys, input_layer, get_losses + + +def linear_model(features, linear_feature_columns): + if tf.__version__ >= '2.0.0': + linear_logits = tf.compat.v1.feature_column.linear_model(features, linear_feature_columns) + else: + linear_logits = tf.feature_column.linear_model(features, linear_feature_columns) + return linear_logits + + +def get_linear_logit(features, linear_feature_columns, l2_reg_linear=0): + with variable_scope(LINEAR_SCOPE_NAME): + if not linear_feature_columns: + linear_logits = tf.Variable([[0.0]], name='bias_weights') + else: + + linear_logits = linear_model(features, linear_feature_columns) + + if l2_reg_linear > 0: + for var in get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME)[:-1]: + get_losses().add_loss(l2_reg_linear * tf.nn.l2_loss(var, name=var.name.split(":")[0] + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + return linear_logits + + +def input_from_feature_columns(features, feature_columns, l2_reg_embedding=0.0): + dense_value_list = [] + sparse_emb_list = [] + for feat in feature_columns: + if is_embedding(feat): + sparse_emb = tf.expand_dims(input_layer(features, [feat]), axis=1) + sparse_emb_list.append(sparse_emb) + if l2_reg_embedding > 0: + get_losses().add_loss(l2_reg_embedding * tf.nn.l2_loss(sparse_emb, name=feat.name + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + + else: + dense_value_list.append(input_layer(features, [feat])) + + return sparse_emb_list, dense_value_list + + +def is_embedding(feature_column): + try: + from tensorflow.python.feature_column.feature_column_v2 import EmbeddingColumn + except ImportError: + EmbeddingColumn = _EmbeddingColumn + return isinstance(feature_column, (_EmbeddingColumn, EmbeddingColumn)) diff --git a/modelzoo/FNN/script/estimator/inputs.py b/modelzoo/FNN/script/estimator/inputs.py new file mode 100644 index 00000000000..2c175a9934e --- /dev/null +++ b/modelzoo/FNN/script/estimator/inputs.py @@ -0,0 +1,52 @@ +import tensorflow as tf + + +def input_fn_pandas(df, features, label=None, batch_size=256, num_epochs=1, shuffle=False, 
queue_capacity_factor=10, + num_threads=1): + if label is not None: + y = df[label] + else: + y = None + if tf.__version__ >= "2.0.0": + return tf.compat.v1.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, + num_epochs=num_epochs, + shuffle=shuffle, + queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + return tf.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, num_epochs=num_epochs, + shuffle=shuffle, queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + +def input_fn_tfrecord(filenames, feature_description, label=None, batch_size=256, num_epochs=1, num_parallel_calls=8, + shuffle_factor=10, prefetch_factor=1, + ): + def _parse_examples(serial_exmp): + try: + features = tf.parse_single_example(serial_exmp, features=feature_description) + except AttributeError: + features = tf.io.parse_single_example(serial_exmp, features=feature_description) + if label is not None: + labels = features.pop(label) + return features, labels + return features + + def input_fn(): + dataset = tf.data.TFRecordDataset(filenames) + dataset = dataset.map(_parse_examples, num_parallel_calls=num_parallel_calls) + if shuffle_factor > 0: + dataset = dataset.shuffle(buffer_size=batch_size * shuffle_factor) + + dataset = dataset.repeat(num_epochs).batch(batch_size) + + if prefetch_factor > 0: + dataset = dataset.prefetch(buffer_size=batch_size * prefetch_factor) + try: + iterator = dataset.make_one_shot_iterator() + except AttributeError: + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + + return iterator.get_next() + + return input_fn diff --git a/modelzoo/FNN/script/estimator/utils.py b/modelzoo/FNN/script/estimator/utils.py new file mode 100644 index 00000000000..5d722515f6b --- /dev/null +++ b/modelzoo/FNN/script/estimator/utils.py @@ -0,0 +1,217 @@ +import tensorflow as tf +from tensorflow.python.estimator.canned.head import _Head +from tensorflow.python.estimator.canned.optimizers import get_optimizer_instance + +LINEAR_SCOPE_NAME = 'linear' +DNN_SCOPE_NAME = 'dnn' + + +def _summary_key(head_name, val): + return '%s/%s' % (val, head_name) if head_name else val + + +class Head(_Head): + + def __init__(self, task, + name=None): + self._task = task + self._name = name + + @property + def name(self): + return self._name + + @property + def logits_dimension(self): + return 1 + + def _eval_metric_ops(self, + labels, + logits, + predictions, + unweighted_loss, + weights=None): + + labels = to_float(labels) + predictions = to_float(predictions) + + # with name_scope(None, 'metrics', (labels, logits, predictions, + # unweighted_loss, weights)): + metrics = get_metrics() + losses = get_losses() + + metric_ops = { + _summary_key(self._name, "prediction/mean"): metrics.mean(predictions, weights=weights), + _summary_key(self._name, "label/mean"): metrics.mean(labels, weights=weights), + } + + summary_scalar("prediction/mean", metric_ops[_summary_key(self._name, "prediction/mean")][1]) + summary_scalar("label/mean", metric_ops[_summary_key(self._name, "label/mean")][1]) + + + mean_loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.MEAN) + + if self._task == "binary": + metric_ops[_summary_key(self._name, "LogLoss")] = metrics.mean(mean_loss, weights=weights, ) + summary_scalar("LogLoss", mean_loss) + + metric_ops[_summary_key(self._name, "AUC")] = metrics.auc(labels, predictions, weights=weights) + summary_scalar("AUC", metric_ops[_summary_key(self._name, "AUC")][1]) 
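+        # Regression task: report MSE and MAE instead of LogLoss/AUC.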
+ else: + + metric_ops[_summary_key(self._name, "MSE")] = metrics.mean_squared_error(labels, predictions, + weights=weights) + summary_scalar("MSE", mean_loss) + + metric_ops[_summary_key(self._name, "MAE")] = metrics.mean_absolute_error(labels, predictions, + weights=weights) + summary_scalar("MAE", metric_ops[_summary_key(self._name, "MAE")][1]) + + return metric_ops + + def create_loss(self, features, mode, logits, labels): + del mode, features # Unused for this head. + losses = get_losses() + if self._task == "binary": + loss = losses.sigmoid_cross_entropy(labels, logits, reduction=losses.Reduction.NONE) + else: + loss = losses.mean_squared_error(labels, logits, reduction=losses.Reduction.NONE) + return loss + + def create_estimator_spec( + self, features, mode, logits, labels=None, train_op_fn=None, training_chief_hooks=None): + # with name_scope('head'): + logits = tf.reshape(logits, [-1, 1]) + if self._task == 'binary': + pred = tf.sigmoid(logits) + else: + pred = logits + + predictions = {"pred": pred, "logits": logits} + export_outputs = {"predict": tf.estimator.export.PredictOutput(predictions)} + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + export_outputs=export_outputs) + + labels = tf.reshape(labels, [-1, 1]) + + unweighted_loss = self.create_loss(features, mode, logits, labels) + + losses = get_losses() + loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.SUM) + reg_loss = losses.get_regularization_loss() + + training_loss = loss + reg_loss + + eval_metric_ops = self._eval_metric_ops(labels, logits, pred, unweighted_loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=training_loss, + train_op=train_op_fn(training_loss), + eval_metric_ops=eval_metric_ops, + training_chief_hooks=training_chief_hooks) + + +def deepctr_model_fn(features, mode, logits, labels, task, linear_optimizer, dnn_optimizer, training_chief_hooks): + linear_optimizer = get_optimizer_instance(linear_optimizer, 0.005) + dnn_optimizer = get_optimizer_instance(dnn_optimizer, 0.01) + train_op_fn = get_train_op_fn(linear_optimizer, dnn_optimizer) + + head = Head(task) + return head.create_estimator_spec(features=features, + mode=mode, + labels=labels, + train_op_fn=train_op_fn, + logits=logits, training_chief_hooks=training_chief_hooks) + + +def get_train_op_fn(linear_optimizer, dnn_optimizer): + def _train_op_fn(loss): + train_ops = [] + try: + global_step = tf.train.get_global_step() + except AttributeError: + global_step = tf.compat.v1.train.get_global_step() + linear_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME) + dnn_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, DNN_SCOPE_NAME) + + if len(dnn_var_list) > 0: + train_ops.append( + dnn_optimizer.minimize( + loss, + var_list=dnn_var_list)) + if len(linear_var_list) > 0: + train_ops.append( + linear_optimizer.minimize( + loss, + var_list=linear_var_list)) + + train_op = tf.group(*train_ops) + with tf.control_dependencies([train_op]): + try: + return tf.assign_add(global_step, 1).op + except AttributeError: + return tf.compat.v1.assign_add(global_step, 1).op + + return _train_op_fn + + +def variable_scope(name_or_scope): + try: + return tf.variable_scope(name_or_scope) + except AttributeError: + return tf.compat.v1.variable_scope(name_or_scope) + +def get_collection(key, scope=None): + try: + return tf.get_collection(key, scope=scope) + except 
AttributeError: + return tf.compat.v1.get_collection(key, scope=scope) + + +def get_GraphKeys(): + try: + return tf.GraphKeys + except AttributeError: + return tf.compat.v1.GraphKeys + + +def get_losses(): + try: + return tf.compat.v1.losses + except AttributeError: + return tf.losses + + +def input_layer(features, feature_columns): + try: + return tf.feature_column.input_layer(features, feature_columns) + except AttributeError: + return tf.compat.v1.feature_column.input_layer(features, feature_columns) + + +def get_metrics(): + try: + return tf.compat.v1.metrics + except AttributeError: + return tf.metrics + + +def to_float(x, name="ToFloat"): + try: + return tf.to_float(x, name) + except AttributeError: + return tf.compat.v1.to_float(x, name) + + +def summary_scalar(name, data): + try: + tf.summary.scalar(name, data) + except AttributeError: # tf version 2.5.0+:AttributeError: module 'tensorflow._api.v2.summary' has no attribute 'scalar' + tf.compat.v1.summary.scalar(name, data) \ No newline at end of file diff --git a/modelzoo/FNN/script/feature_column.py b/modelzoo/FNN/script/feature_column.py new file mode 100644 index 00000000000..0569e32d3c3 --- /dev/null +++ b/modelzoo/FNN/script/feature_column.py @@ -0,0 +1,220 @@ +import tensorflow as tf +from collections import namedtuple, OrderedDict +from copy import copy +from itertools import chain + +from tensorflow.python.keras.initializers import RandomNormal, Zeros +from tensorflow.python.keras.layers import Input, Lambda + +from .inputs import create_embedding_matrix, embedding_lookup, get_dense_input, varlen_embedding_lookup, \ + get_varlen_pooling_list, mergeDict +from .layers import Linear +from .layers.utils import concat_func +#from keras import backend as K +import pandas as pd +import numpy as np + + +DEFAULT_GROUP_NAME = "default_group" + + +class SparseFeat(namedtuple('SparseFeat', + ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer', + 'embedding_name', + 'group_name', 'trainable'])): + __slots__ = () + + def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None, + embedding_name=None, + group_name=DEFAULT_GROUP_NAME, trainable=True): + + if embedding_dim == "auto": + embedding_dim = 6 * int(pow(vocabulary_size, 0.25)) + if embeddings_initializer is None: + embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020) + + + + if embedding_name is None: + embedding_name = name + + return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, + embeddings_initializer, + embedding_name, group_name, trainable) + + def __hash__(self): + return self.name.__hash__() + + +class VarLenSparseFeat(namedtuple('VarLenSparseFeat', + ['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])): + __slots__ = () + + def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True): + return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, combiner, length_name, weight_name, + weight_norm) + + @property + def name(self): + return self.sparsefeat.name + + @property + def vocabulary_size(self): + return self.sparsefeat.vocabulary_size + + @property + def embedding_dim(self): + return self.sparsefeat.embedding_dim + + @property + def use_hash(self): + return self.sparsefeat.use_hash + + @property + def vocabulary_path(self): + return self.sparsefeat.vocabulary_path + + 
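+    # The properties below simply delegate to the wrapped SparseFeat, so a
+    # VarLenSparseFeat can be used anywhere the SparseFeat attributes are read.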
@property + def dtype(self): + return self.sparsefeat.dtype + + @property + def embeddings_initializer(self): + return self.sparsefeat.embeddings_initializer + + @property + def embedding_name(self): + return self.sparsefeat.embedding_name + + @property + def group_name(self): + return self.sparsefeat.group_name + + @property + def trainable(self): + return self.sparsefeat.trainable + + def __hash__(self): + return self.name.__hash__() + + +class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype', 'transform_fn'])): + """ Dense feature + Args: + name: feature name, + dimension: dimension of the feature, default = 1. + dtype: dtype of the feature, default="float32". + transform_fn: If not `None` , a function that can be used to transform + values of the feature. the function takes the input Tensor as its + argument, and returns the output Tensor. + (e.g. lambda x: (x - 3.0) / 4.2). + """ + __slots__ = () + + def __new__(cls, name, dimension=1, dtype="float32", transform_fn=None): + return super(DenseFeat, cls).__new__(cls, name, dimension, dtype, transform_fn) + + def __hash__(self): + return self.name.__hash__() + + # def __eq__(self, other): + # if self.name == other.name: + # return True + # return False + + # def __repr__(self): + # return 'DenseFeat:'+self.name + + +def get_feature_names(feature_columns): + features = build_input_features(feature_columns) + return list(features.keys()) + + +def build_input_features(feature_columns, prefix=''): + input_features = OrderedDict() + for fc in feature_columns: + if isinstance(fc, SparseFeat): + input_features[fc.name] = Input( + shape=(1,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, DenseFeat): + input_features[fc.name] = Input( + shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, VarLenSparseFeat): + input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name, + dtype=fc.dtype) + if fc.weight_name is not None: + input_features[fc.weight_name] = Input(shape=(fc.maxlen, 1), name=prefix + fc.weight_name, + dtype="float32") + if fc.length_name is not None: + input_features[fc.length_name] = Input((1,), name=prefix + fc.length_name, dtype='int32') + + else: + raise TypeError("Invalid feature column type,got", type(fc)) + + return input_features + + +def get_linear_logit(features, feature_columns, units=1, use_bias=False, seed=1024, prefix='linear', + l2_reg=0, sparse_feat_refine_weight=None): + linear_feature_columns = copy(feature_columns) + for i in range(len(linear_feature_columns)): + if isinstance(linear_feature_columns[i], SparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace(embedding_dim=1, + embeddings_initializer=Zeros()) + if isinstance(linear_feature_columns[i], VarLenSparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace( + sparsefeat=linear_feature_columns[i].sparsefeat._replace(embedding_dim=1, + embeddings_initializer=Zeros())) + + linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, + prefix=prefix + str(i))[0] for i in range(units)] + _, dense_input_list = input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix) + + linear_logit_list = [] + for i in range(units): + + if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0: + sparse_input = concat_func(linear_emb_list[i]) + dense_input = concat_func(dense_input_list) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], 
axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias, seed=seed)([sparse_input, dense_input]) + elif len(linear_emb_list[i]) > 0: + sparse_input = concat_func(linear_emb_list[i]) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias, seed=seed)(sparse_input) + elif len(dense_input_list) > 0: + dense_input = concat_func(dense_input_list) + linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias, seed=seed)(dense_input) + else: #empty feature_columns + return Lambda(lambda x: tf.constant([[0.0]]))(list(features.values())[0]) + linear_logit_list.append(linear_logit) + + return concat_func(linear_logit_list) + + +def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, + support_dense=True, support_group=False): + sparse_feature_columns = list( + filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] + + embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, + seq_mask_zero=seq_mask_zero) + group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) + dense_value_list = get_dense_input(features, feature_columns) + if not support_dense and len(dense_value_list) > 0: + raise ValueError("DenseFeat is not supported in dnn_feature_columns") + + sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, varlen_sparse_feature_columns) + group_varlen_sparse_embedding_dict = get_varlen_pooling_list(sequence_embed_dict, features, + varlen_sparse_feature_columns) + group_embedding_dict = mergeDict(group_sparse_embedding_dict, group_varlen_sparse_embedding_dict) + if not support_group: + group_embedding_dict = list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict, dense_value_list diff --git a/modelzoo/FNN/script/inputs.py b/modelzoo/FNN/script/inputs.py new file mode 100644 index 00000000000..d567f846265 --- /dev/null +++ b/modelzoo/FNN/script/inputs.py @@ -0,0 +1,155 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +from collections import defaultdict +from itertools import chain + +from tensorflow.python.keras.layers import Embedding, Lambda +from tensorflow.python.keras.regularizers import l2 + +from .layers.sequence import SequencePoolingLayer, WeightedSequenceLayer +from .layers.utils import Hash + + +def get_inputs_list(inputs): + return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) + + +def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, + prefix='sparse_', seq_mask_zero=True): + sparse_embedding = {} + for feat in sparse_feature_columns: + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2(l2_reg), + name=prefix + '_emb_' + feat.embedding_name) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + + if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: + for feat in varlen_sparse_feature_columns: + # if feat.name not in sparse_embedding: + emb = 
Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2( + l2_reg), + name=prefix + '_seq_emb_' + feat.name, + mask_zero=seq_mask_zero) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + return sparse_embedding + + +def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()): + embedding_vec_list = [] + for fg in sparse_feature_columns: + feat_name = fg.name + if len(return_feat_list) == 0 or feat_name in return_feat_list: + if fg.use_hash: + lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name]) + else: + lookup_idx = input_dict[feat_name] + + embedding_vec_list.append(embedding_dict[feat_name](lookup_idx)) + + return embedding_vec_list + + +def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): + from . import feature_column as fc_lib + + sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.VarLenSparseFeat), feature_columns)) if feature_columns else [] + sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, + l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) + return sparse_emb_dict + + +def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), + mask_feat_list=(), to_list=False): + group_embedding_dict = defaultdict(list) + for fc in sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if (len(return_feat_list) == 0 or feature_name in return_feat_list): + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)( + sparse_input_dict[feature_name]) + else: + lookup_idx = sparse_input_dict[feature_name] + + group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx)) + if to_list: + return list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict + + +def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns): + varlen_embedding_vec_dict = {} + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name]) + else: + lookup_idx = sequence_input_dict[feature_name] + varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx) + return varlen_embedding_vec_dict + + +def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False): + pooling_vec_list = defaultdict(list) + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + combiner = fc.combiner + feature_length_name = fc.length_name + if feature_length_name is not None: + if fc.weight_name is not None: + seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)( + [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=False)( + [seq_input, features[feature_length_name]]) + else: + if 
fc.weight_name is not None: + seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)( + [embedding_dict[feature_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=True)( + seq_input) + pooling_vec_list[fc.group_name].append(vec) + if to_list: + return chain.from_iterable(pooling_vec_list.values()) + return pooling_vec_list + + +def get_dense_input(features, feature_columns): + from . import feature_column as fc_lib + dense_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.DenseFeat), feature_columns)) if feature_columns else [] + dense_input_list = [] + for fc in dense_feature_columns: + if fc.transform_fn is None: + dense_input_list.append(features[fc.name]) + else: + transform_result = Lambda(fc.transform_fn)(features[fc.name]) + dense_input_list.append(transform_result) + return dense_input_list + + +def mergeDict(a, b): + c = defaultdict(list) + for k, v in a.items(): + c[k].extend(v) + for k, v in b.items(): + c[k].extend(v) + return c diff --git a/modelzoo/FNN/script/layers/__init__.py b/modelzoo/FNN/script/layers/__init__.py new file mode 100644 index 00000000000..1bfd40effe7 --- /dev/null +++ b/modelzoo/FNN/script/layers/__init__.py @@ -0,0 +1,52 @@ +import tensorflow as tf + +from .activation import Dice +from .core import DNN, LocalActivationUnit, PredictionLayer +from .interaction import (CIN, FM, AFMLayer, BiInteractionPooling, CrossNet, CrossNetMix, + InnerProductLayer, InteractingLayer, + OutterProductLayer, FGCNNLayer, SENETLayer, BilinearInteraction, + FieldWiseBiInteraction, FwFMLayer, FEFMLayer) +from .normalization import LayerNormalization +from .sequence import (AttentionSequencePoolingLayer, BiasEncoding, BiLSTM, + KMaxPooling, SequencePoolingLayer, WeightedSequenceLayer, + Transformer, DynamicGRU,PositionEncoding) + +from .utils import NoMask, Hash, Linear, _Add, combined_dnn_input, softmax, reduce_sum + +custom_objects = {'tf': tf, + 'InnerProductLayer': InnerProductLayer, + 'OutterProductLayer': OutterProductLayer, + 'DNN': DNN, + 'PredictionLayer': PredictionLayer, + 'FM': FM, + 'AFMLayer': AFMLayer, + 'CrossNet': CrossNet, + 'CrossNetMix': CrossNetMix, + 'BiInteractionPooling': BiInteractionPooling, + 'LocalActivationUnit': LocalActivationUnit, + 'Dice': Dice, + 'SequencePoolingLayer': SequencePoolingLayer, + 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, + 'CIN': CIN, + 'InteractingLayer': InteractingLayer, + 'LayerNormalization': LayerNormalization, + 'BiLSTM': BiLSTM, + 'Transformer': Transformer, + 'NoMask': NoMask, + 'BiasEncoding': BiasEncoding, + 'KMaxPooling': KMaxPooling, + 'FGCNNLayer': FGCNNLayer, + 'Hash': Hash, + 'Linear': Linear, + 'DynamicGRU': DynamicGRU, + 'SENETLayer': SENETLayer, + 'BilinearInteraction': BilinearInteraction, + 'WeightedSequenceLayer': WeightedSequenceLayer, + '_Add': _Add, + 'FieldWiseBiInteraction': FieldWiseBiInteraction, + 'FwFMLayer': FwFMLayer, + 'softmax': softmax, + 'FEFMLayer': FEFMLayer, + 'reduce_sum': reduce_sum, + 'PositionEncoding':PositionEncoding + } diff --git a/modelzoo/FNN/script/layers/activation.py b/modelzoo/FNN/script/layers/activation.py new file mode 100644 index 00000000000..1b953bff8bc --- /dev/null +++ b/modelzoo/FNN/script/layers/activation.py @@ -0,0 +1,85 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf + +try: + from tensorflow.python.ops.init_ops import Zeros +except 
ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros
+from tensorflow.python.keras.layers import Layer, Activation
+
+try:
+    from tensorflow.python.keras.layers import BatchNormalization
+except ImportError:
+    BatchNormalization = tf.keras.layers.BatchNormalization
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+class Dice(Layer):
+    """The Data Adaptive Activation Function in DIN, which can be viewed as a generalization of PReLU and can adaptively adjust the rectified point according to the distribution of input data.
+
+      Input shape
+        - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model.
+
+      Output shape
+        - Same shape as the input.
+
+      Arguments
+        - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis).
+
+        - **epsilon** : Small float added to variance to avoid dividing by zero.
+
+      References
+        - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf)
+    """
+
+    def __init__(self, axis=-1, epsilon=1e-9, **kwargs):
+        self.axis = axis
+        self.epsilon = epsilon
+        super(Dice, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.bn = BatchNormalization(
+            axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+        self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros(
+        ), dtype=tf.float32, name='dice_alpha')  # name='alpha_'+self.name
+        super(Dice, self).build(input_shape)  # Be sure to call this somewhere!
+        self.uses_learning_phase = True
+
+    def call(self, inputs, training=None, **kwargs):
+        inputs_normed = self.bn(inputs, training=training)
+        # tf.layers.batch_normalization(
+        #     inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False)
+        x_p = tf.sigmoid(inputs_normed)
+        return self.alphas * (1.0 - x_p) * inputs + x_p * inputs
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+    def get_config(self, ):
+        config = {'axis': self.axis, 'epsilon': self.epsilon}
+        base_config = super(Dice, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+def activation_layer(activation):
+    if activation in ("dice", "Dice"):
+        act_layer = Dice()
+    elif isinstance(activation, (str, unicode)):
+        act_layer = Activation(activation)
+    elif issubclass(activation, Layer):
+        act_layer = activation()
+    else:
+        raise ValueError(
+            "Invalid activation, found %s. You should use a str or an Activation layer class."
% (activation)) + return act_layer diff --git a/modelzoo/FNN/script/layers/core.py b/modelzoo/FNN/script/layers/core.py new file mode 100644 index 00000000000..668348d2eb7 --- /dev/null +++ b/modelzoo/FNN/script/layers/core.py @@ -0,0 +1,267 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf +from tensorflow.python.keras import backend as K + +try: + from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal +except ImportError: + from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal + +from tensorflow.python.keras.layers import Layer, Dropout + +try: + from tensorflow.python.keras.layers import BatchNormalization +except ImportError: + BatchNormalization = tf.keras.layers.BatchNormalization +from tensorflow.python.keras.regularizers import l2 + +from .activation import activation_layer + + +class LocalActivationUnit(Layer): + """The LocalActivationUnit used in DIN with which the representation of + user interests varies adaptively given different candidate items. + + Input shape + - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` + + Output shape + - 3D tensor with shape: ``(batch_size, T, 1)``. + + Arguments + - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **activation**: Activation function to use in attention net. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. + + - **seed**: A Python integer to use as random seed. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, + **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.seed = seed + super(LocalActivationUnit, self).__init__(**kwargs) + self.supports_masking = True + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) != 2: + raise ValueError('A `LocalActivationUnit` layer should be called ' + 'on a list of 2 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: + raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( + len(input_shape[0]), len(input_shape[1]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: + raise ValueError('A `LocalActivationUnit` layer requires ' + 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' + 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) + size = 4 * \ + int(input_shape[0][-1] + ) if len(self.hidden_units) == 0 else self.hidden_units[-1] + self.kernel = self.add_weight(shape=(size, 1), + initializer=glorot_normal( + seed=self.seed), + name="kernel") + self.bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="bias") + self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed) + + super(LocalActivationUnit, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + query, keys = inputs + + keys_len = keys.get_shape()[1] + queries = K.repeat_elements(query, keys_len, 1) + + att_input = tf.concat( + [queries, keys, queries - keys, queries * keys], axis=-1) + + att_out = self.dnn(att_input, training=training) + + attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias) + + return attention_score + + def compute_output_shape(self, input_shape): + return input_shape[1][:2] + (1,) + + def compute_mask(self, inputs, mask): + return mask + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} + base_config = super(LocalActivationUnit, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DNN(Layer): + """The Multi Layer Percetron + + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. + + Output shape + - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. + + Arguments + - **hidden_units**:list of positive integer, the layer number and units in each layer. + + - **activation**: Activation function to use. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not. + + - **output_activation**: Activation function to use in the last layer.If ``None``,it will be same as ``activation``. 
+ + - **seed**: A Python integer to use as random seed. + """ + + def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, output_activation=None, + seed=1024, **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.output_activation = output_activation + self.seed = seed + + super(DNN, self).__init__(**kwargs) + + def build(self, input_shape): + # if len(self.hidden_units) == 0: + # raise ValueError("hidden_units is empty") + input_size = input_shape[-1] + hidden_units = [int(input_size)] + list(self.hidden_units) + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=( + hidden_units[i], hidden_units[i + 1]), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(len(self.hidden_units))] + self.bias = [self.add_weight(name='bias' + str(i), + shape=(self.hidden_units[i],), + initializer=Zeros(), + trainable=True) for i in range(len(self.hidden_units))] + if self.use_bn: + self.bn_layers = [BatchNormalization() for _ in range(len(self.hidden_units))] + + self.dropout_layers = [Dropout(self.dropout_rate, seed=self.seed + i) for i in + range(len(self.hidden_units))] + + self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] + + if self.output_activation: + self.activation_layers[-1] = activation_layer(self.output_activation) + + super(DNN, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + deep_input = inputs + + for i in range(len(self.hidden_units)): + fc = tf.nn.bias_add(tf.tensordot( + deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) + + if self.use_bn: + fc = self.bn_layers[i](fc, training=training) + try: + fc = self.activation_layers[i](fc, training=training) + except TypeError as e: # TypeError: call() got an unexpected keyword argument 'training' + print("make sure the activation function use training flag properly", e) + fc = self.activation_layers[i](fc) + + fc = self.dropout_layers[i](fc, training=training) + deep_input = fc + + return deep_input + + def compute_output_shape(self, input_shape): + if len(self.hidden_units) > 0: + shape = input_shape[:-1] + (self.hidden_units[-1],) + else: + shape = input_shape + + return tuple(shape) + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, + 'output_activation': self.output_activation, 'seed': self.seed} + base_config = super(DNN, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PredictionLayer(Layer): + """ + Arguments + - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss + + - **use_bias**: bool.Whether add bias term or not. + """ + + def __init__(self, task='binary', use_bias=True, **kwargs): + if task not in ["binary", "multiclass", "regression"]: + raise ValueError("task must be binary,multiclass or regression") + self.task = task + self.use_bias = use_bias + super(PredictionLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if self.use_bias: + self.global_bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="global_bias") + + # Be sure to call this somewhere! 
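+        # global_bias is a single learnable scalar added to every logit in
+        # call(); for task="binary" a sigmoid then maps the logit to a
+        # probability, and the output is reshaped to (batch_size, 1).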
+        super(PredictionLayer, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        x = inputs
+        if self.use_bias:
+            x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC')
+        if self.task == "binary":
+            x = tf.sigmoid(x)
+
+        output = tf.reshape(x, (-1, 1))
+
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (None, 1)
+
+    def get_config(self, ):
+        config = {'task': self.task, 'use_bias': self.use_bias}
+        base_config = super(PredictionLayer, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/modelzoo/FNN/script/layers/interaction.py b/modelzoo/FNN/script/layers/interaction.py
new file mode 100644
index 00000000000..f19be14be9c
--- /dev/null
+++ b/modelzoo/FNN/script/layers/interaction.py
@@ -0,0 +1,1492 @@
+# -*- coding:utf-8 -*-
+"""
+
+Authors:
+    Weichen Shen,weichenswc@163.com,
+    Harshit Pande
+
+"""
+
+import itertools
+
+import tensorflow as tf
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.backend import batch_dot
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, Ones, Constant, TruncatedNormal, \
+        glorot_normal_initializer as glorot_normal, \
+        glorot_uniform_initializer as glorot_uniform
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, Ones, Constant, TruncatedNormal, glorot_normal, glorot_uniform
+
+from tensorflow.python.keras.layers import Layer, MaxPooling2D, Conv2D, Dropout, Lambda, Dense, Flatten
+from tensorflow.python.keras.regularizers import l2
+from tensorflow.python.layers import utils
+
+from .activation import activation_layer
+from .utils import concat_func, reduce_sum, softmax, reduce_mean
+
+
+class AFMLayer(Layer):
+    """Attentional Factorization Machine models pairwise (order-2) feature
+    interactions without linear term and bias.
+
+      Input shape
+        - A list of 3D tensors with shape: ``(batch_size,1,embedding_size)``.
+
+      Output shape
+        - 2D tensor with shape: ``(batch_size, 1)``.
+
+      Arguments
+        - **attention_factor** : Positive integer, dimensionality of the
+         attention network output space.
+
+        - **l2_reg_w** : float between 0 and 1. L2 regularizer strength
+         applied to the attention network.
+
+        - **dropout_rate** : float in [0,1). Fraction of the attention net output units to dropout.
+
+        - **seed** : A Python integer to use as random seed.
+ + References + - [Attentional Factorization Machines : Learning the Weight of Feature + Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf) + """ + + def __init__(self, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, **kwargs): + self.attention_factor = attention_factor + self.l2_reg_w = l2_reg_w + self.dropout_rate = dropout_rate + self.seed = seed + super(AFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + # input_shape = input_shape[0] + # if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + shape_set = set() + reduced_input_shape = [shape.as_list() for shape in input_shape] + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_input_shape[i])) + + if len(shape_set) > 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs of a list with same shape tensor like\ + (None, 1, embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + + embedding_size = int(input_shape[0][-1]) + + self.attention_W = self.add_weight(shape=(embedding_size, + self.attention_factor), initializer=glorot_normal(seed=self.seed), + regularizer=l2(self.l2_reg_w), name="attention_W") + self.attention_b = self.add_weight( + shape=(self.attention_factor,), initializer=Zeros(), name="attention_b") + self.projection_h = self.add_weight(shape=(self.attention_factor, 1), + initializer=glorot_normal(seed=self.seed), name="projection_h") + self.projection_p = self.add_weight(shape=( + embedding_size, 1), initializer=glorot_normal(seed=self.seed), name="projection_p") + self.dropout = Dropout( + self.dropout_rate, seed=self.seed) + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! 
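+        # Shapes of the parameters created above: attention_W
+        # (embedding_size, attention_factor) and attention_b
+        # (attention_factor,) form the attention net, projection_h
+        # (attention_factor, 1) scores each pairwise interaction, and
+        # projection_p (embedding_size, 1) maps the attended vector to the
+        # final logit.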
+ super(AFMLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embeds_vec_list = inputs + row = [] + col = [] + + for r, c in itertools.combinations(embeds_vec_list, 2): + row.append(r) + col.append(c) + + p = tf.concat(row, axis=1) + q = tf.concat(col, axis=1) + inner_product = p * q + + bi_interaction = inner_product + attention_temp = tf.nn.relu(tf.nn.bias_add(tf.tensordot( + bi_interaction, self.attention_W, axes=(-1, 0)), self.attention_b)) + # Dense(self.attention_factor,'relu',kernel_regularizer=l2(self.l2_reg_w))(bi_interaction) + self.normalized_att_score = softmax(tf.tensordot( + attention_temp, self.projection_h, axes=(-1, 0)), dim=1) + attention_output = reduce_sum( + self.normalized_att_score * bi_interaction, axis=1) + + attention_output = self.dropout(attention_output, training=training) # training + + afm_out = self.tensordot([attention_output, self.projection_p]) + return afm_out + + def compute_output_shape(self, input_shape): + + if not isinstance(input_shape, list): + raise ValueError('A `AFMLayer` layer should be called ' + 'on a list of inputs.') + return (None, 1) + + def get_config(self, ): + config = {'attention_factor': self.attention_factor, + 'l2_reg_w': self.l2_reg_w, 'dropout_rate': self.dropout_rate, 'seed': self.seed} + base_config = super(AFMLayer, self).get_config() + base_config.update(config) + return base_config + + +class BiInteractionPooling(Layer): + """Bi-Interaction Layer used in Neural FM,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + References + - [He X, Chua T S. Neural factorization machines for sparse predictive analytics[C]//Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017: 355-364.](http://arxiv.org/abs/1708.05027) + """ + + def __init__(self, **kwargs): + + super(BiInteractionPooling, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + + super(BiInteractionPooling, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + concated_embeds_value = inputs + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = 0.5 * (square_of_sum - sum_of_square) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1, input_shape[-1]) + + +class CIN(Layer): + """Compressed Interaction Network used in xDeepFM.This implemention is + adapted from code that the author of the paper published on https://github.com/Leavingseason/xDeepFM. + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, featuremap_num)`` ``featuremap_num = sum(self.layer_size[:-1]) // 2 + self.layer_size[-1]`` if ``split_half=True``,else ``sum(layer_size)`` . 
+ + Arguments + - **layer_size** : list of int.Feature maps in each layer. + + - **activation** : activation function used on feature maps. + + - **split_half** : bool.if set to False, half of the feature maps in each hidden will connect to output unit. + + - **seed** : A Python integer to use as random seed. + + References + - [Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. arXiv preprint arXiv:1803.05170, 2018.] (https://arxiv.org/pdf/1803.05170.pdf) + """ + + def __init__(self, layer_size=(128, 128), activation='relu', split_half=True, l2_reg=1e-5, seed=1024, **kwargs): + if len(layer_size) == 0: + raise ValueError( + "layer_size must be a list(tuple) of length greater than 1") + self.layer_size = layer_size + self.split_half = split_half + self.activation = activation + self.l2_reg = l2_reg + self.seed = seed + super(CIN, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + + self.field_nums = [int(input_shape[1])] + self.filters = [] + self.bias = [] + for i, size in enumerate(self.layer_size): + + self.filters.append(self.add_weight(name='filter' + str(i), + shape=[1, self.field_nums[-1] + * self.field_nums[0], size], + dtype=tf.float32, initializer=glorot_uniform( + seed=self.seed + i), + regularizer=l2(self.l2_reg))) + + self.bias.append(self.add_weight(name='bias' + str(i), shape=[size], dtype=tf.float32, + initializer=Zeros())) + + if self.split_half: + if i != len(self.layer_size) - 1 and size % 2 > 0: + raise ValueError( + "layer_size must be even number except for the last layer when split_half=True") + + self.field_nums.append(size // 2) + else: + self.field_nums.append(size) + + self.activation_layers = [activation_layer( + self.activation) for _ in self.layer_size] + + super(CIN, self).build(input_shape) # Be sure to call this somewhere! 
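+    # Reading aid for call() below: at layer k it (1) forms the outer
+    # products of the base field maps and the previous layer's maps via
+    # per-dimension tf.split + tf.matmul, (2) compresses the resulting
+    # field_nums[0] * field_nums[k] interaction maps down to layer_size[k]
+    # maps with a 1-D convolution, and (3) with split_half=True routes half
+    # of the maps to the next layer and half directly to the output (the
+    # last layer connects entirely to the output), which is finally summed
+    # over the embedding axis.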
+ + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + dim = int(inputs.get_shape()[-1]) + hidden_nn_layers = [inputs] + final_result = [] + + split_tensor0 = tf.split(hidden_nn_layers[0], dim * [1], 2) + for idx, layer_size in enumerate(self.layer_size): + split_tensor = tf.split(hidden_nn_layers[-1], dim * [1], 2) + + dot_result_m = tf.matmul( + split_tensor0, split_tensor, transpose_b=True) + + dot_result_o = tf.reshape( + dot_result_m, shape=[dim, -1, self.field_nums[0] * self.field_nums[idx]]) + + dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) + + curr_out = tf.nn.conv1d( + dot_result, filters=self.filters[idx], stride=1, padding='VALID') + + curr_out = tf.nn.bias_add(curr_out, self.bias[idx]) + + curr_out = self.activation_layers[idx](curr_out) + + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + if self.split_half: + if idx != len(self.layer_size) - 1: + next_hidden, direct_connect = tf.split( + curr_out, 2 * [layer_size // 2], 1) + else: + direct_connect = curr_out + next_hidden = 0 + else: + direct_connect = curr_out + next_hidden = curr_out + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + + result = tf.concat(final_result, axis=1) + result = reduce_sum(result, -1, keep_dims=False) + + return result + + def compute_output_shape(self, input_shape): + if self.split_half: + featuremap_num = sum( + self.layer_size[:-1]) // 2 + self.layer_size[-1] + else: + featuremap_num = sum(self.layer_size) + return (None, featuremap_num) + + def get_config(self, ): + + config = {'layer_size': self.layer_size, 'split_half': self.split_half, 'activation': self.activation, + 'seed': self.seed} + base_config = super(CIN, self).get_config() + base_config.update(config) + return base_config + + +class CrossNet(Layer): + """The Cross Network part of Deep&Cross Network model, + which leans both low and high degree cross feature. + + Input shape + - 2D tensor with shape: ``(batch_size, units)``. + + Output shape + - 2D tensor with shape: ``(batch_size, units)``. + + Arguments + - **layer_num**: Positive integer, the cross layer number + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix + + - **parameterization**: string, ``"vector"`` or ``"matrix"`` , way to parameterize the cross network. + + - **seed**: A Python integer to use as random seed. + + References + - [Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]//Proceedings of the ADKDD'17. 
ACM, 2017: 12.](https://arxiv.org/abs/1708.05123) + """ + + def __init__(self, layer_num=2, parameterization='vector', l2_reg=0, seed=1024, **kwargs): + self.layer_num = layer_num + self.parameterization = parameterization + self.l2_reg = l2_reg + self.seed = seed + print('CrossNet parameterization:', self.parameterization) + super(CrossNet, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + if self.parameterization == 'vector': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, 1), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + elif self.parameterization == 'matrix': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, dim), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNet, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + if self.parameterization == 'vector': + xl_w = tf.tensordot(x_l, self.kernels[i], axes=(1, 0)) + dot_ = tf.matmul(x_0, xl_w) + x_l = dot_ + self.bias[i] + x_l + elif self.parameterization == 'matrix': + xl_w = tf.einsum('ij,bjk->bik', self.kernels[i], x_l) # W * xi (bs, dim, 1) + dot_ = xl_w + self.bias[i] # W * xi + b + x_l = x_0 * dot_ + x_l # x0 · (W * xi + b) +xl Hadamard-product + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'layer_num': self.layer_num, 'parameterization': self.parameterization, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNet, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class CrossNetMix(Layer): + """The Cross Network part of DCN-Mix model, which improves DCN-M by: + 1 add MOE to learn feature interactions in different subspaces + 2 add nonlinear transformations in low-dimensional space + + Input shape + - 2D tensor with shape: ``(batch_size, units)``. + + Output shape + - 2D tensor with shape: ``(batch_size, units)``. + + Arguments + - **low_rank** : Positive integer, dimensionality of low-rank sapce. + + - **num_experts** : Positive integer, number of experts. + + - **layer_num**: Positive integer, the cross layer number + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix + + - **seed**: A Python integer to use as random seed. + + References + - [Wang R, Shivanna R, Cheng D Z, et al. DCN-M: Improved Deep & Cross Network for Feature Cross Learning in Web-scale Learning to Rank Systems[J]. 
2020.](https://arxiv.org/abs/2008.13535) + """ + + def __init__(self, low_rank=32, num_experts=4, layer_num=2, l2_reg=0, seed=1024, **kwargs): + self.low_rank = low_rank + self.num_experts = num_experts + self.layer_num = layer_num + self.l2_reg = l2_reg + self.seed = seed + super(CrossNetMix, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + + # U: (dim, low_rank) + self.U_list = [self.add_weight(name='U_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # V: (dim, low_rank) + self.V_list = [self.add_weight(name='V_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # C: (low_rank, low_rank) + self.C_list = [self.add_weight(name='C_list' + str(i), + shape=(self.num_experts, self.low_rank, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + + self.gating = [Dense(1, use_bias=False) for i in range(self.num_experts)] + + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNetMix, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + output_of_experts = [] + gating_score_of_experts = [] + for expert_id in range(self.num_experts): + # (1) G(x_l) + # compute the gating score by x_l + gating_score_of_experts.append(self.gating[expert_id](tf.squeeze(x_l, axis=2))) + + # (2) E(x_l) + # project the input x_l to $\mathbb{R}^{r}$ + v_x = tf.einsum('ij,bjk->bik', tf.transpose(self.V_list[i][expert_id]), x_l) # (bs, low_rank, 1) + + # nonlinear activation in low rank space + v_x = tf.nn.tanh(v_x) + v_x = tf.einsum('ij,bjk->bik', self.C_list[i][expert_id], v_x) # (bs, low_rank, 1) + v_x = tf.nn.tanh(v_x) + + # project back to $\mathbb{R}^{d}$ + uv_x = tf.einsum('ij,bjk->bik', self.U_list[i][expert_id], v_x) # (bs, dim, 1) + + dot_ = uv_x + self.bias[i] + dot_ = x_0 * dot_ # Hadamard-product + + output_of_experts.append(tf.squeeze(dot_, axis=2)) + + # (3) mixture of low-rank experts + output_of_experts = tf.stack(output_of_experts, 2) # (bs, dim, num_experts) + gating_score_of_experts = tf.stack(gating_score_of_experts, 1) # (bs, num_experts, 1) + moe_out = tf.matmul(output_of_experts, tf.nn.softmax(gating_score_of_experts, 1)) + x_l = moe_out + x_l # (bs, dim, 1) + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'low_rank': self.low_rank, 'num_experts': self.num_experts, 'layer_num': self.layer_num, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNetMix, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class FM(Layer): + """Factorization Machine models pairwise (order-2) feature interactions + without linear term and bias. 
+ + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. + + References + - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) + """ + + def __init__(self, **kwargs): + + super(FM, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + super(FM, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + concated_embeds_value = inputs + + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = square_of_sum - sum_of_square + cross_term = 0.5 * reduce_sum(cross_term, axis=2, keep_dims=False) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1) + + + +class InnerProductLayer(Layer): + """InnerProduct Layer used in PNN that compute the element-wise + product or inner product between feature vectors. + + Input shape + - a list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size, N*(N-1)/2 ,1)`` if use reduce_sum. or 3D tensor with shape: ``(batch_size, N*(N-1)/2, embedding_size )`` if not use reduce_sum. + + Arguments + - **reduce_sum**: bool. Whether return inner product or element-wise product + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf) + """ + + def __init__(self, reduce_sum=True, **kwargs): + self.reduce_sum = reduce_sum + super(InnerProductLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `InnerProductLayer` layer should be called ' + 'on a list of at least 2 inputs') + + reduced_inputs_shapes = [shape.as_list() for shape in input_shape] + shape_set = set() + + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_inputs_shapes[i])) + + if len(shape_set) > 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs of a list with same shape tensor like (None,1,embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + super(InnerProductLayer, self).build( + input_shape) # Be sure to call this somewhere! 
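+    # call() below enumerates all num_inputs * (num_inputs - 1) / 2 field
+    # pairs: row/col hold the pair indices, p and q gather the paired
+    # embeddings, and the element-wise product p * q is summed over the
+    # embedding axis when self.reduce_sum is True.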
+ + def call(self, inputs, **kwargs): + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embed_list = inputs + row = [] + col = [] + num_inputs = len(embed_list) + + for i in range(num_inputs - 1): + for j in range(i + 1, num_inputs): + row.append(i) + col.append(j) + p = tf.concat([embed_list[idx] + for idx in row], axis=1) # batch num_pairs k + q = tf.concat([embed_list[idx] + for idx in col], axis=1) + + inner_product = p * q + if self.reduce_sum: + inner_product = reduce_sum( + inner_product, axis=2, keep_dims=True) + return inner_product + + def compute_output_shape(self, input_shape): + num_inputs = len(input_shape) + num_pairs = int(num_inputs * (num_inputs - 1) / 2) + input_shape = input_shape[0] + embed_size = input_shape[-1] + if self.reduce_sum: + return (input_shape[0], num_pairs, 1) + else: + return (input_shape[0], num_pairs, embed_size) + + def get_config(self, ): + config = {'reduce_sum': self.reduce_sum, } + base_config = super(InnerProductLayer, self).get_config() + base_config.update(config) + return base_config + + +class InteractingLayer(Layer): + """A Layer used in AutoInt that model the correlations between different feature fields by multi-head self-attention mechanism. + + Input shape + - A 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape:``(batch_size,field_size,att_embedding_size * head_num)``. + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **use_res**: bool.Whether or not use standard residual connections before output. + - **seed**: A Python integer to use as random seed. + + References + - [Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.](https://arxiv.org/abs/1810.11921) + """ + + def __init__(self, att_embedding_size=8, head_num=2, use_res=True, scaling=False, seed=1024, **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.use_res = use_res + self.seed = seed + self.scaling = scaling + super(InteractingLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + embedding_size = int(input_shape[-1]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.use_res: + self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + + # Be sure to call this somewhere! 
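+        # W_Query / W_key / W_Value each project the embedding into head_num
+        # attention subspaces of size att_embedding_size (concatenated along
+        # the last axis); W_Res is the optional residual projection added
+        # before the final ReLU in call().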
+ super(InteractingLayer, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + querys = tf.tensordot(inputs, self.W_Query, + axes=(-1, 0)) # None F D*head_num + keys = tf.tensordot(inputs, self.W_key, axes=(-1, 0)) + values = tf.tensordot(inputs, self.W_Value, axes=(-1, 0)) + + # head_num None F D + querys = tf.stack(tf.split(querys, self.head_num, axis=2)) + keys = tf.stack(tf.split(keys, self.head_num, axis=2)) + values = tf.stack(tf.split(values, self.head_num, axis=2)) + + inner_product = tf.matmul( + querys, keys, transpose_b=True) # head_num None F F + if self.scaling: + inner_product /= self.att_embedding_size ** 0.5 + self.normalized_att_scores = softmax(inner_product) + + result = tf.matmul(self.normalized_att_scores, + values) # head_num None F D + result = tf.concat(tf.split(result, self.head_num, ), axis=-1) + result = tf.squeeze(result, axis=0) # None F D*head_num + + if self.use_res: + result += tf.tensordot(inputs, self.W_Res, axes=(-1, 0)) + result = tf.nn.relu(result) + + return result + + def compute_output_shape(self, input_shape): + + return (None, input_shape[1], self.att_embedding_size * self.head_num) + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, 'use_res': self.use_res, + 'seed': self.seed} + base_config = super(InteractingLayer, self).get_config() + base_config.update(config) + return base_config + + +class OutterProductLayer(Layer): + """OutterProduct Layer used in PNN.This implemention is + adapted from code that the author of the paper published on https://github.com/Atomu2014/product-nets. + + Input shape + - A list of N 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 2D tensor with shape:``(batch_size,N*(N-1)/2 )``. + + Arguments + - **kernel_type**: str. The kernel weight matrix type to use,can be mat,vec or num + + - **seed**: A Python integer to use as random seed. + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. 
IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf)
+ """
+
+ def __init__(self, kernel_type='mat', seed=1024, **kwargs):
+ if kernel_type not in ['mat', 'vec', 'num']:
+ raise ValueError("kernel_type must be mat, vec or num")
+ self.kernel_type = kernel_type
+ self.seed = seed
+ super(OutterProductLayer, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+
+ if not isinstance(input_shape, list) or len(input_shape) < 2:
+ raise ValueError('A `OutterProductLayer` layer should be called '
+ 'on a list of at least 2 inputs')
+
+ reduced_inputs_shapes = [shape.as_list() for shape in input_shape]
+ shape_set = set()
+
+ for i in range(len(input_shape)):
+ shape_set.add(tuple(reduced_inputs_shapes[i]))
+
+ if len(shape_set) > 1:
+ raise ValueError('A `OutterProductLayer` layer requires '
+ 'inputs with the same shape. '
+ 'Got different shapes: %s' % (shape_set))
+
+ if len(input_shape[0]) != 3 or input_shape[0][1] != 1:
+ raise ValueError('A `OutterProductLayer` layer requires '
+ 'a list of inputs, each shaped like (None,1,embedding_size). '
+ 'Got shape: %s' % (input_shape[0]))
+ num_inputs = len(input_shape)
+ num_pairs = int(num_inputs * (num_inputs - 1) / 2)
+ input_shape = input_shape[0]
+ embed_size = int(input_shape[-1])
+ if self.kernel_type == 'mat':
+ self.kernel = self.add_weight(shape=(embed_size, num_pairs, embed_size),
+ initializer=glorot_uniform(
+ seed=self.seed),
+ name='kernel')
+ elif self.kernel_type == 'vec':
+ self.kernel = self.add_weight(shape=(num_pairs, embed_size,), initializer=glorot_uniform(self.seed),
+ name='kernel'
+ )
+ elif self.kernel_type == 'num':
+ self.kernel = self.add_weight(
+ shape=(num_pairs, 1), initializer=glorot_uniform(self.seed), name='kernel')
+
+ super(OutterProductLayer, self).build(
+ input_shape) # Be sure to call this somewhere!
+
+ def call(self, inputs, **kwargs):
+
+ if K.ndim(inputs[0]) != 3:
+ raise ValueError(
+ "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))
+
+ embed_list = inputs
+ row = []
+ col = []
+ num_inputs = len(embed_list)
+ for i in range(num_inputs - 1):
+ for j in range(i + 1, num_inputs):
+ row.append(i)
+ col.append(j)
+ p = tf.concat([embed_list[idx]
+ for idx in row], axis=1) # batch num_pairs k
+ q = tf.concat([embed_list[idx] for idx in col], axis=1)
+
+ if self.kernel_type == 'mat':
+ p = tf.expand_dims(p, 1) # batch * 1 * pair * k
+ # p * kernel -> batch * k * pair * k; reduce last axis -> batch * k * pair;
+ # transpose -> batch * pair * k; multiply by q and reduce -> batch * pair
+ kp = reduce_sum(
+ tf.multiply(
+ tf.transpose(
+ reduce_sum(tf.multiply(p, self.kernel), -1),
+ [0, 2, 1]),
+ q),
+ -1)
+ else:
+ # 1 * pair * (k or 1)
+ k = tf.expand_dims(self.kernel, 0)
+ # batch * pair
+ kp = reduce_sum(p * q * k, -1)
+
+ return kp
+
+ def compute_output_shape(self, input_shape):
+ num_inputs = len(input_shape)
+ num_pairs = int(num_inputs * (num_inputs - 1) / 2)
+ return (None, num_pairs)
+
+ def get_config(self, ):
+ config = {'kernel_type': self.kernel_type, 'seed': self.seed}
+ base_config = super(OutterProductLayer, self).get_config()
+ base_config.update(config)
+ return base_config
+
+
+class FGCNNLayer(Layer):
+ """Feature Generation Layer used in FGCNN, including Convolution, MaxPooling and Recombination.
+
+ Input shape
+ - A 3D tensor with shape:``(batch_size,field_size,embedding_size)``.
+ + Output shape + - 3D tensor with shape: ``(batch_size,new_feture_num,embedding_size)``. + + References + - [Liu B, Tang R, Chen Y, et al. Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction[J]. arXiv preprint arXiv:1904.04447, 2019.](https://arxiv.org/pdf/1904.04447) + + """ + + def __init__(self, filters=(14, 16,), kernel_width=(7, 7,), new_maps=(3, 3,), pooling_width=(2, 2), + **kwargs): + if not (len(filters) == len(kernel_width) == len(new_maps) == len(pooling_width)): + raise ValueError("length of argument must be equal") + self.filters = filters + self.kernel_width = kernel_width + self.new_maps = new_maps + self.pooling_width = pooling_width + + super(FGCNNLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + self.conv_layers = [] + self.pooling_layers = [] + self.dense_layers = [] + pooling_shape = input_shape.as_list() + [1, ] + embedding_size = int(input_shape[-1]) + for i in range(1, len(self.filters) + 1): + filters = self.filters[i - 1] + width = self.kernel_width[i - 1] + new_filters = self.new_maps[i - 1] + pooling_width = self.pooling_width[i - 1] + conv_output_shape = self._conv_output_shape( + pooling_shape, (width, 1)) + pooling_shape = self._pooling_output_shape( + conv_output_shape, (pooling_width, 1)) + self.conv_layers.append(Conv2D(filters=filters, kernel_size=(width, 1), strides=(1, 1), + padding='same', + activation='tanh', use_bias=True, )) + self.pooling_layers.append( + MaxPooling2D(pool_size=(pooling_width, 1))) + self.dense_layers.append(Dense(pooling_shape[1] * embedding_size * new_filters, + activation='tanh', use_bias=True)) + + self.flatten = Flatten() + + super(FGCNNLayer, self).build( + input_shape) # Be sure to call this somewhere! 
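The stage-by-stage bookkeeping in `build` above (convolution keeps the field axis, pooling divides it by `pooling_width[i]`, and the recombination `Dense` emits `new_maps[i]` maps per surviving field) can be checked offline. A minimal sketch with hypothetical values, mirroring the arithmetic in `compute_output_shape` below:

```python
# Standalone check of FGCNN's generated-feature count (hypothetical config).
new_maps, pooling_width = (3, 3), (2, 2)

features_num = 26          # e.g. 26 input fields
new_features_num = 0
for i in range(len(pooling_width)):
    pooled = features_num // pooling_width[i]    # field axis after MaxPooling2D
    new_features_num += new_maps[i] * pooled     # maps emitted by the Dense recombination
    features_num = pooled

print(new_features_num)    # 3*13 + 3*6 = 57 generated features
```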
+ + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embedding_size = int(inputs.shape[-1]) + pooling_result = tf.expand_dims(inputs, axis=3) + + new_feature_list = [] + + for i in range(1, len(self.filters) + 1): + new_filters = self.new_maps[i - 1] + + conv_result = self.conv_layers[i - 1](pooling_result) + + pooling_result = self.pooling_layers[i - 1](conv_result) + + flatten_result = self.flatten(pooling_result) + + new_result = self.dense_layers[i - 1](flatten_result) + + new_feature_list.append( + tf.reshape(new_result, (-1, int(pooling_result.shape[1]) * new_filters, embedding_size))) + + new_features = concat_func(new_feature_list, axis=1) + return new_features + + def compute_output_shape(self, input_shape): + + new_features_num = 0 + features_num = input_shape[1] + + for i in range(0, len(self.kernel_width)): + pooled_features_num = features_num // self.pooling_width[i] + new_features_num += self.new_maps[i] * pooled_features_num + features_num = pooled_features_num + + return (None, new_features_num, input_shape[-1]) + + def get_config(self, ): + config = {'kernel_width': self.kernel_width, 'filters': self.filters, 'new_maps': self.new_maps, + 'pooling_width': self.pooling_width} + base_config = super(FGCNNLayer, self).get_config() + base_config.update(config) + return base_config + + def _conv_output_shape(self, input_shape, kernel_size): + # channels_last + space = input_shape[1:-1] + new_space = [] + for i in range(len(space)): + new_dim = utils.conv_output_length( + space[i], + kernel_size[i], + padding='same', + stride=1, + dilation=1) + new_space.append(new_dim) + return ([input_shape[0]] + new_space + [self.filters]) + + def _pooling_output_shape(self, input_shape, pool_size): + # channels_last + + rows = input_shape[1] + cols = input_shape[2] + rows = utils.conv_output_length(rows, pool_size[0], 'valid', + pool_size[0]) + cols = utils.conv_output_length(cols, pool_size[1], 'valid', + pool_size[1]) + return [input_shape[0], rows, cols, input_shape[3]] + + +class SENETLayer(Layer): + """SENETLayer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Arguments + - **reduction_ratio** : Positive integer, dimensionality of the + attention network output space. + + - **seed** : A Python integer to use as random seed. 
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + """ + + def __init__(self, reduction_ratio=3, seed=1024, **kwargs): + self.reduction_ratio = reduction_ratio + + self.seed = seed + super(SENETLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + self.filed_size = len(input_shape) + self.embedding_size = input_shape[0][-1] + reduction_size = max(1, self.filed_size // self.reduction_ratio) + + self.W_1 = self.add_weight(shape=( + self.filed_size, reduction_size), initializer=glorot_normal(seed=self.seed), name="W_1") + self.W_2 = self.add_weight(shape=( + reduction_size, self.filed_size), initializer=glorot_normal(seed=self.seed), name="W_2") + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! + super(SENETLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + inputs = concat_func(inputs, axis=1) + Z = reduce_mean(inputs, axis=-1, ) + + A_1 = tf.nn.relu(self.tensordot([Z, self.W_1])) + A_2 = tf.nn.relu(self.tensordot([A_1, self.W_2])) + V = tf.multiply(inputs, tf.expand_dims(A_2, axis=2)) + + return tf.split(V, self.filed_size, axis=1) + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return [None] * self.filed_size + + def get_config(self, ): + config = {'reduction_ratio': self.reduction_ratio, 'seed': self.seed} + base_config = super(SENETLayer, self).get_config() + base_config.update(config) + return base_config + + +class BilinearInteraction(Layer): + """BilinearInteraction Layer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. Its length is ``filed_size``. + + Output shape + - 3D tensor with shape: ``(batch_size,filed_size*(filed_size-1)/2,embedding_size)``. + + Arguments + - **bilinear_type** : String, types of bilinear functions used in this layer. + + - **seed** : A Python integer to use as random seed. 
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + + """ + + def __init__(self, bilinear_type="interaction", seed=1024, **kwargs): + self.bilinear_type = bilinear_type + self.seed = seed + + super(BilinearInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + embedding_size = int(input_shape[0][-1]) + + if self.bilinear_type == "all": + self.W = self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight") + elif self.bilinear_type == "each": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i)) for i in range(len(input_shape) - 1)] + elif self.bilinear_type == "interaction": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i) + '_' + str(j)) for i, j in + itertools.combinations(range(len(input_shape)), 2)] + else: + raise NotImplementedError + + super(BilinearInteraction, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + n = len(inputs) + if self.bilinear_type == "all": + vidots = [tf.tensordot(inputs[i], self.W, axes=(-1, 0)) for i in range(n)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "each": + vidots = [tf.tensordot(inputs[i], self.W_list[i], axes=(-1, 0)) for i in range(n - 1)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "interaction": + p = [tf.multiply(tf.tensordot(v[0], w, axes=(-1, 0)), v[1]) + for v, w in zip(itertools.combinations(inputs, 2), self.W_list)] + else: + raise NotImplementedError + output = concat_func(p, axis=1) + return output + + def compute_output_shape(self, input_shape): + filed_size = len(input_shape) + embedding_size = input_shape[0][-1] + + return (None, filed_size * (filed_size - 1) // 2, embedding_size) + + def get_config(self, ): + config = {'bilinear_type': self.bilinear_type, 'seed': self.seed} + base_config = super(BilinearInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FieldWiseBiInteraction(Layer): + """Field-Wise Bi-Interaction Layer used in FLEN,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A list of 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size,embedding_size)``. + + Arguments + - **use_bias** : Boolean, if use bias. + - **seed** : A Python integer to use as random seed. 
+ + References + - [FLEN: Leveraging Field for Scalable CTR Prediction](https://arxiv.org/pdf/1911.04690) + + """ + + def __init__(self, use_bias=True, seed=1024, **kwargs): + self.use_bias = use_bias + self.seed = seed + + super(FieldWiseBiInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError( + 'A `Field-Wise Bi-Interaction` layer should be called ' + 'on a list of at least 2 inputs') + + self.num_fields = len(input_shape) + embedding_size = input_shape[0][-1] + + self.kernel_mf = self.add_weight( + name='kernel_mf', + shape=(int(self.num_fields * (self.num_fields - 1) / 2), 1), + initializer=Ones(), + regularizer=None, + trainable=True) + + self.kernel_fm = self.add_weight( + name='kernel_fm', + shape=(self.num_fields, 1), + initializer=Constant(value=0.5), + regularizer=None, + trainable=True) + if self.use_bias: + self.bias_mf = self.add_weight(name='bias_mf', + shape=(embedding_size), + initializer=Zeros()) + self.bias_fm = self.add_weight(name='bias_fm', + shape=(embedding_size), + initializer=Zeros()) + + super(FieldWiseBiInteraction, + self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % + (K.ndim(inputs))) + + field_wise_embeds_list = inputs + + # MF module + field_wise_vectors = tf.concat([ + reduce_sum(field_i_vectors, axis=1, keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ], 1) + + left = [] + right = [] + + for i, j in itertools.combinations(list(range(self.num_fields)), 2): + left.append(i) + right.append(j) + + embeddings_left = tf.gather(params=field_wise_vectors, + indices=left, + axis=1) + embeddings_right = tf.gather(params=field_wise_vectors, + indices=right, + axis=1) + + embeddings_prod = embeddings_left * embeddings_right + field_weighted_embedding = embeddings_prod * self.kernel_mf + h_mf = reduce_sum(field_weighted_embedding, axis=1) + if self.use_bias: + h_mf = tf.nn.bias_add(h_mf, self.bias_mf) + + # FM module + square_of_sum_list = [ + tf.square(reduce_sum(field_i_vectors, axis=1, keep_dims=True)) + for field_i_vectors in field_wise_embeds_list + ] + sum_of_square_list = [ + reduce_sum(field_i_vectors * field_i_vectors, + axis=1, + keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ] + + field_fm = tf.concat([ + square_of_sum - sum_of_square for square_of_sum, sum_of_square in + zip(square_of_sum_list, sum_of_square_list) + ], 1) + + h_fm = reduce_sum(field_fm * self.kernel_fm, axis=1) + if self.use_bias: + h_fm = tf.nn.bias_add(h_fm, self.bias_fm) + + return h_mf + h_fm + + def compute_output_shape(self, input_shape): + return (None, input_shape[0][-1]) + + def get_config(self, ): + config = {'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(FieldWiseBiInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FwFMLayer(Layer): + """Field-weighted Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. 
+ + Arguments + - **num_fields** : integer for number of fields + - **regularizer** : L2 regularizer weight for the field strength parameters of PNN + + References + - [Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising] + https://arxiv.org/pdf/1806.03514.pdf + """ + + def __init__(self, num_fields=4, regularizer=0.000001, **kwargs): + self.num_fields = num_fields + self.regularizer = regularizer + super(FwFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + if input_shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, input_shape[1])) + + self.field_strengths = self.add_weight(name='field_pair_strengths', + shape=(self.num_fields, self.num_fields), + initializer=TruncatedNormal(), + regularizer=l2(self.regularizer), + trainable=True) + + super(FwFMLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + if inputs.shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, inputs.shape[1])) + + pairwise_inner_prods = [] + for fi, fj in itertools.combinations(range(self.num_fields), 2): + # get field strength for pair fi and fj + r_ij = self.field_strengths[fi, fj] + + # get embeddings for the features of both the fields + feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1) + feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1) + + f = tf.scalar_mul(r_ij, batch_dot(feat_embed_i, feat_embed_j, axes=1)) + pairwise_inner_prods.append(f) + + sum_ = tf.add_n(pairwise_inner_prods) + return sum_ + + def compute_output_shape(self, input_shape): + return (None, 1) + + def get_config(self): + config = super(FwFMLayer, self).get_config().copy() + config.update({ + 'num_fields': self.num_fields, + 'regularizer': self.regularizer + }) + return config + + +class FEFMLayer(Layer): + """Field-Embedded Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. 
+ + Output shape + - 2D tensor with shape: + ``(batch_size, (num_fields * (num_fields-1))/2)`` # concatenated FEFM interaction embeddings + + Arguments + - **regularizer** : L2 regularizer weight for the field pair matrix embeddings parameters of FEFM + + References + - [Field-Embedded Factorization Machines for Click-through Rate Prediction] + https://arxiv.org/pdf/2009.09931.pdf + """ + + def __init__(self, regularizer, **kwargs): + self.regularizer = regularizer + super(FEFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + self.num_fields = int(input_shape[1]) + embedding_size = int(input_shape[2]) + + self.field_embeddings = {} + for fi, fj in itertools.combinations(range(self.num_fields), 2): + field_pair_id = str(fi) + "-" + str(fj) + self.field_embeddings[field_pair_id] = self.add_weight(name='field_embeddings' + field_pair_id, + shape=(embedding_size, embedding_size), + initializer=TruncatedNormal(), + regularizer=l2(self.regularizer), + trainable=True) + + super(FEFMLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + pairwise_inner_prods = [] + for fi, fj in itertools.combinations(range(self.num_fields), 2): + field_pair_id = str(fi) + "-" + str(fj) + feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1) + feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1) + field_pair_embed_ij = self.field_embeddings[field_pair_id] + + feat_embed_i_tr = tf.matmul(feat_embed_i, field_pair_embed_ij + tf.transpose(field_pair_embed_ij)) + + f = batch_dot(feat_embed_i_tr, feat_embed_j, axes=1) + pairwise_inner_prods.append(f) + + concat_vec = tf.concat(pairwise_inner_prods, axis=1) + return concat_vec + + def compute_output_shape(self, input_shape): + num_fields = int(input_shape[1]) + return (None, (num_fields * (num_fields - 1)) / 2) + + def get_config(self): + config = super(FEFMLayer, self).get_config().copy() + config.update({ + 'regularizer': self.regularizer, + }) + return config diff --git a/modelzoo/FNN/script/layers/normalization.py b/modelzoo/FNN/script/layers/normalization.py new file mode 100644 index 00000000000..3fceb1257d8 --- /dev/null +++ b/modelzoo/FNN/script/layers/normalization.py @@ -0,0 +1,51 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +from tensorflow.python.keras import backend as K +from tensorflow.python.keras.layers import Layer + +try: + from tensorflow.python.ops.init_ops import Zeros, Ones +except ImportError: + from tensorflow.python.ops.init_ops_v2 import Zeros, Ones + + +class LayerNormalization(Layer): + def __init__(self, axis=-1, eps=1e-9, center=True, + scale=True, **kwargs): + self.axis = axis + self.eps = eps + self.center = center + self.scale = scale + super(LayerNormalization, self).__init__(**kwargs) + + def build(self, input_shape): + self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], + initializer=Ones(), trainable=True) + self.beta = self.add_weight(name='beta', shape=input_shape[-1:], + initializer=Zeros(), trainable=True) + super(LayerNormalization, self).build(input_shape) + + def call(self, inputs): + mean = K.mean(inputs, axis=self.axis, keepdims=True) + variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True) + std = 
K.sqrt(variance + self.eps)
+ outputs = (inputs - mean) / std
+ if self.scale:
+ outputs *= self.gamma
+ if self.center:
+ outputs += self.beta
+ return outputs
+
+ def compute_output_shape(self, input_shape):
+ return input_shape
+
+ def get_config(self, ):
+ config = {'axis': self.axis, 'eps': self.eps, 'center': self.center, 'scale': self.scale}
+ base_config = super(LayerNormalization, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
diff --git a/modelzoo/FNN/script/layers/sequence.py b/modelzoo/FNN/script/layers/sequence.py
new file mode 100644
index 00000000000..45a65915c22
--- /dev/null
+++ b/modelzoo/FNN/script/layers/sequence.py
@@ -0,0 +1,901 @@
+# -*- coding:utf-8 -*-
+"""
+
+Author:
+ Weichen Shen,weichenswc@163.com
+
+"""
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.keras import backend as K
+
+try:
+ from tensorflow.python.ops.init_ops import TruncatedNormal, glorot_uniform_initializer as glorot_uniform, \
+ identity_initializer as identity
+except ImportError:
+ from tensorflow.python.ops.init_ops_v2 import TruncatedNormal, glorot_uniform, identity
+
+from tensorflow.python.keras.layers import LSTM, Lambda, Layer, Dropout
+
+from .core import LocalActivationUnit
+from .normalization import LayerNormalization
+
+if tf.__version__ >= '2.0.0':
+ from ..contrib.rnn_v2 import dynamic_rnn
+else:
+ from ..contrib.rnn import dynamic_rnn
+from ..contrib.utils import QAAttGRUCell, VecAttGRUCell
+from .utils import reduce_sum, reduce_max, div, softmax, reduce_mean
+
+
+class SequencePoolingLayer(Layer):
+ """The SequencePoolingLayer is used to apply a pooling operation (sum, mean or max) on variable-length sequence features/multi-value features.
+
+ Input shape
+ - A list of two tensors [seq_value, seq_len]
+
+ - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``
+
+ - seq_len is a 2D tensor with shape: ``(batch_size, 1)``, indicating the valid length of each sequence.
+
+ Output shape
+ - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.
+
+ Arguments
+ - **mode**: str. Pooling operation to be used; can be ``sum``, ``mean`` or ``max``.
+
+ - **supports_masking**: If True, the input needs to support masking.
+ """
+
+ def __init__(self, mode='mean', supports_masking=False, **kwargs):
+
+ if mode not in ['sum', 'mean', 'max']:
+ raise ValueError("mode must be sum, mean or max")
+ self.mode = mode
+ self.eps = tf.constant(1e-8, tf.float32)
+ super(SequencePoolingLayer, self).__init__(**kwargs)
+
+ self.supports_masking = supports_masking
+
+ def build(self, input_shape):
+ if not self.supports_masking:
+ self.seq_len_max = int(input_shape[0][1])
+ super(SequencePoolingLayer, self).build(
+ input_shape) # Be sure to call this somewhere!
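Before the `call` implementation below, a minimal usage sketch may help; it assumes this module is importable (the import path is hypothetical) and feeds the `[seq_value, seq_len]` pair described in the docstring:

```python
import numpy as np
import tensorflow as tf
# Hypothetical import path; adjust to wherever this package is installed.
# from script.layers.sequence import SequencePoolingLayer

seq_value = tf.constant(np.random.rand(2, 5, 8), dtype=tf.float32)  # (batch, T, embedding_size)
seq_len = tf.constant([[3], [5]], dtype=tf.int32)                   # valid length per sequence

pooled = SequencePoolingLayer(mode='mean')([seq_value, seq_len])
print(pooled.shape)  # (2, 1, 8): one pooled embedding per sequence
```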
+
+ def call(self, seq_value_len_list, mask=None, **kwargs):
+ if self.supports_masking:
+ if mask is None:
+ raise ValueError(
+ "When supports_masking=True, the input must support masking")
+ uiseq_embed_list = seq_value_len_list
+ mask = tf.cast(mask, tf.float32) # tf.to_float(mask)
+ user_behavior_length = reduce_sum(mask, axis=-1, keep_dims=True)
+ mask = tf.expand_dims(mask, axis=2)
+ else:
+ uiseq_embed_list, user_behavior_length = seq_value_len_list
+
+ mask = tf.sequence_mask(user_behavior_length,
+ self.seq_len_max, dtype=tf.float32)
+ mask = tf.transpose(mask, (0, 2, 1))
+
+ embedding_size = uiseq_embed_list.shape[-1]
+
+ mask = tf.tile(mask, [1, 1, embedding_size])
+
+ if self.mode == "max":
+ hist = uiseq_embed_list - (1 - mask) * 1e9
+ return reduce_max(hist, 1, keep_dims=True)
+
+ hist = reduce_sum(uiseq_embed_list * mask, 1, keep_dims=False)
+
+ if self.mode == "mean":
+ hist = div(hist, tf.cast(user_behavior_length, tf.float32) + self.eps)
+
+ hist = tf.expand_dims(hist, axis=1)
+ return hist
+
+ def compute_output_shape(self, input_shape):
+ if self.supports_masking:
+ return (None, 1, input_shape[-1])
+ else:
+ return (None, 1, input_shape[0][-1])
+
+ def compute_mask(self, inputs, mask):
+ return None
+
+ def get_config(self, ):
+ config = {'mode': self.mode, 'supports_masking': self.supports_masking}
+ base_config = super(SequencePoolingLayer, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+
+class WeightedSequenceLayer(Layer):
+ """The WeightedSequenceLayer is used to apply weight scores on variable-length sequence features/multi-value features.
+
+ Input shape
+ - A list of three tensors [seq_value, seq_len, seq_weight]
+
+ - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``
+
+ - seq_len is a 2D tensor with shape: ``(batch_size, 1)``, indicating the valid length of each sequence.
+
+ - seq_weight is a 3D tensor with shape: ``(batch_size, T, 1)``
+
+ Output shape
+ - 3D tensor with shape: ``(batch_size, T, embedding_size)``.
+
+ Arguments
+ - **weight_normalization**: bool. Whether to normalize the weight scores before applying them to the sequence.
+
+ - **supports_masking**: If True, the input needs to support masking.
+ """
+
+ def __init__(self, weight_normalization=True, supports_masking=False, **kwargs):
+ super(WeightedSequenceLayer, self).__init__(**kwargs)
+ self.weight_normalization = weight_normalization
+ self.supports_masking = supports_masking
+
+ def build(self, input_shape):
+ if not self.supports_masking:
+ self.seq_len_max = int(input_shape[0][1])
+ super(WeightedSequenceLayer, self).build(
+ input_shape) # Be sure to call this somewhere!
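The `call` below builds its validity mask with the same `tf.sequence_mask` + transpose idiom as `SequencePoolingLayer`. A small sketch of just that mechanic, with made-up lengths:

```python
import tensorflow as tf

lengths = tf.constant([[2], [4]])                          # (batch, 1) valid lengths
mask = tf.sequence_mask(lengths, maxlen=4, dtype=tf.bool)  # (batch, 1, T)
mask = tf.transpose(mask, (0, 2, 1))                       # (batch, T, 1), aligned with seq_weight
print(mask.numpy()[:, :, 0])
# [[ True  True False False]
#  [ True  True  True  True]]
```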
+ + def call(self, input_list, mask=None, **kwargs): + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + key_input, value_input = input_list + mask = tf.expand_dims(mask[0], axis=2) + else: + key_input, key_length_input, value_input = input_list + mask = tf.sequence_mask(key_length_input, + self.seq_len_max, dtype=tf.bool) + mask = tf.transpose(mask, (0, 2, 1)) + + embedding_size = key_input.shape[-1] + + if self.weight_normalization: + paddings = tf.ones_like(value_input) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(value_input) + value_input = tf.where(mask, value_input, paddings) + + if self.weight_normalization: + value_input = softmax(value_input, dim=1) + + if len(value_input.shape) == 2: + value_input = tf.expand_dims(value_input, axis=2) + value_input = tf.tile(value_input, [1, 1, embedding_size]) + + return tf.multiply(key_input, value_input) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + def compute_mask(self, inputs, mask): + if self.supports_masking: + return mask[0] + else: + return None + + def get_config(self, ): + config = {'weight_normalization': self.weight_normalization, 'supports_masking': self.supports_masking} + base_config = super(WeightedSequenceLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class AttentionSequencePoolingLayer(Layer): + """The Attentional sequence pooling operation used in DIN. + + Input shape + - A list of three tensor: [query,keys,keys_length] + + - query is a 3D tensor with shape: ``(batch_size, 1, embedding_size)`` + + - keys is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - keys_length is a 2D tensor with shape: ``(batch_size, 1)`` + + Output shape + - 3D tensor with shape: ``(batch_size, 1, embedding_size)``. + + Arguments + - **att_hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **att_activation**: Activation function to use in attention net. + + - **weight_normalization**: bool.Whether normalize the attention score of local activation unit. + + - **supports_masking**:If True,the input need to support masking. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, att_hidden_units=(80, 40), att_activation='sigmoid', weight_normalization=False, + return_score=False, + supports_masking=False, **kwargs): + + self.att_hidden_units = att_hidden_units + self.att_activation = att_activation + self.weight_normalization = weight_normalization + self.return_score = return_score + super(AttentionSequencePoolingLayer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + if not isinstance(input_shape, list) or len(input_shape) != 3: + raise ValueError('A `AttentionSequencePoolingLayer` layer should be called ' + 'on a list of 3 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3 or len(input_shape[2]) != 2: + raise ValueError( + "Unexpected inputs dimensions,the 3 tensor dimensions are %d,%d and %d , expect to be 3,3 and 2" % ( + len(input_shape[0]), len(input_shape[1]), len(input_shape[2]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1 or input_shape[2][1] != 1: + raise ValueError('A `AttentionSequencePoolingLayer` layer requires ' + 'inputs of a 3 tensor with shape (None,1,embedding_size),(None,T,embedding_size) and (None,1)' + 'Got different shapes: %s' % (input_shape)) + else: + pass + self.local_att = LocalActivationUnit( + self.att_hidden_units, self.att_activation, l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, ) + super(AttentionSequencePoolingLayer, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, mask=None, training=None, **kwargs): + + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + queries, keys = inputs + key_masks = tf.expand_dims(mask[-1], axis=1) + + else: + + queries, keys, keys_length = inputs + hist_len = keys.get_shape()[1] + key_masks = tf.sequence_mask(keys_length, hist_len) + + attention_score = self.local_att([queries, keys], training=training) + + outputs = tf.transpose(attention_score, (0, 2, 1)) + + if self.weight_normalization: + paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(outputs) + + outputs = tf.where(key_masks, outputs, paddings) + + if self.weight_normalization: + outputs = softmax(outputs) + + if not self.return_score: + outputs = tf.matmul(outputs, keys) + + if tf.__version__ < '1.13.0': + outputs._uses_learning_phase = attention_score._uses_learning_phase + else: + outputs._uses_learning_phase = training is not None + + return outputs + + def compute_output_shape(self, input_shape): + if self.return_score: + return (None, 1, input_shape[1][1]) + else: + return (None, 1, input_shape[0][-1]) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + + config = {'att_hidden_units': self.att_hidden_units, 'att_activation': self.att_activation, + 'weight_normalization': self.weight_normalization, 'return_score': self.return_score, + 'supports_masking': self.supports_masking} + base_config = super(AttentionSequencePoolingLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiLSTM(Layer): + """A multiple layer Bidirectional Residual LSTM Layer. + + Input shape + - 3D tensor with shape ``(batch_size, timesteps, input_dim)``. + + Output shape + - 3D tensor with shape: ``(batch_size, timesteps, units)``. + + Arguments + - **units**: Positive integer, dimensionality of the output space. 
+
+ - **layers**: Positive integer, number of LSTM layers to stack.
+
+ - **res_layers**: Positive integer, number of residual connections used in the last ``res_layers`` layers.
+
+ - **dropout_rate**: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs.
+
+ - **merge_mode**: Mode by which outputs of the forward and backward RNNs will be combined. One of { ``'fw'`` , ``'bw'`` , ``'sum'`` , ``'mul'`` , ``'concat'`` , ``'ave'`` , ``None`` }. If None, the outputs will not be combined, they will be returned as a list.
+
+
+ """
+
+ def __init__(self, units, layers=2, res_layers=0, dropout_rate=0.2, merge_mode='ave', **kwargs):
+
+ if merge_mode not in ['fw', 'bw', 'sum', 'mul', 'ave', 'concat', None]:
+ raise ValueError('Invalid merge mode. '
+ 'Merge mode should be one of '
+ '{"fw","bw","sum", "mul", "ave", "concat", None}')
+
+ self.units = units
+ self.layers = layers
+ self.res_layers = res_layers
+ self.dropout_rate = dropout_rate
+ self.merge_mode = merge_mode
+
+ super(BiLSTM, self).__init__(**kwargs)
+ self.supports_masking = True
+
+ def build(self, input_shape):
+
+ if len(input_shape) != 3:
+ raise ValueError(
+ "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape)))
+ self.fw_lstm = []
+ self.bw_lstm = []
+ for _ in range(self.layers):
+ self.fw_lstm.append(
+ LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True,
+ unroll=True))
+ self.bw_lstm.append(
+ LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True,
+ go_backwards=True, unroll=True))
+
+ super(BiLSTM, self).build(
+ input_shape) # Be sure to call this somewhere!
+
+ def call(self, inputs, mask=None, **kwargs):
+
+ input_fw = inputs
+ input_bw = inputs
+ for i in range(self.layers):
+ output_fw = self.fw_lstm[i](input_fw)
+ output_bw = self.bw_lstm[i](input_bw)
+ output_bw = Lambda(lambda x: K.reverse(
+ x, 1), mask=lambda inputs, mask: mask)(output_bw)
+
+ if i >= self.layers - self.res_layers:
+ output_fw += input_fw
+ output_bw += input_bw
+ input_fw = output_fw
+ input_bw = output_bw
+
+ output_fw = input_fw
+ output_bw = input_bw
+
+ if self.merge_mode == "fw":
+ output = output_fw
+ elif self.merge_mode == "bw":
+ output = output_bw
+ elif self.merge_mode == 'concat':
+ output = K.concatenate([output_fw, output_bw])
+ elif self.merge_mode == 'sum':
+ output = output_fw + output_bw
+ elif self.merge_mode == 'ave':
+ output = (output_fw + output_bw) / 2
+ elif self.merge_mode == 'mul':
+ output = output_fw * output_bw
+ elif self.merge_mode is None:
+ output = [output_fw, output_bw]
+
+ return output
+
+ def compute_output_shape(self, input_shape):
+ if self.merge_mode is None:
+ return [input_shape, input_shape]
+ elif self.merge_mode == 'concat':
+ return input_shape[:-1] + (input_shape[-1] * 2,)
+ else:
+ return input_shape
+
+ def compute_mask(self, inputs, mask):
+ return mask
+
+ def get_config(self, ):
+
+ config = {'units': self.units, 'layers': self.layers,
+ 'res_layers': self.res_layers, 'dropout_rate': self.dropout_rate, 'merge_mode': self.merge_mode}
+ base_config = super(BiLSTM, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+
+class Transformer(Layer):
+ """ Simplified version of the Transformer proposed in 《Attention is all you need》
+
+ Input shape
+ - a list of two 3D tensors with shape ``(batch_size, timesteps, input_dim)`` if ``supports_masking=True``.
+ - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if ``supports_masking=False`` . + + + Output shape + - 3D tensor with shape: ``(batch_size, 1, input_dim)`` if ``output_type='mean'`` or ``output_type='sum'`` , else ``(batch_size, timesteps, input_dim)`` . + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **dropout_rate**: float between 0 and 1. Fraction of the units to drop. + - **use_positional_encoding**: bool. Whether or not use positional_encoding + - **use_res**: bool. Whether or not use standard residual connections before output. + - **use_feed_forward**: bool. Whether or not use pointwise feed foward network. + - **use_layer_norm**: bool. Whether or not use Layer Normalization. + - **blinding**: bool. Whether or not use blinding. + - **seed**: A Python integer to use as random seed. + - **supports_masking**:bool. Whether or not support masking. + - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }. + - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output. + + References + - [Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf) + """ + + def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_positional_encoding=True, use_res=True, + use_feed_forward=True, use_layer_norm=False, blinding=True, seed=1024, supports_masking=False, + attention_type="scaled_dot_product", output_type="mean", **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.num_units = att_embedding_size * head_num + self.use_res = use_res + self.use_feed_forward = use_feed_forward + self.seed = seed + self.use_positional_encoding = use_positional_encoding + self.dropout_rate = dropout_rate + self.use_layer_norm = use_layer_norm + self.blinding = blinding + self.attention_type = attention_type + self.output_type = output_type + super(Transformer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + embedding_size = int(input_shape[0][-1]) + if self.num_units != embedding_size: + raise ValueError( + "att_embedding_size * head_num must equal the last dimension size of inputs,got %d * %d != %d" % ( + self.att_embedding_size, self.head_num, embedding_size)) + self.seq_len_max = int(input_shape[0][-2]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.attention_type == "additive": + self.b = self.add_weight('b', shape=[self.att_embedding_size], dtype=tf.float32, + initializer=glorot_uniform(seed=self.seed)) + self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32, + 
initializer=glorot_uniform(seed=self.seed))
+ # if self.use_res:
+ # self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32,
+ # initializer=TruncatedNormal(seed=self.seed))
+ if self.use_feed_forward:
+ self.fw1 = self.add_weight('fw1', shape=[self.num_units, 4 * self.num_units], dtype=tf.float32,
+ initializer=glorot_uniform(seed=self.seed))
+ self.fw2 = self.add_weight('fw2', shape=[4 * self.num_units, self.num_units], dtype=tf.float32,
+ initializer=glorot_uniform(seed=self.seed))
+
+ self.dropout = Dropout(
+ self.dropout_rate, seed=self.seed)
+ self.ln = LayerNormalization()
+ if self.use_positional_encoding:
+ self.query_pe = PositionEncoding()
+ self.key_pe = PositionEncoding()
+ # Be sure to call this somewhere!
+ super(Transformer, self).build(input_shape)
+
+ def call(self, inputs, mask=None, training=None, **kwargs):
+
+ if self.supports_masking:
+ queries, keys = inputs
+ query_masks, key_masks = mask
+ query_masks = tf.cast(query_masks, tf.float32)
+ key_masks = tf.cast(key_masks, tf.float32)
+ else:
+ queries, keys, query_masks, key_masks = inputs
+
+ query_masks = tf.sequence_mask(
+ query_masks, self.seq_len_max, dtype=tf.float32)
+ key_masks = tf.sequence_mask(
+ key_masks, self.seq_len_max, dtype=tf.float32)
+ query_masks = tf.squeeze(query_masks, axis=1)
+ key_masks = tf.squeeze(key_masks, axis=1)
+
+ if self.use_positional_encoding:
+ queries = self.query_pe(queries)
+ keys = self.key_pe(keys)
+
+ querys = tf.tensordot(queries, self.W_Query,
+ axes=(-1, 0)) # None T_q D*head_num
+ # project values from the raw keys before keys are overwritten by their own projection
+ values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))
+ keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
+
+ # head_num*None T_q D
+ querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
+ keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
+ values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)
+
+ if self.attention_type == "scaled_dot_product":
+ # head_num*None T_q T_k
+ outputs = tf.matmul(querys, keys, transpose_b=True)
+
+ outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
+ elif self.attention_type == "additive":
+ querys_reshaped = tf.expand_dims(querys, axis=-2)
+ keys_reshaped = tf.expand_dims(keys, axis=-3)
+ outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b))
+ outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1)
+ else:
+ raise ValueError("attention_type must be scaled_dot_product or additive")
+
+ key_masks = tf.tile(key_masks, [self.head_num, 1])
+
+ # (h*N, T_q, T_k)
+ key_masks = tf.tile(tf.expand_dims(key_masks, 1),
+ [1, tf.shape(queries)[1], 1])
+
+ paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
+
+ # (h*N, T_q, T_k)
+
+ outputs = tf.where(tf.equal(key_masks, 1), outputs, paddings, )
+ if self.blinding:
+ try:
+ outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
+ :, :, 0] * (-2 ** 32 + 1))
+ except AttributeError:
+ outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
+ :, :, 0] * (-2 ** 32 + 1))
+
+ outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
+ outputs = softmax(outputs)
+ query_masks = tf.tile(query_masks, [self.head_num, 1]) # (h*N, T_q)
+ # (h*N, T_q, T_k)
+ query_masks = tf.tile(tf.expand_dims(
+ query_masks, -1), [1, 1, tf.shape(keys)[1]])
+
+ outputs *= query_masks
+
+ outputs = self.dropout(outputs, training=training)
+ # Weighted sum
+ # ( h*N, T_q, C/h)
+ result = tf.matmul(outputs, values)
+ result = tf.concat(tf.split(result,
self.head_num, axis=0), axis=2) + + if self.use_res: + # tf.tensordot(queries, self.W_Res, axes=(-1, 0)) + result += queries + if self.use_layer_norm: + result = self.ln(result) + + if self.use_feed_forward: + fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0])) + fw1 = self.dropout(fw1, training=training) + fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0]) + if self.use_res: + result += fw2 + if self.use_layer_norm: + result = self.ln(result) + + if self.output_type == "mean": + return reduce_mean(result, axis=1, keep_dims=True) + elif self.output_type == "sum": + return reduce_sum(result, axis=1, keep_dims=True) + else: + return result + + def compute_output_shape(self, input_shape): + + return (None, 1, self.att_embedding_size * self.head_num) + + def compute_mask(self, inputs, mask=None): + return None + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, + 'dropout_rate': self.dropout_rate, 'use_res': self.use_res, + 'use_positional_encoding': self.use_positional_encoding, 'use_feed_forward': self.use_feed_forward, + 'use_layer_norm': self.use_layer_norm, 'seed': self.seed, 'supports_masking': self.supports_masking, + 'blinding': self.blinding, 'attention_type': self.attention_type, 'output_type': self.output_type} + base_config = super(Transformer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PositionEncoding(Layer): + def __init__(self, pos_embedding_trainable=True, + zero_pad=False, + scale=True, **kwargs): + self.pos_embedding_trainable = pos_embedding_trainable + self.zero_pad = zero_pad + self.scale = scale + super(PositionEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + _, T, num_units = input_shape.as_list() # inputs.get_shape().as_list() + # First part of the PE function: sin and cos argument + position_enc = np.array([ + [pos / np.power(10000, 2. * (i // 2) / num_units) for i in range(num_units)] + for pos in range(T)]) + + # Second part, apply the cosine to even columns and sin to odds. + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + if self.zero_pad: + position_enc[0, :] = np.zeros(num_units) + self.lookup_table = self.add_weight("lookup_table", (T, num_units), + initializer=identity(position_enc), + trainable=self.pos_embedding_trainable) + + # Be sure to call this somewhere! + super(PositionEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + _, T, num_units = inputs.get_shape().as_list() + position_ind = tf.expand_dims(tf.range(T), 0) + outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind) + if self.scale: + outputs = outputs * num_units ** 0.5 + return outputs + inputs + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'pos_embedding_trainable': self.pos_embedding_trainable, 'zero_pad': self.zero_pad, + 'scale': self.scale} + base_config = super(PositionEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiasEncoding(Layer): + def __init__(self, sess_max_count, seed=1024, **kwargs): + self.sess_max_count = sess_max_count + self.seed = seed + super(BiasEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. 
+ + if self.sess_max_count == 1: + embed_size = input_shape[2].value + seq_len_max = input_shape[1].value + else: + try: + embed_size = input_shape[0][2].value + seq_len_max = input_shape[0][1].value + except AttributeError: + embed_size = input_shape[0][2] + seq_len_max = input_shape[0][1] + + self.sess_bias_embedding = self.add_weight('sess_bias_embedding', shape=(self.sess_max_count, 1, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.seq_bias_embedding = self.add_weight('seq_bias_embedding', shape=(1, seq_len_max, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.item_bias_embedding = self.add_weight('item_bias_embedding', shape=(1, 1, embed_size), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + + # Be sure to call this somewhere! + super(BiasEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + """ + :param concated_embeds_value: None * field_size * embedding_size + :return: None*1 + """ + transformer_out = [] + for i in range(self.sess_max_count): + transformer_out.append( + inputs[i] + self.item_bias_embedding + self.seq_bias_embedding + self.sess_bias_embedding[i]) + return transformer_out + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'sess_max_count': self.sess_max_count, 'seed': self.seed, } + base_config = super(BiasEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DynamicGRU(Layer): + def __init__(self, num_units=None, gru_type='GRU', return_sequence=True, **kwargs): + + self.num_units = num_units + self.return_sequence = return_sequence + self.gru_type = gru_type + super(DynamicGRU, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + input_seq_shape = input_shape[0] + if self.num_units is None: + self.num_units = input_seq_shape.as_list()[-1] + if self.gru_type == "AGRU": + self.gru_cell = QAAttGRUCell(self.num_units) + elif self.gru_type == "AUGRU": + self.gru_cell = VecAttGRUCell(self.num_units) + else: + try: + self.gru_cell = tf.nn.rnn_cell.GRUCell(self.num_units) # GRUCell + except AttributeError: + self.gru_cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.num_units) + + # Be sure to call this somewhere! 
+ super(DynamicGRU, self).build(input_shape) + + def call(self, input_list): + """ + :param concated_embeds_value: None * field_size * embedding_size + :return: None*1 + """ + if self.gru_type == "GRU" or self.gru_type == "AIGRU": + rnn_input, sequence_length = input_list + att_score = None + else: + rnn_input, sequence_length, att_score = input_list + + rnn_output, hidden_state = dynamic_rnn(self.gru_cell, inputs=rnn_input, att_scores=att_score, + sequence_length=tf.squeeze(sequence_length, + ), dtype=tf.float32, scope=self.name) + if self.return_sequence: + return rnn_output + else: + return tf.expand_dims(hidden_state, axis=1) + + def compute_output_shape(self, input_shape): + rnn_input_shape = input_shape[0] + if self.return_sequence: + return rnn_input_shape + else: + return (None, 1, rnn_input_shape[2]) + + def get_config(self, ): + config = {'num_units': self.num_units, 'gru_type': self.gru_type, 'return_sequence': self.return_sequence} + base_config = super(DynamicGRU, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class KMaxPooling(Layer): + """K Max pooling that selects the k biggest value along the specific axis. + + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. + + Output shape + - nD tensor with shape: ``(batch_size, ..., output_dim)``. + + Arguments + - **k**: positive integer, number of top elements to look for along the ``axis`` dimension. + + - **axis**: positive integer, the dimension to look for elements. + + """ + + def __init__(self, k=1, axis=-1, **kwargs): + + self.k = k + self.axis = axis + super(KMaxPooling, self).__init__(**kwargs) + + def build(self, input_shape): + + if self.axis < 1 or self.axis > len(input_shape): + raise ValueError("axis must be 1~%d,now is %d" % + (len(input_shape), self.axis)) + + if self.k < 1 or self.k > input_shape[self.axis]: + raise ValueError("k must be in 1 ~ %d,now k is %d" % + (input_shape[self.axis], self.k)) + self.dims = len(input_shape) + # Be sure to call this somewhere! + super(KMaxPooling, self).build(input_shape) + + def call(self, inputs): + + # swap the last and the axis dimensions since top_k will be applied along the last dimension + perm = list(range(self.dims)) + perm[-1], perm[self.axis] = perm[self.axis], perm[-1] + shifted_input = tf.transpose(inputs, perm) + + # extract top_k, returns two tensors [values, indices] + top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0] + output = tf.transpose(top_k, perm) + + return output + + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[self.axis] = self.k + return tuple(output_shape) + + def get_config(self, ): + config = {'k': self.k, 'axis': self.axis} + base_config = super(KMaxPooling, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + +# def positional_encoding(inputs, +# pos_embedding_trainable=True, +# zero_pad=False, +# scale=True, +# ): +# '''Sinusoidal Positional_Encoding. +# +# Args: +# +# - inputs: A 2d Tensor with shape of (N, T). +# - num_units: Output dimensionality +# - zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero +# - scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) +# - scope: Optional scope for `variable_scope`. +# - reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 
+# +# Returns: +# +# - A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units' +# ''' +# +# _, T, num_units = inputs.get_shape().as_list() +# # with tf.variable_scope(scope, reuse=reuse): +# position_ind = tf.expand_dims(tf.range(T), 0) +# # First part of the PE function: sin and cos argument +# position_enc = np.array([ +# [pos / np.power(10000, 2. * i / num_units) +# for i in range(num_units)] +# for pos in range(T)]) +# +# # Second part, apply the cosine to even columns and sin to odds. +# position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i +# position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 +# +# # Convert to a tensor +# +# if pos_embedding_trainable: +# lookup_table = K.variable(position_enc, dtype=tf.float32) +# +# if zero_pad: +# lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), +# lookup_table[1:, :]), 0) +# +# outputs = tf.nn.embedding_lookup(lookup_table, position_ind) +# +# if scale: +# outputs = outputs * num_units ** 0.5 +# return outputs + inputs diff --git a/modelzoo/FNN/script/layers/utils.py b/modelzoo/FNN/script/layers/utils.py new file mode 100644 index 00000000000..2be8f3fe5ef --- /dev/null +++ b/modelzoo/FNN/script/layers/utils.py @@ -0,0 +1,302 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" +import tensorflow as tf +from tensorflow.python.keras.layers import Flatten, Concatenate, Layer, Add +from tensorflow.python.ops.lookup_ops import TextFileInitializer + +try: + from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal +except ImportError: + from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal + +from tensorflow.python.keras.regularizers import l2 + +try: + from tensorflow.python.ops.lookup_ops import StaticHashTable +except ImportError: + from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable + + +class NoMask(Layer): + def __init__(self, **kwargs): + super(NoMask, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! + super(NoMask, self).build(input_shape) + + def call(self, x, mask=None, **kwargs): + return x + + def compute_mask(self, inputs, mask): + return None + + +class Hash(Layer): + """Looks up keys in a table when setup `vocabulary_path`, which outputs the corresponding values. + If `vocabulary_path` is not set, `Hash` will hash the input to [0,num_buckets). When `mask_zero` = True, + input value `0` or `0.0` will be set to `0`, and other value will be set in range [1,num_buckets). + + The following snippet initializes a `Hash` with `vocabulary_path` file with the first column as keys and + second column as values: + + * `1,emerson` + * `2,lake` + * `3,palmer` + + >>> hash = Hash( + ... num_buckets=3+1, + ... vocabulary_path=filename, + ... default_value=0) + >>> hash(tf.constant('lake')).numpy() + 2 + >>> hash(tf.constant('lakeemerson')).numpy() + 0 + + Args: + num_buckets: An `int` that is >= 1. The number of buckets or the vocabulary size + 1 + when `vocabulary_path` is setup. + mask_zero: default is False. The `Hash` value will hash input `0` or `0.0` to value `0` when + the `mask_zero` is `True`. `mask_zero` is not used when `vocabulary_path` is setup. + vocabulary_path: default `None`. The `CSV` text file path of the vocabulary hash, which contains + two columns seperated by delimiter `comma`, the first column is the value and the second is + the key. The key data type is `string`, the value data type is `int`. 
The path must + be accessible from wherever `Hash` is initialized. + default_value: default '0'. The default value if a key is missing in the table. + **kwargs: Additional keyword arguments. + """ + + def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs): + self.num_buckets = num_buckets + self.mask_zero = mask_zero + self.vocabulary_path = vocabulary_path + self.default_value = default_value + if self.vocabulary_path: + initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',') + self.hash_table = StaticHashTable(initializer, default_value=self.default_value) + super(Hash, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! + super(Hash, self).build(input_shape) + + def call(self, x, mask=None, **kwargs): + + if x.dtype != tf.string: + zero = tf.as_string(tf.zeros([1], dtype=x.dtype)) + x = tf.as_string(x, ) + else: + zero = tf.as_string(tf.zeros([1], dtype='int32')) + + if self.vocabulary_path: + hash_x = self.hash_table.lookup(x) + return hash_x + + num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1 + try: + hash_x = tf.string_to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + except AttributeError: + hash_x = tf.strings.to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + if self.mask_zero: + mask = tf.cast(tf.not_equal(x, zero), dtype='int64') + hash_x = (hash_x + 1) * mask + + return hash_x + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, + 'default_value': self.default_value} + base_config = super(Hash, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Linear(Layer): + + def __init__(self, l2_reg=0.0, mode=0, use_bias=False, seed=1024, **kwargs): + + self.l2_reg = l2_reg + # self.l2_reg = tf.contrib.layers.l2_regularizer(float(l2_reg_linear)) + if mode not in [0, 1, 2]: + raise ValueError("mode must be 0,1 or 2") + self.mode = mode + self.use_bias = use_bias + self.seed = seed + super(Linear, self).__init__(**kwargs) + + def build(self, input_shape): + if self.use_bias: + self.bias = self.add_weight(name='linear_bias', + shape=(1,), + initializer=Zeros(), + trainable=True) + if self.mode == 1: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + elif self.mode == 2: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[1][-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + + super(Linear, self).build(input_shape) # Be sure to call this somewhere! 
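+
+ # Descriptive note (inferred from build/call below): `mode` selects the expected inputs:
+ # mode 0 - sparse embeddings only; logit = reduce_sum(sparse, axis=-1)
+ # mode 1 - dense features only; logit = dense @ kernel
+ # mode 2 - a (sparse, dense) pair; logit = reduce_sum(sparse, axis=-1) + dense @ kernel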
+ + def call(self, inputs, **kwargs): + if self.mode == 0: + sparse_input = inputs + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=True) + elif self.mode == 1: + dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = fc + else: + sparse_input, dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=False) + fc + if self.use_bias: + linear_logit += self.bias + + return linear_logit + + def compute_output_shape(self, input_shape): + return (None, 1) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + config = {'mode': self.mode, 'l2_reg': self.l2_reg, 'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(Linear, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def concat_func(inputs, axis=-1, mask=False): + if not mask: + inputs = list(map(NoMask(), inputs)) + if len(inputs) == 1: + return inputs[0] + else: + return Concatenate(axis=axis)(inputs) + + +def reduce_mean(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_mean(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_mean(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_sum(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_sum(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_sum(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_max(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_max(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_max(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def div(x, y, name=None): + try: + return tf.div(x, y, name=name) + except AttributeError: + return tf.divide(x, y, name=name) + + +def softmax(logits, dim=-1, name=None): + try: + return tf.nn.softmax(logits, dim=dim, name=name) + except TypeError: + return tf.nn.softmax(logits, axis=dim, name=name) + + +class _Add(Layer): + def __init__(self, **kwargs): + super(_Add, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! 
+ super(_Add, self).build(input_shape)
+
+ def call(self, inputs, **kwargs):
+ # if not isinstance(inputs, list):
+ # return inputs
+ # if len(inputs) == 1:
+ # return inputs[0]
+ if len(inputs) == 0:
+ return tf.constant([[0.0]])
+
+ return Add()(inputs)
+
+
+def add_func(inputs):
+ if not isinstance(inputs, list):
+ return inputs
+ if len(inputs) == 1:
+ return inputs[0]
+ return _Add()(inputs)
+
+
+def combined_dnn_input(sparse_embedding_list, dense_value_list):
+ if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
+ sparse_dnn_input = Flatten()(concat_func(sparse_embedding_list))
+ dense_dnn_input = Flatten()(concat_func(dense_value_list))
+ return concat_func([sparse_dnn_input, dense_dnn_input])
+ elif len(sparse_embedding_list) > 0:
+ return Flatten()(concat_func(sparse_embedding_list))
+ elif len(dense_value_list) > 0:
+ return Flatten()(concat_func(dense_value_list))
+ else:
+ raise NotImplementedError("dnn_feature_columns cannot be an empty list")
diff --git a/modelzoo/FNN/script/models/__init__.py b/modelzoo/FNN/script/models/__init__.py
new file mode 100644
index 00000000000..f1bf243569b
--- /dev/null
+++ b/modelzoo/FNN/script/models/__init__.py
@@ -0,0 +1,4 @@
+from .fnn import FNN
+
+
+__all__ = ["FNN"]
diff --git a/modelzoo/FNN/script/models/fnn.py b/modelzoo/FNN/script/models/fnn.py
new file mode 100644
index 00000000000..50932f1cc5e
--- /dev/null
+++ b/modelzoo/FNN/script/models/fnn.py
@@ -0,0 +1,53 @@
+# -*- coding:utf-8 -*-
+"""
+Author:
+ Weichen Shen, weichenswc@163.com
+
+Reference:
+ [1] Zhang W, Du T, Wang J. Deep learning over multi-field categorical data[C]//European conference on information retrieval. Springer, Cham, 2016: 45-57.(https://arxiv.org/pdf/1601.02376.pdf)
+"""
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.layers import Dense, Embedding
+
+from ..feature_column import build_input_features, get_linear_logit, input_from_feature_columns
+from ..layers.core import PredictionLayer, DNN
+from ..layers.utils import add_func, combined_dnn_input
+
+
+def FNN(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128, 64),
+ l2_reg_embedding=1e-5, l2_reg_linear=1e-5, l2_reg_dnn=0.1, seed=1024, dnn_dropout=0,
+ dnn_activation='relu', task='binary'):
+ """Instantiates the Factorization-supported Neural Network architecture.
+
+ :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
+ :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
+ :param dnn_hidden_units: list of positive integers or an empty list, the layer number and units in each layer of the deep net
+ :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
+ :param l2_reg_linear: float. L2 regularizer strength applied to linear weight
+ :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
+ :param seed: integer, to use as random seed.
+ :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+ :param dnn_activation: Activation function to use in DNN
+ :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+ :return: A Keras model instance.
+ """ + features = build_input_features( + linear_feature_columns + dnn_feature_columns) + + inputs_list = list(features.values()) + + linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', + l2_reg=l2_reg_linear) + + sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, + l2_reg_embedding, seed) + + dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list) + deep_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, False, seed=seed)(dnn_input) + dnn_logit = Dense(1, use_bias=False)(deep_out) + final_logit = add_func([dnn_logit, linear_logit]) + + output = PredictionLayer(task)(final_logit) + + model = Model(inputs=inputs_list, outputs=output) + return model diff --git a/modelzoo/FNN/script/utils.py b/modelzoo/FNN/script/utils.py new file mode 100644 index 00000000000..7fe3b25a518 --- /dev/null +++ b/modelzoo/FNN/script/utils.py @@ -0,0 +1,46 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import json +import logging +from threading import Thread + +import requests + +try: + from packaging.version import parse +except ImportError: + from pip._vendor.packaging.version import parse + + +def check_version(version): + """Return version of package on pypi.python.org using json.""" + + def check(version): + try: + url_pattern = 'https://pypi.python.org/pypi/deepctr/json' + req = requests.get(url_pattern) + latest_version = parse('0') + version = parse(version) + if req.status_code == requests.codes.ok: + j = json.loads(req.text.encode('utf-8')) + releases = j.get('releases', []) + for release in releases: + ver = parse(release) + if ver.is_prerelease or ver.is_postrelease: + continue + latest_version = max(latest_version, ver) + if latest_version > version: + logging.warning( + '\nDeepCTR version {0} detected. 
Your version is {1}.\nUse `pip install -U deepctr` to upgrade. Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format(
+ latest_version, version))
+ except:
+ print("Please check the latest version manually on https://pypi.org/project/deepctr/#history")
+ return
+
+ Thread(target=check, args=(version,)).start()
diff --git a/modelzoo/FNN/train.py b/modelzoo/FNN/train.py
new file mode 100644
index 00000000000..92d94bced4e
--- /dev/null
+++ b/modelzoo/FNN/train.py
@@ -0,0 +1,139 @@
+import os
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import pickle as pkl
+import math
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.optimizers import Adam
+from sklearn.metrics import log_loss, roc_auc_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler, MultiLabelBinarizer
+from script.models.fnn import FNN
+from script.feature_column import SparseFeat, DenseFeat, get_feature_names, VarLenSparseFeat
+import gc
+
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+def split(x):
+ key_ans = x.split(',')
+ for key in key_ans:
+ if key not in key2index:
+ key2index[key] = len(key2index) + 1
+ return list(map(lambda x: key2index[x], key_ans))
+
+if __name__=="__main__":
+ path = 'data/'
+ datalist = ['1458','2259','2261','2997','3386','all']
+
+ for file in datalist:
+
+ data = pd.read_csv(path+file+'/train.log.txt',encoding="utf-8",
+ header=0,sep="\t",low_memory=False)
+
+ test_data = pd.read_csv(path+file+'/test.log.txt',encoding="utf-8",
+ header=0,sep="\t",low_memory=False)
+
+
+ data = data[['click','weekday','hour','useragent','IP','region', 'city', 'adexchange', 'domain', 'slotid','slotwidth',
+ 'slotheight', 'slotvisibility', 'slotformat', 'creative', 'advertiser', 'slotprice']]
+
+ test_data = test_data[['click','weekday','hour','useragent','IP','region', 'city', 'adexchange', 'domain', 'slotid','slotwidth',
+ 'slotheight', 'slotvisibility', 'slotformat', 'creative', 'advertiser', 'slotprice']]
+
+ data['istest']=0
+ test_data['istest']=1
+ df = pd.concat([data, test_data], axis=0, ignore_index=True)
+ del data, test_data
+ gc.collect()
+
+
+ df.dropna(subset=['click'],inplace=True)
+
+ df['adexchange'].fillna(0,inplace=True)
+ df['adexchange']=df['adexchange'].astype(int)
+
+
+ df.fillna('unknown', inplace=True)
+
+
+ dense_features = ['weekday', 'hour','region','city','adexchange','slotwidth','slotheight',
+ 'advertiser', 'slotprice' ]
+
+
+ sparse_features=[]
+
+ target='click'
+ for col in df.columns:
+ if col not in dense_features and col not in ['istest','click']:
+ lbe = LabelEncoder()
+ df[col] = lbe.fit_transform(df[col])
+ sparse_features.append(col)
+
+ mms = MinMaxScaler(feature_range=(0, 1))
+
+ df[dense_features] = mms.fit_transform(df[dense_features])
+
+
+ fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].max() + 1, embedding_dim=11,embeddings_initializer=None)
+ for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, )
+ for feat in dense_features]
+
+ linear_feature_columns = fixlen_feature_columns
+ dnn_feature_columns = fixlen_feature_columns
+
+
+ feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
+
+ # 3. generate train & test input data for the model
+ cols = [f for f in df.columns if f not in ['click', 'istest']]
+ train = df[df.istest==0][cols]
+ test = df[df.istest==1][cols]
+
+ train_model_input = {name: train[name] for name in feature_names}
+ test_model_input = {name: test[name] for name in feature_names}
+
+ gpu_options = tf.GPUOptions(allow_growth=True)
+
+
+ model = FNN(linear_feature_columns, dnn_feature_columns,task='binary',dnn_hidden_units=(128, 64, 32))
+
+ adam = Adam(learning_rate=0.001,amsgrad=False)
+
+ model.compile(adam, "binary_crossentropy",
+ metrics=['binary_crossentropy','AUC'])
+
+ with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
+
+
+ sess.run(tf.tables_initializer())
+ history = model.fit(train_model_input, df[df.istest==0][target].values,
+ batch_size=128, epochs=50, verbose=2, validation_split=0.2)
+
+ pred_ans = model.predict(test_model_input, batch_size=128)
+
+ test_auc = roc_auc_score(df[df.istest==1][target].values,pred_ans)
+ print('test_auc=',test_auc)
+
+
+ with open('result/result.txt','a+') as tx:
+ print(file+" test LogLoss", round(log_loss(df[df.istest==1][target].values, pred_ans), 4),file=tx)
+ print(file+" test AUC", round(roc_auc_score(df[df.istest==1][target].values, pred_ans), 4),file=tx)
+ print('='*50,file=tx)
+

From 4e4b4001a7e20d22addb923cc971b82d841b95aa Mon Sep 17 00:00:00 2001
From: lihangtian <936971274@qq.com>
Date: Wed, 12 Oct 2022 15:48:09 +0800
Subject: [PATCH 5/8] [ModelZoo] Support Co_Action Network

---
 modelzoo/CAN/README.md | 15 +-
 modelzoo/CAN/data/prepare_data.sh | 1 +
 modelzoo/CAN/data/script/data_iterator.py | 132 +++++++++-
 modelzoo/CAN/data/script/generate_voc.py.bk | 65 -----
 modelzoo/CAN/data/script/local_aggretor.py | 17 +-
 modelzoo/CAN/result/README.md | 2 +
 modelzoo/CAN/train.py | 264 ++++++--------------
 7 files changed, 226 insertions(+), 270 deletions(-)
 delete mode 100644 modelzoo/CAN/data/script/generate_voc.py.bk
 create mode 100644 modelzoo/CAN/result/README.md

diff --git a/modelzoo/CAN/README.md b/modelzoo/CAN/README.md
index c26f3f8eace..c4350c622ea 100644
--- a/modelzoo/CAN/README.md
+++ b/modelzoo/CAN/README.md
@@ -10,10 +10,11 @@ The following is a brief directory structure and description for this example:
 │ └── README.md # Documentation describing how to prepare dataset
 │ └── script # Directory contains scripts to process dataset
 │ ├── data_iterator.py
-│ ├── generate_voc.py
-│ ├── local_aggretor.py
-│ ├── shuffle.py
-│ └── split_by_user.py
+│ ├── generate_voc.py # Create a list of features
+│ ├── local_aggretor.py # Generate sample data
+│ ├── shuffle.py
+│ ├── process_data.py # Parse raw json data
+│ └── split_by_user.py # Divide the dataset
 ├── script # Directory contains scripts to CAN model
 │ ├── Dice.py
 │ ├── model.py
@@ -65,9 +66,7 @@
 ​ 2. train.
 
 ```
-CUDA_VISIBLE_DEVICES=0 python script/train.py train {model}
-
-model: CAN,Cartesion,PNN, etc. (check the train.py)
+python train.py
 ```
 
 ​
@@ -76,7 +75,7 @@
 
 ## Dataset
 
-Amazon, Taobao and Avazu dataset is used as benchmark dataset.
+The Amazon Books dataset is used as the benchmark dataset.
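+
+For reference, the snippet below is a minimal sketch of how the prepared files are consumed during training; the paths and the `batch_size`/`maxlen`/`label_type` values are the defaults assumed by `train.py`:
+
+```python
+# Minimal sketch, assuming prepare_data.sh has produced the files under data/.
+from data.script.data_iterator import DataIterator, prepare_data
+
+train_data = DataIterator("data/local_train_splitByUser",
+ "data/uid_voc.pkl", "data/mid_voc.pkl", "data/cat_voc.pkl",
+ 128, 100, shuffle_each_epoch=False, label_type=1)
+
+for src, tgt in train_data:
+ # pads/clips each history to maxlen and stacks the carte (co-action) features
+ uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, \
+ noclk_mids, noclk_cats, carte = prepare_data(src, tgt, maxlen=100, return_neg=True)
+ break
+```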
### Prepare diff --git a/modelzoo/CAN/data/prepare_data.sh b/modelzoo/CAN/data/prepare_data.sh index 54c9733dd15..4e341477042 100644 --- a/modelzoo/CAN/data/prepare_data.sh +++ b/modelzoo/CAN/data/prepare_data.sh @@ -7,3 +7,4 @@ python script/process_data.py meta_Books.json reviews_Books.json python script/local_aggretor.py python script/split_by_user.py python script/generate_voc.py + diff --git a/modelzoo/CAN/data/script/data_iterator.py b/modelzoo/CAN/data/script/data_iterator.py index 75c53c46919..4f71db10d51 100644 --- a/modelzoo/CAN/data/script/data_iterator.py +++ b/modelzoo/CAN/data/script/data_iterator.py @@ -1,11 +1,14 @@ import numpy +import pandas as pd import json import _pickle as cPickle import random - +import os import gzip +import time + -import data.script.shuffle +path = 'data/' def unicode_to_utf8(d): return dict((key.encode("UTF-8"), value) for (key,value) in d.items()) @@ -51,15 +54,19 @@ def __init__(self, source, else: self.source = fopen(source, 'r') self.source_dicts = [] - for source_dict in [uid_voc, mid_voc, cat_voc, './data/item_carte_voc.pkl', './data/cate_carte_voc.pkl']: + for source_dict in [uid_voc, mid_voc, cat_voc, path+'item_carte_voc.pkl',path+ 'cate_carte_voc.pkl']: + self.source_dicts.append(load_dict(source_dict)) - f_meta = open("./data/item-info", "r") + + f_meta = open(path+"item-info", "r") meta_map = {} for line in f_meta: arr = line.strip().split("\t") if arr[0] not in meta_map: meta_map[arr[0]] = arr[1] + + self.meta_id_map ={} for key in meta_map: val = meta_map[key] @@ -73,7 +80,8 @@ def __init__(self, source, cat_idx = 0 self.meta_id_map[mid_idx] = cat_idx - f_review = open("./data/reviews-info", "r") + + f_review = open(path+"reviews-info", "r") self.mid_list_for_random = [] for line in f_review: arr = line.strip().split("\t") @@ -82,6 +90,7 @@ def __init__(self, source, tmp_idx = self.source_dicts[1][arr[1]] self.mid_list_for_random.append(tmp_idx) + self.batch_size = batch_size self.maxlen = maxlen self.minlen = minlen @@ -129,6 +138,8 @@ def __next__(self): break self.source_buffer.append(ss.strip("\n").split("\t")) + + # sort by history behavior length if self.sort_by_length: his_length = numpy.array([len(s[4].split("")) for s in self.source_buffer]) @@ -139,6 +150,7 @@ def __next__(self): else: self.source_buffer.reverse() + if len(self.source_buffer) == 0: self.end_of_data = False self.reset() @@ -168,6 +180,7 @@ def __next__(self): item_carte.append(i_c) mid_list = tmp + tmp1 = [] cate_carte = [] for fea in ss[5].split(""): @@ -177,6 +190,7 @@ def __next__(self): cate_carte.append(c_c) cat_list = tmp1 + # read from source file and map to word index if self.minlen != None: @@ -187,6 +201,9 @@ def __next__(self): noclk_mid_list = [] noclk_cat_list = [] + + #print('end:',self.meta_id_map) + start = time.time() for pos_mid in mid_list: noclk_tmp_mid = [] noclk_tmp_cat = [] @@ -194,8 +211,10 @@ def __next__(self): while True: noclk_mid_indx = random.randint(0, len(self.mid_list_for_random)-1) noclk_mid = self.mid_list_for_random[noclk_mid_indx] - if noclk_mid == pos_mid: - continue + + + # if noclk_mid == pos_mid: + # continue noclk_tmp_mid.append(noclk_mid) noclk_tmp_cat.append(self.meta_id_map[noclk_mid]) noclk_index += 1 @@ -203,8 +222,16 @@ def __next__(self): break noclk_mid_list.append(noclk_tmp_mid) noclk_cat_list.append(noclk_tmp_cat) + + + + carte_list = [item_carte, cate_carte] source.append([uid, mid, cat, mid_list, cat_list, noclk_mid_list, noclk_cat_list, carte_list]) + + + + if self.label_type == 1: 
target.append([float(ss[0])]) else: @@ -212,6 +239,7 @@ def __next__(self): if len(source) >= self.batch_size or len(target) >= self.batch_size: break + except IOError: self.end_of_data = True @@ -219,6 +247,96 @@ def __next__(self): if len(source) == 0 or len(target) == 0: source, target = self.next() + + return source, target + + + + +def prepare_data(input, target, maxlen = None, return_neg = False): + + # x: a list of sentences + + + lengths_x = [len(s[4]) for s in input] + seqs_mid = [inp[3] for inp in input] + seqs_cat = [inp[4] for inp in input] + noclk_seqs_mid = [inp[5] for inp in input] + noclk_seqs_cat = [inp[6] for inp in input] + seqs_item_carte = [inp[7][0] for inp in input] + seqs_cate_carte = [inp[7][1] for inp in input] + + + + if maxlen is not None: + new_seqs_mid = [] + new_seqs_cat = [] + new_noclk_seqs_mid = [] + new_noclk_seqs_cat = [] + new_lengths_x = [] + new_seqs_item_carte = [] + new_seqs_cate_carte = [] + for l_x, inp in zip(lengths_x, input): + if l_x > maxlen: + new_seqs_mid.append(inp[3][l_x - maxlen:]) + new_seqs_cat.append(inp[4][l_x - maxlen:]) + new_noclk_seqs_mid.append(inp[5][l_x - maxlen:]) + new_noclk_seqs_cat.append(inp[6][l_x - maxlen:]) + new_seqs_item_carte.append(inp[7][0][l_x - maxlen:]) + new_seqs_cate_carte.append(inp[7][1][l_x - maxlen:]) + new_lengths_x.append(maxlen) + else: + new_seqs_mid.append(inp[3]) + new_seqs_cat.append(inp[4]) + new_noclk_seqs_mid.append(inp[5]) + new_noclk_seqs_cat.append(inp[6]) + new_seqs_item_carte.append(inp[7][0]) + new_seqs_cate_carte.append(inp[7][1]) + new_lengths_x.append(l_x) + lengths_x = new_lengths_x + seqs_mid = new_seqs_mid + seqs_cat = new_seqs_cat + noclk_seqs_mid = new_noclk_seqs_mid + noclk_seqs_cat = new_noclk_seqs_cat + seqs_item_carte = new_seqs_item_carte + seqs_cate_carte = new_seqs_cate_carte + + if len(lengths_x) < 1: + return None, None, None, None + + n_samples = len(seqs_mid) + maxlen_x = numpy.max(lengths_x) + neg_samples = len(noclk_seqs_mid[0][0]) + + mid_his = numpy.zeros((n_samples, maxlen_x)).astype('int64') + cat_his = numpy.zeros((n_samples, maxlen_x)).astype('int64') + noclk_mid_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64') + noclk_cat_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64') + item_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64') + cate_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64') + mid_mask = numpy.zeros((n_samples, maxlen_x)).astype('float32') + for idx, [s_x, s_y, no_sx, no_sy, i_c, c_c] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat, seqs_item_carte, seqs_cate_carte)): + mid_mask[idx, :lengths_x[idx]] = 1. 
+ mid_his[idx, :lengths_x[idx]] = s_x + cat_his[idx, :lengths_x[idx]] = s_y + noclk_mid_his[idx, :lengths_x[idx], :] = no_sx + noclk_cat_his[idx, :lengths_x[idx], :] = no_sy + item_carte[idx, :lengths_x[idx]] = i_c + cate_carte[idx, :lengths_x[idx]] = c_c + + uids = numpy.array([inp[0] for inp in input]) + mids = numpy.array([inp[1] for inp in input]) + cats = numpy.array([inp[2] for inp in input]) + + carte = numpy.stack([item_carte, cate_carte], axis=1) + + if return_neg: + return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), noclk_mid_his, noclk_cat_his, carte + + else: + return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), carte + + diff --git a/modelzoo/CAN/data/script/generate_voc.py.bk b/modelzoo/CAN/data/script/generate_voc.py.bk deleted file mode 100644 index 411708148aa..00000000000 --- a/modelzoo/CAN/data/script/generate_voc.py.bk +++ /dev/null @@ -1,65 +0,0 @@ -import cPickle - -f_train = open("local_train_splitByUser", "r") -uid_dict = {} -mid_dict = {} -cat_dict = {} - -iddd = 0 -for line in f_train: - arr = line.strip("\n").split("\t") - clk = arr[0] - uid = arr[1] - mid = arr[2] - cat = arr[3] - mid_list = arr[4] - cat_list = arr[5] - if uid not in uid_dict: - uid_dict[uid] = 0 - uid_dict[uid] += 1 - if mid not in mid_dict: - mid_dict[mid] = 0 - mid_dict[mid] += 1 - if cat not in cat_dict: - cat_dict[cat] = 0 - cat_dict[cat] += 1 - if len(mid_list) == 0: - continue - for m in mid_list.split(""): - if m not in mid_dict: - mid_dict[m] = 0 - mid_dict[m] += 1 - #print iddd - iddd+=1 - for c in cat_list.split(""): - if c not in cat_dict: - cat_dict[c] = 0 - cat_dict[c] += 1 - -sorted_uid_dict = sorted(uid_dict.iteritems(), key=lambda x:x[1], reverse=True) -sorted_mid_dict = sorted(mid_dict.iteritems(), key=lambda x:x[1], reverse=True) -sorted_cat_dict = sorted(cat_dict.iteritems(), key=lambda x:x[1], reverse=True) - -uid_voc = {} -index = 0 -for key, value in sorted_uid_dict: - uid_voc[key] = index - index += 1 - -mid_voc = {} -mid_voc["default_mid"] = 0 -index = 1 -for key, value in sorted_mid_dict: - mid_voc[key] = index - index += 1 - -cat_voc = {} -cat_voc["default_cat"] = 0 -index = 1 -for key, value in sorted_cat_dict: - cat_voc[key] = index - index += 1 - -cPickle.dump(uid_voc, open("uid_voc.pkl", "w")) -cPickle.dump(mid_voc, open("mid_voc.pkl", "w")) -cPickle.dump(cat_voc, open("cat_voc.pkl", "w")) diff --git a/modelzoo/CAN/data/script/local_aggretor.py b/modelzoo/CAN/data/script/local_aggretor.py index e652ff3d543..05e43d4937a 100644 --- a/modelzoo/CAN/data/script/local_aggretor.py +++ b/modelzoo/CAN/data/script/local_aggretor.py @@ -2,9 +2,10 @@ import hashlib import random -fin = open("../../DIEN/data/jointed-new-split-info", "r") -ftrain = open("../../DIEN/data/local_train", "w") -ftest = open("../../DIEN/data/local_test", "w") + +fin = open("jointed-new-split-info", "r") +ftrain = open("local_train", "w") +ftest = open("local_test", "w") last_user = "0" common_fea = "" @@ -18,13 +19,14 @@ dt = items[5] cat1 = items[6] - if ds=="20180118": + if ds == "20180118": fo = ftrain else: fo = ftest if user != last_user: movie_id_list = [] cate1_list = [] + #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + "" + "\t" + "" else: history_clk_num = len(movie_id_list) cat_str = "" @@ -35,11 +37,12 @@ mid_str += mid + "" if len(cat_str) > 0: cat_str = cat_str[:-1] if len(mid_str) > 0: mid_str = mid_str[:-1] - if history_clk_num >= 1: # 8 is the average length of user 
behavior - print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + mid_str + "\t" + cat_str, + if history_clk_num >= 1: # 8 is the average length of user behavior + print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 + + "\t" + mid_str + "\t" + cat_str, file=fo) last_user = user if clk: movie_id_list.append(movie_id) - cate1_list.append(cat1) + cate1_list.append(cat1) line_idx += 1 diff --git a/modelzoo/CAN/result/README.md b/modelzoo/CAN/result/README.md new file mode 100644 index 00000000000..ccec44eb9a5 --- /dev/null +++ b/modelzoo/CAN/result/README.md @@ -0,0 +1,2 @@ +# Result +Checkpoint & timeline file are default saved in this folder. diff --git a/modelzoo/CAN/train.py b/modelzoo/CAN/train.py index dd54677aecc..d13ead12e8c 100644 --- a/modelzoo/CAN/train.py +++ b/modelzoo/CAN/train.py @@ -1,95 +1,26 @@ import numpy -from data.script.data_iterator import DataIterator +import pandas as pd +from data.script.data_iterator import DataIterator,prepare_data import tensorflow as tf from script.model import * import time import random import sys from script.utils import * -from tqdm import tqdm +from tqdm import * +import pickle as pkl +import argparse + EMBEDDING_DIM = 18 HIDDEN_SIZE = 18 * 2 ATTENTION_SIZE = 18 * 2 best_auc = 0.0 -def prepare_data(input, target, maxlen = None, return_neg = False): - # x: a list of sentences - lengths_x = [len(s[4]) for s in input] - seqs_mid = [inp[3] for inp in input] - seqs_cat = [inp[4] for inp in input] - noclk_seqs_mid = [inp[5] for inp in input] - noclk_seqs_cat = [inp[6] for inp in input] - seqs_item_carte = [inp[7][0] for inp in input] - seqs_cate_carte = [inp[7][1] for inp in input] - - if maxlen is not None: - new_seqs_mid = [] - new_seqs_cat = [] - new_noclk_seqs_mid = [] - new_noclk_seqs_cat = [] - new_lengths_x = [] - new_seqs_item_carte = [] - new_seqs_cate_carte = [] - for l_x, inp in zip(lengths_x, input): - if l_x > maxlen: - new_seqs_mid.append(inp[3][l_x - maxlen:]) - new_seqs_cat.append(inp[4][l_x - maxlen:]) - new_noclk_seqs_mid.append(inp[5][l_x - maxlen:]) - new_noclk_seqs_cat.append(inp[6][l_x - maxlen:]) - new_seqs_item_carte.append(inp[7][0][l_x - maxlen:]) - new_seqs_cate_carte.append(inp[7][1][l_x - maxlen:]) - new_lengths_x.append(maxlen) - else: - new_seqs_mid.append(inp[3]) - new_seqs_cat.append(inp[4]) - new_noclk_seqs_mid.append(inp[5]) - new_noclk_seqs_cat.append(inp[6]) - new_seqs_item_carte.append(inp[7][0]) - new_seqs_cate_carte.append(inp[7][1]) - new_lengths_x.append(l_x) - lengths_x = new_lengths_x - seqs_mid = new_seqs_mid - seqs_cat = new_seqs_cat - noclk_seqs_mid = new_noclk_seqs_mid - noclk_seqs_cat = new_noclk_seqs_cat - seqs_item_carte = new_seqs_item_carte - seqs_cate_carte = new_seqs_cate_carte - - if len(lengths_x) < 1: - return None, None, None, None - - n_samples = len(seqs_mid) - maxlen_x = numpy.max(lengths_x) - neg_samples = len(noclk_seqs_mid[0][0]) - - mid_his = numpy.zeros((n_samples, maxlen_x)).astype('int64') - cat_his = numpy.zeros((n_samples, maxlen_x)).astype('int64') - noclk_mid_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64') - noclk_cat_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64') - item_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64') - cate_carte = numpy.zeros((n_samples, maxlen_x)).astype('int64') - mid_mask = numpy.zeros((n_samples, maxlen_x)).astype('float32') - for idx, [s_x, s_y, no_sx, no_sy, i_c, c_c] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat, seqs_item_carte, seqs_cate_carte)): - 
mid_mask[idx, :lengths_x[idx]] = 1. - mid_his[idx, :lengths_x[idx]] = s_x - cat_his[idx, :lengths_x[idx]] = s_y - noclk_mid_his[idx, :lengths_x[idx], :] = no_sx - noclk_cat_his[idx, :lengths_x[idx], :] = no_sy - item_carte[idx, :lengths_x[idx]] = i_c - cate_carte[idx, :lengths_x[idx]] = c_c - - uids = numpy.array([inp[0] for inp in input]) - mids = numpy.array([inp[1] for inp in input]) - cats = numpy.array([inp[2] for inp in input]) - - carte = numpy.stack([item_carte, cate_carte], axis=1) - - if return_neg: - return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), noclk_mid_his, noclk_cat_his, carte +file_location = 'data' + + - else: - return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), carte def eval(sess, test_data, model, model_path): @@ -119,80 +50,44 @@ def eval(sess, test_data, model, model_path): #model.save(sess, model_path) return test_auc, loss_sum, accuracy_sum, aux_loss_sum -def train( - train_file = "../DIEN/data/local_train_splitByUser", - test_file = "../DIEN/data/local_test_splitByUser", - uid_voc = "../CAN/data/uid_voc.pkl", - mid_voc = "../CAN/data/mid_voc.pkl", - cat_voc = "../CAN/data/cat_voc.pkl", - batch_size = 128, - maxlen = 100, - test_iter = 8400, - save_iter = 8400, - model_type = 'DNN', + + +def train(train_file = file_location+"/local_train_splitByUser", + test_file =file_location+ "/local_test_splitByUser", + uid_voc =file_location+ "/uid_voc.pkl", + mid_voc = file_location+"/mid_voc.pkl", + cat_voc = file_location+"/cat_voc.pkl", + model_type = 'CAN', seed = 2, ): model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed) best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed) gpu_options = tf.GPUOptions(allow_growth=True) with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: - label_type = 1 - train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False, label_type=label_type) - test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, label_type=label_type) + + train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, args.batch_size, args.maxlen, + shuffle_each_epoch=False, label_type=args.label_type) + + test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, args.batch_size, args.maxlen, + label_type=args.label_type) + n_uid, n_mid, n_cat, n_carte = train_data.get_n() - if model_type == 'DNN': - model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,use_softmax=False) - elif model_type == 'Cartesion': - model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,use_softmax=False, use_cartes=True) - elif model_type == 'CAN+Cartesion': - model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True, use_cartes=True) - elif model_type == 'CAN': - model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True) - elif model_type == 'PNN': - model = Model_PNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'ONN': - model = Model_ONN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'Wide': - model = Model_WideDeep(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) - elif model_type == 'NCF': - model = Model_NCF(n_uid, n_mid, n_cat, 
n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) - elif model_type == 'FM': - model = Model_FM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'FFM': - model = Model_FFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'DeepFM': - model = Model_DeepFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'DeepFFM': - model = Model_DeepFFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'xDeepFM': - model = Model_xDeepFM(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_softmax=False) - elif model_type == 'ONN': - model = Model_ONN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) - elif model_type == 'DIN': - model = Model_DIN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) - elif model_type == 'DIEN': - model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) - elif model_type == 'CAN+DIEN': - model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_coaction=True) - else: - print ("Invalid model_type : %s"% model_type) - return - print("Model: ", model_type) + + model = Model_DNN(n_uid, n_mid, n_cat, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) sys.stdout.flush() count() - iter = 0 lr = 0.001 + loss_sum = 0.0 + accuracy_sum = 0. + aux_loss_sum = 0. + for iter in range(10): + for src, tgt in tqdm(train_data): + uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte = prepare_data(src, tgt, args.maxlen, return_neg=True) - for itr in range(1) : - loss_sum = 0.0 - accuracy_sum = 0. - aux_loss_sum = 0. 
- for src, tgt in train_data:
- uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, carte = prepare_data(src, tgt, maxlen, return_neg=True)
 loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr, noclk_mids, noclk_cats, carte])
 loss_sum += loss
 accuracy_sum += acc
@@ -204,13 +99,13 @@ def train(
 loss_sum = 0.0
 accuracy_sum = 0.0
 aux_loss_sum = 0.0
- if (iter % test_iter) == 0:
+ if (iter % args.test_iter) == 0:
 auc_, loss_, acc_, aux_ = eval(sess, test_data, model, best_model_path)
 print('iter: %d --- test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % (iter, auc_, loss_, acc_, aux_))
 loss_sum = 0.0
 accuracy_sum = 0.0
 aux_loss_sum = 0.0
- if (iter % save_iter) == 0:
+ if (iter % args.save_iter) == 0:
 print('save model iter: %d' %(iter))
 model.save(sess, model_path+"--"+str(iter))
@@ -232,61 +127,64 @@ def count():
 print("Prameter: ", total_parameters)
 
 def test(
- train_file = "../DIEN/data/local_train_splitByUser",
- test_file = "../DIEN/data/local_test_splitByUser",
- uid_voc = "../CAN/data/uid_voc.pkl",
- mid_voc = "../CAN/data/mid_voc.pkl",
- cat_voc = "../CAN/data/cat_voc.pkl",
- batch_size = 128,
- maxlen = 100,
- model_type = 'DNN',
+ train_file = file_location+"/local_train_splitByUser",
+ test_file = file_location+"/local_test_splitByUser",
+ uid_voc = file_location+"/uid_voc.pkl",
+ mid_voc = file_location+"/mid_voc.pkl",
+ cat_voc = file_location+"/cat_voc.pkl",
+ model_type = 'CAN',
 seed = 2
 ):
 model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
 gpu_options = tf.GPUOptions(allow_growth=True)
 with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
- train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
- test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
+ train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, args.batch_size, args.maxlen)
+ test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, args.batch_size, args.maxlen)
 n_uid, n_mid, n_cat = train_data.get_n()
- if model_type == 'DNN':
- model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'PNN':
- model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'Wide':
- model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIN':
- model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIN-V2-gru-att-gru':
- model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIN-V2-gru-gru-att':
- model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIN-V2-gru-qa-attGru':
- model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIN-V2-gru-vec-attGru':
- model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- elif model_type == 'DIEN':
- model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
- else:
- print ("Invalid model_type : %s", model_type)
- return
+
+ model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
+
 model.restore(sess, model_path)
 print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))
+
+
+def get_arg_parser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--batch_size',
+ help='Batch size to train. Default is 128',
+ type=int,
+ default=128)
+ parser.add_argument('--training',
+ help='train or test',
+ type=bool,
+ default=True)
+
+ parser.add_argument('--maxlen',
+ type=int,
+ default=100)
+
+ parser.add_argument('--test_iter',
+ type=int,
+ default=8400)
+
+ parser.add_argument('--save_iter',
+ type=int,
+ default=8400)
+ parser.add_argument('--label_type',
+ type=int,
+ default=1)
+
+ return parser
+
+
+
 if __name__ == '__main__':
- if len(sys.argv) == 4:
- SEED = int(sys.argv[3])
- else:
- SEED = 3
- tf.set_random_seed(SEED)
- numpy.random.seed(SEED)
- random.seed(SEED)
-
- if sys.argv[1] == 'train':
- train(model_type=sys.argv[2], seed=SEED)
- elif sys.argv[1] == 'test':
- test(model_type=sys.argv[2], seed=SEED)
+ parser = get_arg_parser()
+ args = parser.parse_args()
+ if args.training:
+ train()
 else:
- print('do nothing...')
+ test()

From b6134722d705e3e8440f5c4592e4ef78a1c2626e Mon Sep 17 00:00:00 2001
From: lihangtian <936971274@qq.com>
Date: Wed, 12 Oct 2022 15:52:17 +0800
Subject: [PATCH 6/8] [ModelZoo] Support FNN

---
 modelzoo/FNN/README.md | 28 +-
 modelzoo/FNN/data/prepare_data.sh | 15 ++
 modelzoo/FNN/data/script/data2labelencode.py | 54 ++++
 modelzoo/FNN/data/script/generate_neg.py | 63 +++++
 modelzoo/FNN/data/script/generate_voc.py | 66 +++++
 .../FNN/data/script/history_behavior_list.py | 41 +++
 modelzoo/FNN/data/script/item_map.py | 29 +++
 modelzoo/FNN/data/script/local_aggretor.py | 47 ++++
 modelzoo/FNN/data/script/pick2txt.py | 14 +
 modelzoo/FNN/data/script/process_data.py | 108 ++++++++
 modelzoo/FNN/data/script/split_by_user.py | 18 ++
 modelzoo/FNN/result/README.md | 2 +-
 modelzoo/FNN/script/feature_column.py | 35 ++-
 modelzoo/FNN/script/layers/utils.py | 2 +
 modelzoo/FNN/script/utils.py | 2 +-
 modelzoo/FNN/train.py | 242 +++++++++++-------
 16 files changed, 660 insertions(+), 106 deletions(-)
 create mode 100644 modelzoo/FNN/data/prepare_data.sh
 create mode 100644 modelzoo/FNN/data/script/data2labelencode.py
 create mode 100644 modelzoo/FNN/data/script/generate_neg.py
 create mode 100644 modelzoo/FNN/data/script/generate_voc.py
 create mode 100644 modelzoo/FNN/data/script/history_behavior_list.py
 create mode 100644 modelzoo/FNN/data/script/item_map.py
 create mode 100644 modelzoo/FNN/data/script/local_aggretor.py
 create mode 100644 modelzoo/FNN/data/script/pick2txt.py
 create mode 100644 modelzoo/FNN/data/script/process_data.py
 create mode 100644 modelzoo/FNN/data/script/split_by_user.py

diff --git a/modelzoo/FNN/README.md b/modelzoo/FNN/README.md
index a2f9e721921..39fd79202ec 100644
--- a/modelzoo/FNN/README.md
+++ b/modelzoo/FNN/README.md
@@ -6,7 +6,18 @@ The following is a brief directory structure and description for this example:
 ```
 ├── data # Data set directory
+│ ├── prepare_data.sh # Shell script to download and process dataset
 │ └── README.md # Documentation describing how to prepare dataset
+│ └── script # Directory contains scripts to process dataset
+│ ├── data2labelencode.py # Convert data to csv file
+│ ├── generate_neg.py # Create negative sample
+│ ├── generate_voc.py # Create a list of features
+│ ├── history_behavior_list.py # Count user's history behaviors
+│ ├── item_map.py # Create a map between item id and cate
+│ ├── local_aggretor.py # Generate sample data
+│ ├── pick2txt.py # Convert voc's format
+│ ├── process_data.py # Parse raw json data
+│ └── split_by_user.py # Divide the dataset
 ├── script # model set directory
 │ ├── contrib #Directory contains rnn
 │ ├── estimator #Directory contains estimator to data
@@ -29,7 +40,7 @@ The following is a brief directory structure and description for this example:
 
 ## Model Structure
 
-Implementation of paper "Deep Learning over Multi-field Categorical Data– A Case Study on User Response Prediction".
+Implementation of paper "Deep Learning over Multi-field Categorical Data - A Case Study on User Response Prediction".
 
@@ -67,21 +78,8 @@ Implementation of paper "Deep Learning over Multi-field Categorical Data– A Ca
 
 ## Dataset
 
- iPinYou dataset is used as benchmark dataset.
+ The Amazon Books dataset is used as the benchmark dataset.
 
 ### Prepare
 
 For details of Data download, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data)
-
-### Campaigs
-
-We use campaign 1458 as example here.
-
-```
-make-ipinyou-data/1458$ ls
-featindex.txt test.log.txt test.txt train.log.txt train.txt
-```
-
-- `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user click the ad or not.
-- `featindex.txt`maps the features to their indexes. For example, `8:1.1.174.* 76` means that the 8th column in `train.log.txt` with the string `1.1.174.*` maps to feature index `76`.
-- `train.txt` and `test.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. The format is y:click, and x:features. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073).
diff --git a/modelzoo/FNN/data/prepare_data.sh b/modelzoo/FNN/data/prepare_data.sh
new file mode 100644
index 00000000000..49fdb9a0da1
--- /dev/null
+++ b/modelzoo/FNN/data/prepare_data.sh
@@ -0,0 +1,15 @@
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books.json.gz
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Books.json.gz
+gunzip reviews_Books.json.gz
+gunzip meta_Books.json.gz
+
+python script/process_data.py meta_Books.json reviews_Books.json
+python script/local_aggretor.py
+python script/split_by_user.py
+python script/generate_voc.py
+
+python script/item_map.py
+python script/history_behavior_list.py
+python script/generate_neg.py
+
+python script/data2labelencode.py
\ No newline at end of file
diff --git a/modelzoo/FNN/data/script/data2labelencode.py b/modelzoo/FNN/data/script/data2labelencode.py
new file mode 100644
index 00000000000..04daba5e28a
--- /dev/null
+++ b/modelzoo/FNN/data/script/data2labelencode.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+import pickle
+
+UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY']
+HIS_COLUMNS = ['HISTORY_ITEM', 'HISTORY_CATEGORY']
+SEQ_COLUMNS = HIS_COLUMNS
+LABEL_COLUMN = ['CLICKED']
+TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS + SEQ_COLUMNS
+
+
+
+def inputs_to_labelencode(filename):
+ def encoder_dict(data, category_col):
+ category_dict = data[category_col].value_counts()
+ category_dict = pd.Series(np.arange(0, len(category_dict)), index=category_dict.index).to_dict()
+ data[category_col + '_encode'] = data[category_col].map(category_dict).astype('int32')
+ return data
+
+ uid_file = '../CAN/data/uid_voc.txt'
+ mid_file = '../CAN/data/mid_voc.txt'
+ cat_file = '../CAN/data/cat_voc.txt'
+
+ uid_data = pd.read_csv(uid_file, encoding="utf-8", header=None, names=['UID'])
+ mid_data = pd.read_csv(mid_file, encoding="utf-8", header=None, names=['ITEM'])
+ cat_data = pd.read_csv(cat_file,
encoding="utf-8", header=None, names=['CATEGORY']) + + uid_data = encoder_dict(uid_data, 'UID') + mid_data = encoder_dict(mid_data, 'ITEM') + cat_data = encoder_dict(cat_data, 'CATEGORY') + + dataset = pd.read_csv(filename, encoding="utf-8", + header=None, names=TRAIN_DATA_COLUMNS, sep="\t", low_memory=False) + for key in ['UID','ITEM','CATEGORY']: + if key=='UID': + dataset = pd.merge(dataset, uid_data, on=key, how='inner') + elif key=='ITEM': + dataset = pd.merge(dataset, mid_data, on=key, how='inner') + else: + dataset = pd.merge(dataset, cat_data, on=key, how='inner') + + dataset = dataset.drop(UNSEQ_COLUMNS + SEQ_COLUMNS, axis=1) + + dataset.to_csv(filename + '_to_labelencode.txt',index=0,header=0) + uid_data.to_csv('dataset/uid_labelencode.csv',index=False) + mid_data.to_csv('dataset/mid_labelencode.csv',index=False) + cat_data.to_csv('dataset/cat_labelencode.csv',index=False) + + + +if __name__ == '__main__': + inputs_to_labelencode('../CAN/data/local_train_splitByUser') + inputs_to_labelencode('../CAN/data/local_test_splitByUser') + diff --git a/modelzoo/FNN/data/script/generate_neg.py b/modelzoo/FNN/data/script/generate_neg.py new file mode 100644 index 00000000000..a10ef919e13 --- /dev/null +++ b/modelzoo/FNN/data/script/generate_neg.py @@ -0,0 +1,63 @@ +import random + +NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM = 1 + + +def createNegData(file): + with open(file, 'r') as f_raw: + with open(file + '_neg', 'w') as f_out: + FirstLine = True + for line in f_raw: + linelist = line.strip().split('\t') + uid = linelist[1] + + if uid not in user_history_behavior: + str = '\t' + else: + his_items = linelist[4].split('') + neg_items_str = '' + neg_cates_str = '' + for pos in his_items: + tmp_items_str = '' + tmp_cates_str = '' + tmp_items = [] + tmp_cates = [] + neg_length = 0 + while (True): + index = random.randint( + 0, + len(user_history_behavior[uid][0]) - 1) + if user_history_behavior[uid][0][index] != pos: + tmp_items.append( + user_history_behavior[uid][0][index]) + tmp_cates.append( + user_history_behavior[uid][1][index]) + neg_length += 1 + if neg_length >= NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM: + break + for item in tmp_items: + tmp_items_str += (item + '') + for cate in tmp_cates: + tmp_cates_str += (cate + '') + neg_items_str += (tmp_items_str[:-1] + '') + neg_cates_str += (tmp_cates_str[:-1] + '') + str = neg_items_str[:-1] + '\t' + neg_cates_str[:-1] + if FirstLine: + f_out.write(str) + FirstLine = False + else: + f_out.write('\n' + str) + + +user_history_behavior = {} +with open('user_history_behavior.txt', 'r') as f: + for line in f: + linelist = line.strip().split('\t') + uid = linelist[0] + items = linelist[1].split('') + cates = linelist[2].split('') + user_history_behavior[uid] = [items, cates] + +data_file = ['local_test_splitByUser', 'local_train_splitByUser'] +for file in data_file: + createNegData(file) diff --git a/modelzoo/FNN/data/script/generate_voc.py b/modelzoo/FNN/data/script/generate_voc.py new file mode 100644 index 00000000000..447fe6393b7 --- /dev/null +++ b/modelzoo/FNN/data/script/generate_voc.py @@ -0,0 +1,66 @@ +# import cPickle +import pickle as cPickle + +f_train = open("local_train_splitByUser", "r") +uid_dict = {} +mid_dict = {} +cat_dict = {} + +iddd = 0 +for line in f_train: + arr = line.strip("\n").split("\t") + clk = arr[0] + uid = arr[1] + mid = arr[2] + cat = arr[3] + mid_list = arr[4] + cat_list = arr[5] + if uid not in uid_dict: + uid_dict[uid] = 0 + uid_dict[uid] += 1 + if mid not in mid_dict: + mid_dict[mid] = 0 + mid_dict[mid] += 1 + if cat 
not in cat_dict: + cat_dict[cat] = 0 + cat_dict[cat] += 1 + if len(mid_list) == 0: + continue + for m in mid_list.split(""): + if m not in mid_dict: + mid_dict[m] = 0 + mid_dict[m] += 1 + #print iddd + iddd+=1 + for c in cat_list.split(""): + if c not in cat_dict: + cat_dict[c] = 0 + cat_dict[c] += 1 + +sorted_uid_dict = sorted(uid_dict.items(), key=lambda x:x[1], reverse=True) +sorted_mid_dict = sorted(mid_dict.items(), key=lambda x:x[1], reverse=True) +sorted_cat_dict = sorted(cat_dict.items(), key=lambda x:x[1], reverse=True) + +uid_voc = {} +index = 0 +for key, value in sorted_uid_dict: + uid_voc[key] = index + index += 1 + +mid_voc = {} +mid_voc["default_mid"] = 0 +index = 1 +for key, value in sorted_mid_dict: + mid_voc[key] = index + index += 1 + +cat_voc = {} +cat_voc["default_cat"] = 0 +index = 1 +for key, value in sorted_cat_dict: + cat_voc[key] = index + index += 1 + +cPickle.dump(uid_voc, open("uid_voc.pkl", "wb")) +cPickle.dump(mid_voc, open("mid_voc.pkl", "wb")) +cPickle.dump(cat_voc, open("cat_voc.pkl", "wb")) diff --git a/modelzoo/FNN/data/script/history_behavior_list.py b/modelzoo/FNN/data/script/history_behavior_list.py new file mode 100644 index 00000000000..6adaf398cef --- /dev/null +++ b/modelzoo/FNN/data/script/history_behavior_list.py @@ -0,0 +1,41 @@ +item_to_cate_map = {} +with open('item2catmap.txt', 'r') as f: + for line in f: + linelist = line.strip().split('\t') + item = linelist[0] + cate = linelist[1] + item_to_cate_map[item] = cate + +user_history_behavior = {} +with open('reviews-info', 'r') as f: + for line in f: + linelist = line.strip().split('\t') + uid = linelist[0] + item = linelist[1] + if uid not in user_history_behavior: + user_history_behavior[uid] = [item] + else: + if item not in user_history_behavior[uid]: + user_history_behavior[uid].append(item) + +FirstLine = True +with open('user_history_behavior.txt', 'w') as f: + for uid, items in user_history_behavior.items(): + itemstr = '' + catestr = '' + for i in items: + if i in item_to_cate_map: + c = item_to_cate_map[i] + else: + c = 'Unknown' + if not itemstr: + itemstr += i + catestr += c + else: + itemstr += ('' + i) + catestr += ('' + c) + if FirstLine: + f.write(uid + '\t' + itemstr + '\t' + catestr) + FirstLine = False + else: + f.write('\n' + uid + '\t' + itemstr + '\t' + catestr) diff --git a/modelzoo/FNN/data/script/item_map.py b/modelzoo/FNN/data/script/item_map.py new file mode 100644 index 00000000000..94bebee5184 --- /dev/null +++ b/modelzoo/FNN/data/script/item_map.py @@ -0,0 +1,29 @@ +import sys +from tqdm import tqdm + +data_file = ['local_test_splitByUser', 'local_train_splitByUser'] + +item_to_cate_map = {} +# 367983 +for file_name in data_file: + with open(file_name, 'r') as f: + for line in f: + linelist = line.strip().split('\t') + items = linelist[4].split('') + cates = linelist[5].split('') + items.append(linelist[2]) + cates.append(linelist[3]) + # print(items) + # print(cates) + for index, item in enumerate(items): + if item not in item_to_cate_map: + item_to_cate_map[item] = cates[index] + +with open('item2catmap.txt', 'w') as f: + firstline = True + for item, cate in item_to_cate_map.items(): + if firstline: + f.write(item + '\t' + cate) + firstline = False + else: + f.write('\n' + item + '\t' + cate) diff --git a/modelzoo/FNN/data/script/local_aggretor.py b/modelzoo/FNN/data/script/local_aggretor.py new file mode 100644 index 00000000000..1fd8aceb32c --- /dev/null +++ b/modelzoo/FNN/data/script/local_aggretor.py @@ -0,0 +1,47 @@ +import sys +import hashlib +import random 
+ +fin = open("jointed-new-split-info", "r") +ftrain = open("local_train", "w") +ftest = open("local_test", "w") + +last_user = "0" +common_fea = "" +line_idx = 0 +for line in fin: + items = line.strip().split("\t") + ds = items[0] + clk = int(items[1]) + user = items[2] + movie_id = items[3] + dt = items[5] + cat1 = items[6] + + if ds == "20180118": + fo = ftrain + else: + fo = ftest + if user != last_user: + movie_id_list = [] + cate1_list = [] + #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + "" + "\t" + "" + else: + history_clk_num = len(movie_id_list) + cat_str = "" + mid_str = "" + for c1 in cate1_list: + cat_str += c1 + "" + for mid in movie_id_list: + mid_str += mid + "" + if len(cat_str) > 0: cat_str = cat_str[:-1] + if len(mid_str) > 0: mid_str = mid_str[:-1] + if history_clk_num >= 1: # 8 is the average length of user behavior + print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 + + "\t" + mid_str + "\t" + cat_str, + file=fo) + last_user = user + if clk: + movie_id_list.append(movie_id) + cate1_list.append(cat1) + line_idx += 1 diff --git a/modelzoo/FNN/data/script/pick2txt.py b/modelzoo/FNN/data/script/pick2txt.py new file mode 100644 index 00000000000..b7c129ffbe0 --- /dev/null +++ b/modelzoo/FNN/data/script/pick2txt.py @@ -0,0 +1,14 @@ +import pickle + +def pkl2txt(filename): + pklfile = pickle.load(open(filename+'.pkl', 'rb')) + with open(filename+'.txt','w') as f: + f.write('\n'.join(pklfile)) + + + + +if __name__ == '__main__': + pkl2txt('uid_voc') + pkl2txt('mid_voc') + pkl2txt('cat_voc') \ No newline at end of file diff --git a/modelzoo/FNN/data/script/process_data.py b/modelzoo/FNN/data/script/process_data.py new file mode 100644 index 00000000000..0bff64f30bd --- /dev/null +++ b/modelzoo/FNN/data/script/process_data.py @@ -0,0 +1,108 @@ +import sys +import random +import time + + +def process_meta(file): + fi = open(file, "r") + fo = open("item-info", "w") + for line in fi: + obj = eval(line) + cat = obj["categories"][0][-1] + print(obj["asin"] + "\t" + cat, file=fo) + + +def process_reviews(file): + fi = open(file, "r") + user_map = {} + fo = open("reviews-info", "w") + for line in fi: + obj = eval(line) + userID = obj["reviewerID"] + itemID = obj["asin"] + rating = obj["overall"] + time = obj["unixReviewTime"] + print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(time), + file=fo) + + +def manual_join(): + f_rev = open("reviews-info", "r") + user_map = {} + item_list = [] + for line in f_rev: + line = line.strip() + items = line.split("\t") + #loctime = time.localtime(float(items[-1])) + #items[-1] = time.strftime('%Y-%m-%d', loctime) + if items[0] not in user_map: + user_map[items[0]] = [] + user_map[items[0]].append(("\t".join(items), float(items[-1]))) + item_list.append(items[1]) + f_meta = open("item-info", "r") + meta_map = {} + for line in f_meta: + arr = line.strip().split("\t") + if arr[0] not in meta_map: + meta_map[arr[0]] = arr[1] + arr = line.strip().split("\t") + fo = open("jointed-new", "w") + for key in user_map: + sorted_user_bh = sorted(user_map[key], key=lambda x: x[1]) + for line, t in sorted_user_bh: + items = line.split("\t") + asin = items[1] + j = 0 + while True: + asin_neg_index = random.randint(0, len(item_list) - 1) + asin_neg = item_list[asin_neg_index] + if asin_neg == asin: + continue + items[1] = asin_neg + print("0" + "\t" + "\t".join(items) + "\t" + + meta_map[asin_neg], + file=fo) + j += 1 + if j == 1: #negative sampling frequency + break + if asin in meta_map: + print("1" + "\t" + 
line + "\t" + meta_map[asin], file=fo) + else: + print("1" + "\t" + line + "\t" + "default_cat", file=fo) + + +def split_test(): + fi = open("jointed-new", "r") + fo = open("jointed-new-split-info", "w") + user_count = {} + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user not in user_count: + user_count[user] = 0 + user_count[user] += 1 + fi.seek(0) + i = 0 + last_user = "A26ZDKC53OP6JD" + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user == last_user: + if i < user_count[user] - 2: # 1 + negative samples + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + else: + last_user = user + i = 0 + if i < user_count[user] - 2: + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + i += 1 + + +process_meta(sys.argv[1]) +process_reviews(sys.argv[2]) +manual_join() +split_test() diff --git a/modelzoo/FNN/data/script/split_by_user.py b/modelzoo/FNN/data/script/split_by_user.py new file mode 100644 index 00000000000..cc7988c6601 --- /dev/null +++ b/modelzoo/FNN/data/script/split_by_user.py @@ -0,0 +1,18 @@ +import random + +fi = open("local_test", "r") +ftrain = open("local_train_splitByUser", "w") +ftest = open("local_test_splitByUser", "w") + +while True: + rand_int = random.randint(1, 10) + noclk_line = fi.readline().strip() + clk_line = fi.readline().strip() + if noclk_line == "" or clk_line == "": + break + if rand_int == 2: + print(noclk_line, file=ftest) + print(clk_line, file=ftest) + else: + print(noclk_line, file=ftrain) + print(clk_line, file=ftrain) diff --git a/modelzoo/FNN/result/README.md b/modelzoo/FNN/result/README.md index 6f962fb1716..ccec44eb9a5 100644 --- a/modelzoo/FNN/result/README.md +++ b/modelzoo/FNN/result/README.md @@ -1,2 +1,2 @@ # Result -Evaluation Metrics file are default saved in this folder. +Checkpoint & timeline file are default saved in this folder. 
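Note that split_by_user.py relies on local_aggretor.py emitting samples in adjacent pairs: a negative line (label 0) immediately followed by the positive line (label 1) for the same user and history, and it routes roughly one pair in ten to the test split. A minimal sketch of reading the resulting files back, assuming the tab-separated layout and the "\x02" history separator used by these scripts (the function name is illustrative):

```
def read_samples(path):
    # Each line: label \t uid \t item \t category \t item_history \t cate_history
    with open(path) as f:
        for line in f:
            label, uid, item, cate, mids, cats = line.rstrip("\n").split("\t")
            yield int(label), uid, item, cate, mids.split("\x02"), cats.split("\x02")
```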
diff --git a/modelzoo/FNN/script/feature_column.py b/modelzoo/FNN/script/feature_column.py index 0569e32d3c3..69650e9e9ac 100644 --- a/modelzoo/FNN/script/feature_column.py +++ b/modelzoo/FNN/script/feature_column.py @@ -14,6 +14,33 @@ import pandas as pd import numpy as np +fi = open('../../deep_ctr_master/data/fm.model.txt','r') + +first = True +feat_weights={} +k=0 +for line in fi: + s = line.strip().split() + if first: + first = False + w_0 = float(s[0]) + feat_num = int(s[1]) + k = int(s[2]) + 1 # w and v + + else: + feat = int(s[0]) + weights = [float(s[1 + i]) for i in range(k)] + feat_weights[feat] = weights + +list1 =[] +for col,val in feat_weights.items(): + list1.append(val) + +# def my_init(shape,dtype=None): +# weight = np.array(list1) +# +# return weight.reshape(shape) + DEFAULT_GROUP_NAME = "default_group" @@ -31,7 +58,9 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabul if embedding_dim == "auto": embedding_dim = 6 * int(pow(vocabulary_size, 0.25)) if embeddings_initializer is None: - embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020) + embeddings_initializer = RandomNormal(mean=0.0, stddev=0.001, seed=2020) + # if embeddings_initializer=='fm': + # embeddings_initializer = my_init(shape=(vocabulary_size,embedding_dim)) @@ -161,11 +190,11 @@ def get_linear_logit(features, feature_columns, units=1, use_bias=False, seed=10 for i in range(len(linear_feature_columns)): if isinstance(linear_feature_columns[i], SparseFeat): linear_feature_columns[i] = linear_feature_columns[i]._replace(embedding_dim=1, - embeddings_initializer=Zeros()) + embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01, seed=2020)) if isinstance(linear_feature_columns[i], VarLenSparseFeat): linear_feature_columns[i] = linear_feature_columns[i]._replace( sparsefeat=linear_feature_columns[i].sparsefeat._replace(embedding_dim=1, - embeddings_initializer=Zeros())) + embeddings_initializer=RandomNormal(mean=0.0, stddev=0.01, seed=2020))) linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix + str(i))[0] for i in range(units)] diff --git a/modelzoo/FNN/script/layers/utils.py b/modelzoo/FNN/script/layers/utils.py index 2be8f3fe5ef..7808e376dbd 100644 --- a/modelzoo/FNN/script/layers/utils.py +++ b/modelzoo/FNN/script/layers/utils.py @@ -6,6 +6,7 @@ """ import tensorflow as tf +import numpy as np from tensorflow.python.keras.layers import Flatten, Concatenate, Layer, Add from tensorflow.python.ops.lookup_ops import TextFileInitializer @@ -188,6 +189,7 @@ def get_config(self, ): def concat_func(inputs, axis=-1, mask=False): if not mask: inputs = list(map(NoMask(), inputs)) + if len(inputs) == 1: return inputs[0] else: diff --git a/modelzoo/FNN/script/utils.py b/modelzoo/FNN/script/utils.py index 7fe3b25a518..6425e58df6c 100644 --- a/modelzoo/FNN/script/utils.py +++ b/modelzoo/FNN/script/utils.py @@ -37,7 +37,7 @@ def check(version): latest_version = max(latest_version, ver) if latest_version > version: logging.warning( - '\nDeepCTR version {0} detected. Your version is {1}.\nUse `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format( + '\nDeepCTR version {0} detected. 
Your version is {1}.\nUse `pip install -U script` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format( latest_version, version)) except: print("Please check the latest version manually on https://pypi.org/project/deepctr/#history") diff --git a/modelzoo/FNN/train.py b/modelzoo/FNN/train.py index 92d94bced4e..aca9d9037e2 100644 --- a/modelzoo/FNN/train.py +++ b/modelzoo/FNN/train.py @@ -1,4 +1,6 @@ import os +import sys +import argparse import pandas as pd import numpy as np import tensorflow as tf @@ -6,126 +8,194 @@ import math from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.optimizers import Adam -from sklearn.metrics import log_loss, roc_auc_score -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder, MinMaxScaler,MultiLabelBinarizer from script.models.fnn import FNN from script.feature_column import SparseFeat, DenseFeat, get_feature_names,VarLenSparseFeat -import gc +import collections os.environ['CUDA_VISIBLE_DEVICES'] = '0' -def split(x): - key_ans = x.split(',') - for key in key_ans: - if key not in key2index: - key2index[key] = len(key2index) + 1 - return list(map(lambda x: key2index[x], key_ans)) +UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY'] +LABEL_COLUMN = ['CLICKED'] +TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS -if __name__=="__main__": - path = 'data/' - datalist = ['1458','2259','2261','2997','3386','all'] - - for file in datalist: - - data = pd.read_csv(path+file+'/train.log.txt',encoding="utf-8", - header=0,sep="\t",low_memory=False) - - test_data = pd.read_csv(path+file+'/test.log.txt',encoding="utf-8", - header=0,sep="\t",low_memory=False) - - - data = data[['click','weekday','hour','useragent','IP','region', 'city', 'adexchange', 'domain', 'slotid','slotwidth', - 'slotheight', 'slotvisibility', 'slotformat', 'creative', 'advertiser', 'slotprice']] - - test_data = test_data[['click','weekday','hour','useragent','IP','region', 'city', 'adexchange', 'domain', 'slotid','slotwidth', - 'slotheight', 'slotvisibility', 'slotformat', 'creative', 'advertiser', 'slotprice']] - - data['istest']=0 - test_data['istest']=1 - df = pd.concat([data, test_data], axis=0, ignore_index=True) - del data, test_data - gc.collect() - - - df.dropna(subset=['click'],inplace=True) +EMBEDDING_DIM=8 - df['adexchange'].fillna(0,inplace=True) - df['adexchange']=df['adexchange'].astype(int) +def build_model_input(filename=None,chunkSize=1e6,loop=True): + chunks=[] + data = pd.read_csv(filename, encoding="utf-8", header=None, names=TRAIN_DATA_COLUMNS, iterator=True) + while loop: + try: + chunk = data.get_chunk(chunkSize) + chunks.append(chunk) + except StopIteration: + loop=False + dataset = pd.concat(chunks) + return dataset - df.fillna('unknown', inplace=True) - dense_features = ['weekday', 'hour','region','city','adexchange','slotwidth','slotheight', - 'advertiser', 'slotprice' ] +def build_feature_columns(data_location=None): + if data_location: + uid_file = os.path.join(data_location, 'uid_labelencode.csv') + mid_file = os.path.join(data_location, 'mid_labelencode.csv') + cat_file = os.path.join(data_location, 'cat_labelencode.csv') + if (not os.path.exists(uid_file)) or (not os.path.exists(mid_file)) or ( + not os.path.exists(cat_file)): + print("uid_labelencode.csv, mid_labelencode.csv or cat_labelencode.csv does not exist in data file.") + sys.exit() + uid_data = pd.read_csv(uid_file,encoding="utf-8") + mid_data = pd.read_csv(mid_file,encoding="utf-8") + cat_data = 
pd.read_csv(cat_file,encoding="utf-8")
-    sparse_features=[]
-    target='click'
-    for col in df.columns:
-        if col not in dense_features and col not in ['istest','click']:
-            lbe = LabelEncoder()
-            df[col] = lbe.fit_transform(df[col])
-            df[col]=lbe.fit_transform(df[col])
-            sparse_features.append(col)
+
+        feature_column=[SparseFeat('UID', vocabulary_size=uid_data['UID'+'_encode'].max() + 1, embedding_dim=EMBEDDING_DIM,embeddings_initializer=None),
+                        SparseFeat('ITEM',vocabulary_size=mid_data['ITEM'+'_encode'].max()+1,embedding_dim=EMBEDDING_DIM,embeddings_initializer=None),
+                        SparseFeat('CATEGORY',vocabulary_size=cat_data['CATEGORY'+'_encode'].max()+1,embedding_dim=EMBEDDING_DIM,embeddings_initializer=None)]
-    mms = MinMaxScaler(feature_range=(0, 1))
+    else:
+        print("data_location is not specified.")
+        sys.exit()
-    df[dense_features] = mms.fit_transform(df[dense_features])
+    return feature_column
-    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].max() + 1, embedding_dim=11,embeddings_initializer=None)
-                              for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, )
-                              for feat in dense_features]
-    linear_feature_columns = fixlen_feature_columns
-    dnn_feature_columns = fixlen_feature_columns
+
+
+def main(train_data=None,test_data=None,feature_colums=None):
+    feature_names = get_feature_names(feature_colums)
+    model = FNN(feature_colums, feature_colums, dnn_hidden_units=args.dnn_hidden_units,l2_reg_embedding=args.l2_reg_embedding,
+                l2_reg_linear=args.l2_reg_linear,l2_reg_dnn=args.l2_reg_dnn,seed=args.seed,dnn_dropout=args.dnn_dropout,
+                dnn_activation=args.dnn_activation,task=args.task)
+    if args.optimizer=='adam':
+        optimizer = Adam(learning_rate=args.learning_rate, amsgrad=False)
+    else:
+        optimizer = args.optimizer  # fall back to Keras' optimizer-by-name lookup
+    model.compile(optimizer, loss=args.loss,
+                  metrics=args.metrics)
+    saver = tf.train.Saver()
+    gpu_options = tf.GPUOptions(allow_growth=True)
+    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
+        if args.training:
+            train_inputs = {name: train_data[name].values for name in feature_names}
+            sess.run(tf.tables_initializer())
+            history = model.fit(train_inputs, train_data[LABEL_COLUMN].values,
+                                batch_size=args.batch_size, epochs=args.epochs,
+                                verbose=args.verbose,validation_split=args.validation_split)
+            saver.save(sess,args.save_path,global_step=args.save_step)
+
+        else:
+            #new_saver = tf.train.import_meta_graph(save_path+'model.ckpt.meta')
+
+            saver.restore(sess, tf.train.latest_checkpoint(args.save_path))
+            test_inputs = {name:test_data[name].values for name in feature_names}
+            pred_ans = model.predict(test_inputs, batch_size=args.batch_size)
+
+
+# Build the command-line argument parser
+def get_arg_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--learning_rate',
+                        help='Learning rate for the model.',
+                        type=float,
+                        default=0.001)
+    parser.add_argument('--save_path',
+                        help='Full path to the model output directory.',
+                        required=False,
+                        default='results/')
+    parser.add_argument('--batch_size',
+                        help='Batch size to train. Default is 512.',
+                        type=int,
+                        default=512)
+    parser.add_argument('--training',
+                        help='Train (True) or evaluate (False).',
+                        # note: argparse's type=bool treats any non-empty string as True
+                        type=bool,
+                        default=True)
+    parser.add_argument('--epochs',
+                        help='Number of epochs to train. Default is 1.',
+                        type=int,
+                        default=1)
+    parser.add_argument('--save_step',
+                        help='Number of steps between checkpoint saves.',
+                        type=int,
+                        default=1)
+    parser.add_argument('--verbose',
+                        help='Verbosity of model.fit: 0 = silent, 1 = progress bar, 2 = one line per epoch.',
+                        choices=[0,1,2],
+                        default=2)
+    parser.add_argument('--validation_split',
+                        help='Fraction of the training data to use as validation data.',
+                        type=float,
+                        default=0.2)
+    parser.add_argument('--optimizer',
+                        type=str,
+                        default='adam')
+    parser.add_argument('--dnn_hidden_units',
+                        type=tuple,
+                        help='Layer sizes of the DNN part of the model.',
+                        default=(256, 128, 64))
+    parser.add_argument('--l2_reg_embedding',
+                        help='L2 regularizer strength applied to embedding vector.',
+                        type=float,
+                        default=0.00001)
+    parser.add_argument('--l2_reg_linear',
+                        help='L2 regularizer strength applied to linear weight.',
+                        type=float,
+                        default=0.00001)
+    parser.add_argument('--l2_reg_dnn',
+                        help='L2 regularizer strength applied to DNN.',
+                        type=float,
+                        default=0)
+    parser.add_argument('--seed',
+                        help='Integer to use as random seed.',
+                        type=int,
+                        default=1024)
+    parser.add_argument('--dnn_dropout',
+                        help='Probability of dropping a given DNN coordinate, float in [0, 1).',
+                        type=float,
+                        default=0)
+    parser.add_argument('--dnn_activation',
+                        help='Activation function to use in DNN.',
+                        type=str,
+                        default='relu')
+    parser.add_argument('--task',
+                        help='``"binary"`` for binary logloss or ``"regression"`` for regression loss.',
+                        type=str,
+                        choices=['binary', 'regression'],
+                        default='binary')
+    parser.add_argument('--loss',
+                        type=str,
+                        default='binary_crossentropy')
+    parser.add_argument('--metrics',
+                        type=list,
+                        default=['binary_crossentropy', 'AUC'])
+
+
+    return parser
+if __name__=="__main__":
+    path = 'dataset'
+    train_path = path+'/local_train_splitByUser_to_labelencode.txt'
+    test_path = path+'/local_test_splitByUser_to_labelencode.txt'
+    feature_colums = build_feature_columns(path)
-    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
-
-    # 3.generate train&test input data for model
-    cols = [f for f in df.columns if f not in ['click', 'istest']]
-    train = df[df.istest==0][cols]
-    test = df[df.istest==1][cols]
-
-    train_model_input = {name: train[name] for name in feature_names}
-    test_model_input = {name: test[name] for name in feature_names}
-
-    gpu_options = tf.GPUOptions(allow_growth=True)
-
-
-    model = FNN(linear_feature_columns, dnn_feature_columns,task='binary',dnn_hidden_units=(128, 64, 32))
-
-    adam = Adam(learning_rate=0.001,amsgrad=False)
+    train_data = build_model_input(train_path)
+    test_data = build_model_input(test_path)
-    model.compile(adam, "binary_crossentropy",
-                  metrics=['binary_crossentropy','AUC'])
+    feature_names = get_feature_names(feature_colums)
-    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
+    parser = get_arg_parser()
+    args = parser.parse_args()
+    main(train_data,test_data,feature_colums)
-
-        sess.run(tf.tables_initializer())
-        history = model.fit(train_model_input, df[df.istest==0][target].values,
-                            batch_size=128, epochs=50, verbose=2, validation_split=0.2)
-        pred_ans = model.predict(test_model_input, batch_size=128)
-        test_auc = roc_auc_score(df[df.istest==1][target].values,pred_ans)
-        print('test_auc=',test_auc)
-        with open('result/result.txt','a+') as tx:
-            print(file+" test LogLoss",
round(log_loss(df[df.istest==1][target].values, pred_ans), 4),file=tx) - print(file+" test AUC", round(roc_auc_score(df[df.istest==1][target].values, pred_ans), 4),file=tx) - print('='*50,file=tx) From 0cc1389bc3e6883b1724236dc6b76814cc5236b0 Mon Sep 17 00:00:00 2001 From: lihangtian <936971274@qq.com> Date: Wed, 12 Oct 2022 15:55:45 +0800 Subject: [PATCH 7/8] [ModelZoo] Support FwFM --- modelzoo/FwFM/README.md | 85 + modelzoo/FwFM/data/README.md | 4 + modelzoo/FwFM/data/prepare_data.sh | 15 + modelzoo/FwFM/data/script/data2labelencode.py | 54 + modelzoo/FwFM/data/script/generate_neg.py | 63 + modelzoo/FwFM/data/script/generate_voc.py | 66 + .../FwFM/data/script/history_behavior_list.py | 41 + modelzoo/FwFM/data/script/item_map.py | 29 + modelzoo/FwFM/data/script/local_aggretor.py | 47 + modelzoo/FwFM/data/script/pick2txt.py | 14 + modelzoo/FwFM/data/script/process_data.py | 108 ++ modelzoo/FwFM/data/script/split_by_user.py | 18 + modelzoo/FwFM/result/README.md | 2 + modelzoo/FwFM/script/__init__.py | 0 modelzoo/FwFM/script/contrib/__init__.py | 0 modelzoo/FwFM/script/contrib/rnn.py | 1153 +++++++++++++ modelzoo/FwFM/script/contrib/rnn_v2.py | 1452 ++++++++++++++++ modelzoo/FwFM/script/contrib/utils.py | 378 +++++ modelzoo/FwFM/script/estimator/__init__.py | 1 + .../FwFM/script/estimator/feature_column.py | 52 + modelzoo/FwFM/script/estimator/inputs.py | 52 + .../FwFM/script/estimator/models/__init__.py | 13 + modelzoo/FwFM/script/estimator/models/fwfm.py | 84 + modelzoo/FwFM/script/estimator/utils.py | 217 +++ modelzoo/FwFM/script/feature_column.py | 249 +++ modelzoo/FwFM/script/inputs.py | 155 ++ modelzoo/FwFM/script/layers/__init__.py | 52 + modelzoo/FwFM/script/layers/activation.py | 85 + modelzoo/FwFM/script/layers/core.py | 267 +++ modelzoo/FwFM/script/layers/interaction.py | 1492 +++++++++++++++++ modelzoo/FwFM/script/layers/normalization.py | 51 + modelzoo/FwFM/script/layers/sequence.py | 901 ++++++++++ modelzoo/FwFM/script/layers/utils.py | 302 ++++ modelzoo/FwFM/script/models/__init__.py | 3 + modelzoo/FwFM/script/models/fwfm.py | 72 + modelzoo/FwFM/script/utils.py | 46 + modelzoo/FwFM/train.py | 255 +++ 37 files changed, 7878 insertions(+) create mode 100644 modelzoo/FwFM/README.md create mode 100644 modelzoo/FwFM/data/README.md create mode 100644 modelzoo/FwFM/data/prepare_data.sh create mode 100644 modelzoo/FwFM/data/script/data2labelencode.py create mode 100644 modelzoo/FwFM/data/script/generate_neg.py create mode 100644 modelzoo/FwFM/data/script/generate_voc.py create mode 100644 modelzoo/FwFM/data/script/history_behavior_list.py create mode 100644 modelzoo/FwFM/data/script/item_map.py create mode 100644 modelzoo/FwFM/data/script/local_aggretor.py create mode 100644 modelzoo/FwFM/data/script/pick2txt.py create mode 100644 modelzoo/FwFM/data/script/process_data.py create mode 100644 modelzoo/FwFM/data/script/split_by_user.py create mode 100644 modelzoo/FwFM/result/README.md create mode 100644 modelzoo/FwFM/script/__init__.py create mode 100644 modelzoo/FwFM/script/contrib/__init__.py create mode 100644 modelzoo/FwFM/script/contrib/rnn.py create mode 100644 modelzoo/FwFM/script/contrib/rnn_v2.py create mode 100644 modelzoo/FwFM/script/contrib/utils.py create mode 100644 modelzoo/FwFM/script/estimator/__init__.py create mode 100644 modelzoo/FwFM/script/estimator/feature_column.py create mode 100644 modelzoo/FwFM/script/estimator/inputs.py create mode 100644 modelzoo/FwFM/script/estimator/models/__init__.py create mode 100644 modelzoo/FwFM/script/estimator/models/fwfm.py create 
mode 100644 modelzoo/FwFM/script/estimator/utils.py
 create mode 100644 modelzoo/FwFM/script/feature_column.py
 create mode 100644 modelzoo/FwFM/script/inputs.py
 create mode 100644 modelzoo/FwFM/script/layers/__init__.py
 create mode 100644 modelzoo/FwFM/script/layers/activation.py
 create mode 100644 modelzoo/FwFM/script/layers/core.py
 create mode 100644 modelzoo/FwFM/script/layers/interaction.py
 create mode 100644 modelzoo/FwFM/script/layers/normalization.py
 create mode 100644 modelzoo/FwFM/script/layers/sequence.py
 create mode 100644 modelzoo/FwFM/script/layers/utils.py
 create mode 100644 modelzoo/FwFM/script/models/__init__.py
 create mode 100644 modelzoo/FwFM/script/models/fwfm.py
 create mode 100644 modelzoo/FwFM/script/utils.py
 create mode 100644 modelzoo/FwFM/train.py

diff --git a/modelzoo/FwFM/README.md b/modelzoo/FwFM/README.md
new file mode 100644
index 00000000000..6e5ebfa5cb9
--- /dev/null
+++ b/modelzoo/FwFM/README.md
@@ -0,0 +1,85 @@
+# FwFM
+
+The following is a brief directory structure and description for this example:
+
+```
+├── data                              # Dataset directory
+│   ├── prepare_data.sh               # Shell script to download and process the dataset
+│   ├── README.md                     # Documentation describing how to prepare the dataset
+│   └── script                        # Scripts to process the dataset
+│       ├── data2labelencode.py       # Convert data to label-encoded csv files
+│       ├── generate_neg.py           # Create negative samples
+│       ├── generate_voc.py           # Create the feature vocabularies
+│       ├── history_behavior_list.py  # Collect each user's history behaviors
+│       ├── item_map.py               # Create a map between item id and category
+│       ├── local_aggretor.py         # Generate sample data
+│       ├── pick2txt.py               # Convert the vocabularies' format
+│       ├── process_data.py           # Parse the raw json data
+│       └── split_by_user.py          # Divide the dataset
+├── script                            # Model directory
+│   ├── contrib                       # Directory containing the rnn implementation
+│   ├── estimator                     # Directory containing the estimator utilities
+│   ├── layers                        # Directory containing the layers of the model
+│   ├── models                        # Directory containing the FwFM model
+│   ├── feature_column.py             # Feature column definitions
+│   ├── inputs.py                     # Construction of the input layer
+│   └── utils.py
+├── train.py                          # Training script
+└── README.md                         # Documentation
+```
+
+## Content
+
+[TOC]
+
+## Model Structure
+
+An implementation of the paper "Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising".
+
+## Usage
+
+### Stand-alone Training
+
+1. Prepare the dataset and the DeepRec environment.
+
+   1. Manually
+
+      - Follow [dataset preparation](https://github.com/alibaba/DeepRec/tree/main/modelzoo/DIEN#prepare) to prepare the dataset.
+      - Download the code with `git clone https://github.com/alibaba/DeepRec`.
+      - Follow [How to Build](https://github.com/alibaba/DeepRec#how-to-build) to build the DeepRec whl package and install it with `pip install $DEEPREC_WHL`.
+
+   2. Docker (recommended)
+
+      ```
+      docker pull alideeprec/deeprec-release-modelzoo:latest
+      docker run -it alideeprec/deeprec-release-modelzoo:latest /bin/bash
+
+      # In docker container
+      cd /root/modelzoo/FwFM
+      ```
+
+2. Train.
+
+```
+python train.py
+```
+
+## Dataset
+
+The Amazon Books dataset is used as the benchmark dataset.
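+
+After preprocessing, each sample line is tab-separated as `label`, `uid`,
+`item`, `category`, `item_history`, `category_history`, where the two history
+fields are `\x02`-joined lists (the separator convention used by the scripts
+under `data/script/`). Schematically, with placeholder values:
+
+```
+<label>\t<uid>\t<item>\t<category>\t<item1>\x02<item2>\t<cate1>\x02<cate2>
+```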
+ +### Prepare + +For details of Data download, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data) diff --git a/modelzoo/FwFM/data/README.md b/modelzoo/FwFM/data/README.md new file mode 100644 index 00000000000..15a0bc61c8d --- /dev/null +++ b/modelzoo/FwFM/data/README.md @@ -0,0 +1,4 @@ +make-ipinyou-data +================= + +For details of Data download, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data) diff --git a/modelzoo/FwFM/data/prepare_data.sh b/modelzoo/FwFM/data/prepare_data.sh new file mode 100644 index 00000000000..49fdb9a0da1 --- /dev/null +++ b/modelzoo/FwFM/data/prepare_data.sh @@ -0,0 +1,15 @@ +wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books.json.gz +wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Books.json.gz +gunzip reviews_Books.json.gz +gunzip meta_Books.json.gz + +python script/process_data.py meta_Books.json reviews_Books.json +python script/local_aggretor.py +python script/split_by_user.py +python script/generate_voc.py + +python script/item_map.py +python script/history_behavior_list.py +python script/generate_neg.py + +python script/data2labelencode.py \ No newline at end of file diff --git a/modelzoo/FwFM/data/script/data2labelencode.py b/modelzoo/FwFM/data/script/data2labelencode.py new file mode 100644 index 00000000000..04daba5e28a --- /dev/null +++ b/modelzoo/FwFM/data/script/data2labelencode.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +import pickle + +UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY'] +HIS_COLUMNS = ['HISTORY_ITEM', 'HISTORY_CATEGORY'] +SEQ_COLUMNS = HIS_COLUMNS +LABEL_COLUMN = ['CLICKED'] +TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS + SEQ_COLUMNS + + + +def inputs_to_labelencode(filename): + def encoder_dict(data, category_col): + category_dict = data[category_col].value_counts() + category_dict = pd.Series(np.arange(0, len(category_dict)), index=category_dict.index).to_dict() + data[category_col + '_encode'] = data[category_col].map(category_dict).astype('int32') + return data + + uid_file = '../CAN/data/uid_voc.txt' + mid_file = '../CAN/data/mid_voc.txt' + cat_file = '../CAN/data/cat_voc.txt' + + uid_data = pd.read_csv(uid_file, encoding="utf-8", header=None, names=['UID']) + mid_data = pd.read_csv(mid_file, encoding="utf-8", header=None, names=['ITEM']) + cat_data = pd.read_csv(cat_file, encoding="utf-8", header=None, names=['CATEGORY']) + + uid_data = encoder_dict(uid_data, 'UID') + mid_data = encoder_dict(mid_data, 'ITEM') + cat_data = encoder_dict(cat_data, 'CATEGORY') + + dataset = pd.read_csv(filename, encoding="utf-8", + header=None, names=TRAIN_DATA_COLUMNS, sep="\t", low_memory=False) + for key in ['UID','ITEM','CATEGORY']: + if key=='UID': + dataset = pd.merge(dataset, uid_data, on=key, how='inner') + elif key=='ITEM': + dataset = pd.merge(dataset, mid_data, on=key, how='inner') + else: + dataset = pd.merge(dataset, cat_data, on=key, how='inner') + + dataset = dataset.drop(UNSEQ_COLUMNS + SEQ_COLUMNS, axis=1) + + dataset.to_csv(filename + '_to_labelencode.txt',index=0,header=0) + uid_data.to_csv('dataset/uid_labelencode.csv',index=False) + mid_data.to_csv('dataset/mid_labelencode.csv',index=False) + cat_data.to_csv('dataset/cat_labelencode.csv',index=False) + + + +if __name__ == '__main__': + inputs_to_labelencode('../CAN/data/local_train_splitByUser') + inputs_to_labelencode('../CAN/data/local_test_splitByUser') + diff --git a/modelzoo/FwFM/data/script/generate_neg.py 
b/modelzoo/FwFM/data/script/generate_neg.py
new file mode 100644
index 00000000000..a10ef919e13
--- /dev/null
+++ b/modelzoo/FwFM/data/script/generate_neg.py
@@ -0,0 +1,63 @@
+import random
+
+NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM = 1
+
+# History fields are "\x02"-separated; the negatives generated for one history
+# position are joined with "\x03" (an assumed second-level separator mirroring
+# the DIEN pipeline; with NEG_SEQ_LENGTH == 1 it never appears in the output).
+
+
+def createNegData(file):
+    with open(file, 'r') as f_raw:
+        with open(file + '_neg', 'w') as f_out:
+            FirstLine = True
+            for line in f_raw:
+                linelist = line.strip().split('\t')
+                uid = linelist[1]
+
+                if uid not in user_history_behavior:
+                    neg_str = '\t'
+                else:
+                    his_items = linelist[4].split('\x02')
+                    neg_items_str = ''
+                    neg_cates_str = ''
+                    for pos in his_items:
+                        tmp_items_str = ''
+                        tmp_cates_str = ''
+                        tmp_items = []
+                        tmp_cates = []
+                        neg_length = 0
+                        while (True):
+                            index = random.randint(
+                                0,
+                                len(user_history_behavior[uid][0]) - 1)
+                            if user_history_behavior[uid][0][index] != pos:
+                                tmp_items.append(
+                                    user_history_behavior[uid][0][index])
+                                tmp_cates.append(
+                                    user_history_behavior[uid][1][index])
+                                neg_length += 1
+                            if neg_length >= NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM:
+                                break
+                        for item in tmp_items:
+                            tmp_items_str += (item + '\x03')
+                        for cate in tmp_cates:
+                            tmp_cates_str += (cate + '\x03')
+                        neg_items_str += (tmp_items_str[:-1] + '\x02')
+                        neg_cates_str += (tmp_cates_str[:-1] + '\x02')
+                    neg_str = neg_items_str[:-1] + '\t' + neg_cates_str[:-1]
+                if FirstLine:
+                    f_out.write(neg_str)
+                    FirstLine = False
+                else:
+                    f_out.write('\n' + neg_str)
+
+
+user_history_behavior = {}
+with open('user_history_behavior.txt', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        uid = linelist[0]
+        items = linelist[1].split('\x02')
+        cates = linelist[2].split('\x02')
+        user_history_behavior[uid] = [items, cates]
+
+data_file = ['local_test_splitByUser', 'local_train_splitByUser']
+for file in data_file:
+    createNegData(file)
diff --git a/modelzoo/FwFM/data/script/generate_voc.py b/modelzoo/FwFM/data/script/generate_voc.py
new file mode 100644
index 00000000000..447fe6393b7
--- /dev/null
+++ b/modelzoo/FwFM/data/script/generate_voc.py
@@ -0,0 +1,66 @@
+# import cPickle
+import pickle as cPickle
+
+f_train = open("local_train_splitByUser", "r")
+uid_dict = {}
+mid_dict = {}
+cat_dict = {}
+
+iddd = 0
+for line in f_train:
+    arr = line.strip("\n").split("\t")
+    clk = arr[0]
+    uid = arr[1]
+    mid = arr[2]
+    cat = arr[3]
+    mid_list = arr[4]
+    cat_list = arr[5]
+    if uid not in uid_dict:
+        uid_dict[uid] = 0
+    uid_dict[uid] += 1
+    if mid not in mid_dict:
+        mid_dict[mid] = 0
+    mid_dict[mid] += 1
+    if cat not in cat_dict:
+        cat_dict[cat] = 0
+    cat_dict[cat] += 1
+    if len(mid_list) == 0:
+        continue
+    for m in mid_list.split("\x02"):
+        if m not in mid_dict:
+            mid_dict[m] = 0
+        mid_dict[m] += 1
+    #print iddd
+    iddd+=1
+    for c in cat_list.split("\x02"):
+        if c not in cat_dict:
+            cat_dict[c] = 0
+        cat_dict[c] += 1
+
+sorted_uid_dict = sorted(uid_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_mid_dict = sorted(mid_dict.items(), key=lambda x:x[1], reverse=True)
+sorted_cat_dict = sorted(cat_dict.items(), key=lambda x:x[1], reverse=True)
+
+uid_voc = {}
+index = 0
+for key, value in sorted_uid_dict:
+    uid_voc[key] = index
+    index += 1
+
+mid_voc = {}
+mid_voc["default_mid"] = 0
+index = 1
+for key, value in sorted_mid_dict:
+    mid_voc[key] = index
+    index += 1
+
+cat_voc = {}
+cat_voc["default_cat"] = 0
+index = 1
+for key, value in sorted_cat_dict:
+    cat_voc[key] = index
+    index += 1
+
+cPickle.dump(uid_voc, open("uid_voc.pkl", "wb"))
+cPickle.dump(mid_voc, open("mid_voc.pkl", "wb"))
+cPickle.dump(cat_voc, open("cat_voc.pkl", "wb"))
diff --git a/modelzoo/FwFM/data/script/history_behavior_list.py
b/modelzoo/FwFM/data/script/history_behavior_list.py
new file mode 100644
index 00000000000..6adaf398cef
--- /dev/null
+++ b/modelzoo/FwFM/data/script/history_behavior_list.py
@@ -0,0 +1,41 @@
+item_to_cate_map = {}
+with open('item2catmap.txt', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        item = linelist[0]
+        cate = linelist[1]
+        item_to_cate_map[item] = cate
+
+user_history_behavior = {}
+with open('reviews-info', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        uid = linelist[0]
+        item = linelist[1]
+        if uid not in user_history_behavior:
+            user_history_behavior[uid] = [item]
+        else:
+            if item not in user_history_behavior[uid]:
+                user_history_behavior[uid].append(item)
+
+FirstLine = True
+with open('user_history_behavior.txt', 'w') as f:
+    for uid, items in user_history_behavior.items():
+        itemstr = ''
+        catestr = ''
+        for i in items:
+            if i in item_to_cate_map:
+                c = item_to_cate_map[i]
+            else:
+                c = 'Unknown'
+            if not itemstr:
+                itemstr += i
+                catestr += c
+            else:
+                # join multi-valued fields with the "\x02" separator
+                itemstr += ('\x02' + i)
+                catestr += ('\x02' + c)
+        if FirstLine:
+            f.write(uid + '\t' + itemstr + '\t' + catestr)
+            FirstLine = False
+        else:
+            f.write('\n' + uid + '\t' + itemstr + '\t' + catestr)
diff --git a/modelzoo/FwFM/data/script/item_map.py b/modelzoo/FwFM/data/script/item_map.py
new file mode 100644
index 00000000000..94bebee5184
--- /dev/null
+++ b/modelzoo/FwFM/data/script/item_map.py
@@ -0,0 +1,29 @@
+import sys
+from tqdm import tqdm
+
+data_file = ['local_test_splitByUser', 'local_train_splitByUser']
+
+item_to_cate_map = {}
+# 367983
+for file_name in data_file:
+    with open(file_name, 'r') as f:
+        for line in f:
+            linelist = line.strip().split('\t')
+            items = linelist[4].split('\x02')
+            cates = linelist[5].split('\x02')
+            items.append(linelist[2])
+            cates.append(linelist[3])
+            # print(items)
+            # print(cates)
+            for index, item in enumerate(items):
+                if item not in item_to_cate_map:
+                    item_to_cate_map[item] = cates[index]
+
+with open('item2catmap.txt', 'w') as f:
+    firstline = True
+    for item, cate in item_to_cate_map.items():
+        if firstline:
+            f.write(item + '\t' + cate)
+            firstline = False
+        else:
+            f.write('\n' + item + '\t' + cate)
diff --git a/modelzoo/FwFM/data/script/local_aggretor.py b/modelzoo/FwFM/data/script/local_aggretor.py
new file mode 100644
index 00000000000..1fd8aceb32c
--- /dev/null
+++ b/modelzoo/FwFM/data/script/local_aggretor.py
@@ -0,0 +1,47 @@
+import sys
+import hashlib
+import random
+
+fin = open("jointed-new-split-info", "r")
+ftrain = open("local_train", "w")
+ftest = open("local_test", "w")
+
+last_user = "0"
+common_fea = ""
+line_idx = 0
+for line in fin:
+    items = line.strip().split("\t")
+    ds = items[0]
+    clk = int(items[1])
+    user = items[2]
+    movie_id = items[3]
+    dt = items[5]
+    cat1 = items[6]
+
+    if ds == "20180118":
+        fo = ftrain
+    else:
+        fo = ftest
+    if user != last_user:
+        movie_id_list = []
+        cate1_list = []
+        #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +"\t" + "" + "\t" + ""
+    else:
+        history_clk_num = len(movie_id_list)
+        cat_str = ""
+        mid_str = ""
+        for c1 in cate1_list:
+            cat_str += c1 + "\x02"
+        for mid in movie_id_list:
+            mid_str += mid + "\x02"
+        if len(cat_str) > 0: cat_str = cat_str[:-1]
+        if len(mid_str) > 0: mid_str = mid_str[:-1]
+        if history_clk_num >= 1:  # only keep samples with at least one history click
+            print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +
+                  "\t" + mid_str + "\t" + cat_str,
+                  file=fo)
+    last_user = user
+    if clk:
+        movie_id_list.append(movie_id)
+        cate1_list.append(cat1)
+    line_idx += 1
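The pick2txt.py script below writes only the keys of each vocabulary pickle, one per line: iterating a dict yields its keys, so `'\n'.join(pklfile)` drops the index values. That keys-only text file is exactly what data2labelencode.py later reads back with pandas. A minimal sketch of the contract, assuming the uid_voc.pkl produced by generate_voc.py:

```
import pickle
import pandas as pd

uid_voc = pickle.load(open("uid_voc.pkl", "rb"))   # {uid: frequency-rank index}
with open("uid_voc.txt", "w") as f:
    f.write("\n".join(uid_voc))                    # keys only, one per line

# data2labelencode.py re-reads this file and assigns fresh integer codes
uid_data = pd.read_csv("uid_voc.txt", header=None, names=["UID"])
```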
diff --git a/modelzoo/FwFM/data/script/pick2txt.py b/modelzoo/FwFM/data/script/pick2txt.py new file mode 100644 index 00000000000..b7c129ffbe0 --- /dev/null +++ b/modelzoo/FwFM/data/script/pick2txt.py @@ -0,0 +1,14 @@ +import pickle + +def pkl2txt(filename): + pklfile = pickle.load(open(filename+'.pkl', 'rb')) + with open(filename+'.txt','w') as f: + f.write('\n'.join(pklfile)) + + + + +if __name__ == '__main__': + pkl2txt('uid_voc') + pkl2txt('mid_voc') + pkl2txt('cat_voc') \ No newline at end of file diff --git a/modelzoo/FwFM/data/script/process_data.py b/modelzoo/FwFM/data/script/process_data.py new file mode 100644 index 00000000000..0bff64f30bd --- /dev/null +++ b/modelzoo/FwFM/data/script/process_data.py @@ -0,0 +1,108 @@ +import sys +import random +import time + + +def process_meta(file): + fi = open(file, "r") + fo = open("item-info", "w") + for line in fi: + obj = eval(line) + cat = obj["categories"][0][-1] + print(obj["asin"] + "\t" + cat, file=fo) + + +def process_reviews(file): + fi = open(file, "r") + user_map = {} + fo = open("reviews-info", "w") + for line in fi: + obj = eval(line) + userID = obj["reviewerID"] + itemID = obj["asin"] + rating = obj["overall"] + time = obj["unixReviewTime"] + print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(time), + file=fo) + + +def manual_join(): + f_rev = open("reviews-info", "r") + user_map = {} + item_list = [] + for line in f_rev: + line = line.strip() + items = line.split("\t") + #loctime = time.localtime(float(items[-1])) + #items[-1] = time.strftime('%Y-%m-%d', loctime) + if items[0] not in user_map: + user_map[items[0]] = [] + user_map[items[0]].append(("\t".join(items), float(items[-1]))) + item_list.append(items[1]) + f_meta = open("item-info", "r") + meta_map = {} + for line in f_meta: + arr = line.strip().split("\t") + if arr[0] not in meta_map: + meta_map[arr[0]] = arr[1] + arr = line.strip().split("\t") + fo = open("jointed-new", "w") + for key in user_map: + sorted_user_bh = sorted(user_map[key], key=lambda x: x[1]) + for line, t in sorted_user_bh: + items = line.split("\t") + asin = items[1] + j = 0 + while True: + asin_neg_index = random.randint(0, len(item_list) - 1) + asin_neg = item_list[asin_neg_index] + if asin_neg == asin: + continue + items[1] = asin_neg + print("0" + "\t" + "\t".join(items) + "\t" + + meta_map[asin_neg], + file=fo) + j += 1 + if j == 1: #negative sampling frequency + break + if asin in meta_map: + print("1" + "\t" + line + "\t" + meta_map[asin], file=fo) + else: + print("1" + "\t" + line + "\t" + "default_cat", file=fo) + + +def split_test(): + fi = open("jointed-new", "r") + fo = open("jointed-new-split-info", "w") + user_count = {} + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user not in user_count: + user_count[user] = 0 + user_count[user] += 1 + fi.seek(0) + i = 0 + last_user = "A26ZDKC53OP6JD" + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user == last_user: + if i < user_count[user] - 2: # 1 + negative samples + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + else: + last_user = user + i = 0 + if i < user_count[user] - 2: + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + i += 1 + + +process_meta(sys.argv[1]) +process_reviews(sys.argv[2]) +manual_join() +split_test() diff --git a/modelzoo/FwFM/data/script/split_by_user.py b/modelzoo/FwFM/data/script/split_by_user.py new file mode 100644 index 00000000000..cc7988c6601 --- /dev/null +++ 
b/modelzoo/FwFM/data/script/split_by_user.py @@ -0,0 +1,18 @@ +import random + +fi = open("local_test", "r") +ftrain = open("local_train_splitByUser", "w") +ftest = open("local_test_splitByUser", "w") + +while True: + rand_int = random.randint(1, 10) + noclk_line = fi.readline().strip() + clk_line = fi.readline().strip() + if noclk_line == "" or clk_line == "": + break + if rand_int == 2: + print(noclk_line, file=ftest) + print(clk_line, file=ftest) + else: + print(noclk_line, file=ftrain) + print(clk_line, file=ftrain) diff --git a/modelzoo/FwFM/result/README.md b/modelzoo/FwFM/result/README.md new file mode 100644 index 00000000000..ccec44eb9a5 --- /dev/null +++ b/modelzoo/FwFM/result/README.md @@ -0,0 +1,2 @@ +# Result +Checkpoint & timeline file are default saved in this folder. diff --git a/modelzoo/FwFM/script/__init__.py b/modelzoo/FwFM/script/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modelzoo/FwFM/script/contrib/__init__.py b/modelzoo/FwFM/script/contrib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modelzoo/FwFM/script/contrib/rnn.py b/modelzoo/FwFM/script/contrib/rnn.py new file mode 100644 index 00000000000..b3554993063 --- /dev/null +++ b/modelzoo/FwFM/script/contrib/rnn.py @@ -0,0 +1,1153 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +# + +# Licensed under the Apache License, Version 2.0 (the "License"); + +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at + +# + +# http://www.apache.org/licenses/LICENSE-2.0 + +# + +# Unless required by applicable law or agreed to in writing, software + +# distributed under the License is distributed on an "AS IS" BASIS, + +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. +@@bidirectional_dynamic_rnn +@@dynamic_rnn +@@raw_rnn +@@static_rnn +@@static_state_saving_rnn +@@static_bidirectional_rnn +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest +import tensorflow as tf + + +def _like_rnncell_(cell): + """Checks that a given object is an RNNCell by using duck typing.""" + + conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"), + + hasattr(cell, "zero_state"), callable(cell)] + + return all(conditions) + + +# pylint: disable=protected-access + +_concat = rnn_cell_impl._concat +try: + _like_rnncell = rnn_cell_impl._like_rnncell +except Exception as e: + _like_rnncell = _like_rnncell_ + + +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + Retains as much of the static shape information as possible. + Args: + x: A tensor of rank 2 or higher. + Returns: + x transposed along the first two dimensions. 
+ Raises: + ValueError: if `x` is rank 1 or lower. + """ + + x_static_shape = x.get_shape() + + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + + (x, x_static_shape)) + + x_rank = array_ops.rank(x) + + x_t = array_ops.transpose( + + x, array_ops.concat( + + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + + x_t.set_shape( + + tensor_shape.TensorShape([ + + x_static_shape[1].value, x_static_shape[0].value + + ]).concatenate(x_static_shape[2:])) + + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + Args: + flat_input: An iterable of time major input Tensors of shape [max_time, + batch_size, ...]. All inputs should have compatible batch sizes. + Returns: + The batch size in Python integer if available, or a scalar Tensor otherwise. + Raises: + ValueError: if there is any input with an invalid shape. + """ + + for input_ in flat_input: + + shape = input_.shape + + if shape.ndims is None: + continue + + if shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2" % input_) + + batch_size = shape[1].value + + if batch_size is not None: + return batch_size + + # Fallback to the dynamic batch size of the first input. + + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + Args: + explicit_dtype: explicitly declared dtype or None. + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + Tensors. + Returns: + dtype: inferred dtype of hidden state. + Raises: + ValueError: if `state` has heterogeneous dtypes or is empty. + """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + Returns an (output, state) pair conditioned on the sequence_lengths. + When skip_conditionals=False, the pseudocode is something like: + if t >= max_sequence_length: + return (zero_output, state) + if t < min_sequence_length: + return call_cell() + # Selectively output zeros or output, old state or new state depending + # on if we've finished calculating each row. 
+ new_output, new_state = call_cell() + final_output = np.vstack([ + zero_output if time >= sequence_lengths[r] else new_output_r + for r, new_output_r in enumerate(new_output) + ]) + final_state = np.vstack([ + state[r] if time >= sequence_lengths[r] else new_state_r + for r, new_state_r in enumerate(new_state) + ]) + return (final_output, final_state) + Args: + time: Python int, the current time step + sequence_length: int32 `Tensor` vector of size [batch_size] + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + zero_output: `Tensor` vector of shape [output_size] + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + or a list/tuple of such tensors. + call_cell: lambda returning tuple of (new_output, new_state) where + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + state_size: The `cell.state_size` associated with the state. + skip_conditionals: Python bool, whether to skip using the conditional + calculations. This is useful for `dynamic_rnn`, where the input tensor + matches `max_sequence_length`, and using conditionals just slows + everything down. + Returns: + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + final_output is a `Tensor` matrix of shape [batch_size, output_size] + final_state is either a single `Tensor` matrix, or a tuple of such + matrices (matching length and shapes of input `state`). + Raises: + ValueError: If the cell returns a state tuple whose length does not match + that returned by `state_size`. + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. + + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. + + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. 
This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + Args: + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + or nested tuples of tensors. + lengths: A `Tensor` of dimension batch_size, containing lengths for each + sequence in the batch. If "None" is specified, simply reverses + the list. + Returns: + time-reversed sequence + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. 
+# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. +# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. 
+# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. +# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + Performs fully dynamic unrolling of `inputs`. + Example: + ```python + # create a BasicRNNCell + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + # defining initial state + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + # 'state' is a tensor of shape [batch_size, cell_state_size] + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + initial_state=initial_state, + dtype=tf.float32) + ``` + ```python + # create 2 LSTMCells + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + # create a RNN cell composed sequentially of a number of RNNCells + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + # 'state' is a N-tuple where N is the number of LSTMCells containing a + # tf.contrib.rnn.LSTMStateTuple for each cell + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + inputs=data, + dtype=tf.float32) + ``` + Args: + cell: An instance of RNNCell. + inputs: The RNN inputs. + If `time_major == False` (default), this must be a `Tensor` of shape: + `[batch_size, max_time, ...]`, or a nested tuple of such + elements. 
+ If `time_major == True`, this must be a `Tensor` of shape: + `[max_time, batch_size, ...]`, or a nested tuple of such + elements. + This may also be a (possibly nested) tuple of Tensors satisfying + this property. The first two dimensions must match across all the inputs, + but otherwise the ranks and other shape components may differ. + In this case, input to `cell` at each time-step will replicate the + structure of these tuples, except for the time dimension (from which the + time is taken). + The input to `cell` at each time step will be a `Tensor` or (possibly + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + Used to copy-through state and zero-out outputs when past a batch + element's sequence length. So it's more for correctness than performance. + initial_state: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + dtype: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. + parallel_iterations: (Default: 32). The number of iterations to run in + parallel. Those operations which do not have any temporal dependency + and can be run in parallel, will be. This parameter trades off + time for space. Values >> 1 use more memory but take less time, + while smaller values use less memory but computations take longer. + swap_memory: Transparently swap the tensors produced in forward inference + but needed for back prop from GPU to CPU. This allows training RNNs + which would typically not fit on a single GPU, with very minimal (or no) + performance penalty. + time_major: The shape format of the `inputs` and `outputs` Tensors. + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + Using `time_major = True` is a bit more efficient because it avoids + transposes at the beginning and end of the RNN calculation. However, + most TensorFlow data is batch-major, so by default this function + accepts input and emits output in batch-major form. + scope: VariableScope for the created subgraph; defaults to "rnn". + Returns: + A pair (outputs, state) where: + outputs: The RNN output `Tensor`. + If time_major == False (default), this will be a `Tensor` shaped: + `[batch_size, max_time, cell.output_size]`. + If time_major == True, this will be a `Tensor` shaped: + `[max_time, batch_size, cell.output_size]`. + Note, if `cell.output_size` is a (possibly nested) tuple of integers + or `TensorShape` objects, then `outputs` will be a tuple having the + same structure as `cell.output_size`, containing Tensors having shapes + corresponding to the shape data in `cell.output_size`. + state: The final state. If `cell.state_size` is an int, this + will be shaped `[batch_size, cell.state_size]`. If it is a + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + be a tuple having the corresponding shapes. If cells are `LSTMCells` + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + Raises: + TypeError: If `cell` is not an instance of RNNCell. 
+ ValueError: If inputs is None or an empty list. + """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + + # By default, time_major==False and inputs are batch-major: shaped + + # [batch, time, depth] + + # For internal calculations, we transpose to [time, batch, depth] + + flat_input = nest.flatten(inputs) + + if not time_major: + # (B,T,D) => (T,B,D) + + flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] + + flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input) + + parallel_iterations = parallel_iterations or 32 + + if sequence_length is not None: + + sequence_length = math_ops.to_int32(sequence_length) + + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + + "sequence_length must be a vector of length batch_size, " + + "but saw shape: %s" % sequence_length.get_shape()) + + sequence_length = array_ops.identity( # Just to find it in the graph. + + sequence_length, name="sequence_length") + + # Create a new scope in which the caching device is either + + # determined by the parent scope, or is set to place the cached + + # Variable using the same placement as for the rest of the RNN. + + with vs.variable_scope(scope or "rnn",reuse=tf.AUTO_REUSE) as varscope:#TODO:user defined reuse + + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + batch_size = _best_effort_input_batch_size(flat_input) + + if initial_state is not None: + + state = initial_state + + else: + + if not dtype: + raise ValueError("If there is no initial_state, you must give a dtype.") + + state = cell.zero_state(batch_size, dtype) + + def _assert_has_shape(x, shape): + + x_shape = array_ops.shape(x) + + packed_shape = array_ops.stack(shape) + + return control_flow_ops.Assert( + + math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), + + ["Expected shape for Tensor %s is " % x.name, + + packed_shape, " but saw shape: ", x_shape]) + + if sequence_length is not None: + # Perform some shape validation + + with ops.control_dependencies( + + [_assert_has_shape(sequence_length, [batch_size])]): + sequence_length = array_ops.identity( + + sequence_length, name="CheckSeqLen") + + inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + + (outputs, final_state) = _dynamic_rnn_loop( + + cell, + + inputs, + + state, + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory, + + att_scores=att_scores, + + sequence_length=sequence_length, + + dtype=dtype) + + # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. + + # If we are performing batch-major calculations, transpose output back + + # to shape [batch, time, depth] + + if not time_major: + # (T,B,D) => (B,T,D) + + outputs = nest.map_structure(_transpose_batch_time, outputs) + + return (outputs, final_state) + + +def _dynamic_rnn_loop(cell, + + inputs, + + initial_state, + + parallel_iterations, + + swap_memory, + + att_scores=None, + + sequence_length=None, + + dtype=None): + """Internal implementation of Dynamic RNN. + Args: + cell: An instance of RNNCell. + inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested + tuple of such elements. + initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if + `cell.state_size` is a tuple, then this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + parallel_iterations: Positive Python int. 
+    swap_memory: A Python boolean
+    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
+    dtype: (optional) Expected dtype of output. If not specified, inferred from
+      initial_state.
+  Returns:
+    Tuple `(final_outputs, final_state)`.
+    final_outputs:
+      A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
+      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
+      objects, then this returns a (possibly nested) tuple of Tensors matching
+      the corresponding shapes.
+    final_state:
+      A `Tensor`, or possibly nested tuple of Tensors, matching in length
+      and shapes to `initial_state`.
+  Raises:
+    ValueError: If the input depth cannot be inferred via shape inference
+      from the inputs.
+  """
+  state = initial_state
+  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"
+  state_size = cell.state_size
+  flat_input = nest.flatten(inputs)
+  flat_output_size = nest.flatten(cell.output_size)
+
+  # Construct an initial output
+  input_shape = array_ops.shape(flat_input[0])
+  time_steps = input_shape[0]
+  batch_size = _best_effort_input_batch_size(flat_input)
+
+  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
+                           for input_ in flat_input)
+
+  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
+
+  for shape in inputs_got_shape:
+    if not shape[2:].is_fully_defined():
+      raise ValueError(
+          "Input size (depth of inputs) must be accessible via shape inference,"
+          " but saw value None.")
+    got_time_steps = shape[0].value
+    got_batch_size = shape[1].value
+    if const_time_steps != got_time_steps:
+      raise ValueError(
+          "Time steps is not the same for all the elements in the input in a "
+          "batch.")
+    if const_batch_size != got_batch_size:
+      raise ValueError(
+          "Batch_size is not the same for all the elements in the input.")
+
+  # Prepare dynamic conditional copying of state & output
+  def _create_zero_arrays(size):
+    size = _concat(batch_size, size)
+    return array_ops.zeros(
+        array_ops.stack(size), _infer_state_dtype(dtype, state))
+
+  flat_zero_output = tuple(_create_zero_arrays(output)
+                           for output in flat_output_size)
+  zero_output = nest.pack_sequence_as(structure=cell.output_size,
+                                      flat_sequence=flat_zero_output)
+
+  if sequence_length is not None:
+    min_sequence_length = math_ops.reduce_min(sequence_length)
+    max_sequence_length = math_ops.reduce_max(sequence_length)
+
+  time = array_ops.constant(0, dtype=dtypes.int32, name="time")
+
+  with ops.name_scope("dynamic_rnn") as scope:
+    base_name = scope
+
+  def _create_ta(name, dtype):
+    return tensor_array_ops.TensorArray(dtype=dtype,
+                                        size=time_steps,
+                                        tensor_array_name=base_name + name)
+
+  output_ta = tuple(_create_ta("output_%d" % i,
+                               _infer_state_dtype(dtype, state))
+                    for i in range(len(flat_output_size)))
+  input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype)
+                   for i in range(len(flat_input)))
+
+  input_ta = tuple(ta.unstack(input_)
+                   for ta, input_ in zip(input_ta, flat_input))
+
+  def _time_step(time, output_ta_t, state, att_scores=None):
+    """Take a time step of the dynamic RNN.
+    Args:
+      time: int32 scalar Tensor.
+      output_ta_t: List of `TensorArray`s that represent the output.
+      state: nested tuple of vector tensors that represent the state.
+    Returns:
+      The tuple (time + 1, output_ta_t with updated flow, new_state).
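+      When `att_scores` is given, the current step's slice
+      `att_scores[:, time, :]` is passed to the cell as a third positional
+      argument (the `att_score` input of attention-aware cells such as
+      VecAttGRUCell).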
+ """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) \ No newline at end of file diff --git a/modelzoo/FwFM/script/contrib/rnn_v2.py b/modelzoo/FwFM/script/contrib/rnn_v2.py new file mode 100644 index 00000000000..a2bd625cd8b --- /dev/null +++ b/modelzoo/FwFM/script/contrib/rnn_v2.py @@ -0,0 +1,1452 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +# + +# Licensed under the Apache License, Version 2.0 (the "License"); + +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at + +# + +# http://www.apache.org/licenses/LICENSE-2.0 + +# + +# Unless required by applicable law or agreed to in writing, software + +# distributed under the License is distributed on an "AS IS" BASIS, + +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. 
+ + + + + +@@bidirectional_dynamic_rnn + +@@dynamic_rnn + +@@raw_rnn + +@@static_rnn + +@@static_state_saving_rnn + +@@static_bidirectional_rnn + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest +import tensorflow as tf + + +def _like_rnncell_(cell): + """Checks that a given object is an RNNCell by using duck typing.""" + + conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"), + + hasattr(cell, "zero_state"), callable(cell)] + + return all(conditions) + + +# pylint: disable=protected-access + +_concat = rnn_cell_impl._concat +try: + _like_rnncell = rnn_cell_impl._like_rnncell +except: + _like_rnncell = _like_rnncell_ + + +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + + + + Retains as much of the static shape information as possible. + + + + Args: + + x: A tensor of rank 2 or higher. + + + + Returns: + + x transposed along the first two dimensions. + + + + Raises: + + ValueError: if `x` is rank 1 or lower. + + """ + + x_static_shape = x.get_shape() + + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + + (x, x_static_shape)) + + x_rank = array_ops.rank(x) + + x_t = array_ops.transpose( + + x, array_ops.concat( + + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + + x_t.set_shape( + + tensor_shape.TensorShape([ + + x_static_shape[1], x_static_shape[0] + + ]).concatenate(x_static_shape[2:])) + + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + + + + Args: + + flat_input: An iterable of time major input Tensors of shape [max_time, + + batch_size, ...]. All inputs should have compatible batch sizes. + + + + Returns: + + The batch size in Python integer if available, or a scalar Tensor otherwise. + + + + Raises: + + ValueError: if there is any input with an invalid shape. + + """ + + for input_ in flat_input: + + shape = input_.shape + + if shape.ndims is None: + continue + + if shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2" % input_) + + batch_size = shape[1] + + if batch_size is not None: + return batch_size + + # Fallback to the dynamic batch size of the first input. + + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + + + + Args: + + explicit_dtype: explicitly declared dtype or None. + + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + + Tensors. + + + + Returns: + + dtype: inferred dtype of hidden state. + + + + Raises: + + ValueError: if `state` has heterogeneous dtypes or is empty. 
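+
+  For example, `_infer_state_dtype(None, state)` with a state tuple whose
+  tensors are all float32 returns `dtypes.float32`.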
+ + """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + + + + Returns an (output, state) pair conditioned on the sequence_lengths. + + When skip_conditionals=False, the pseudocode is something like: + + + + if t >= max_sequence_length: + + return (zero_output, state) + + if t < min_sequence_length: + + return call_cell() + + + + # Selectively output zeros or output, old state or new state depending + + # on if we've finished calculating each row. + + new_output, new_state = call_cell() + + final_output = np.vstack([ + + zero_output if time >= sequence_lengths[r] else new_output_r + + for r, new_output_r in enumerate(new_output) + + ]) + + final_state = np.vstack([ + + state[r] if time >= sequence_lengths[r] else new_state_r + + for r, new_state_r in enumerate(new_state) + + ]) + + return (final_output, final_state) + + + + Args: + + time: Python int, the current time step + + sequence_length: int32 `Tensor` vector of size [batch_size] + + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + + zero_output: `Tensor` vector of shape [output_size] + + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + + or a list/tuple of such tensors. + + call_cell: lambda returning tuple of (new_output, new_state) where + + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + + state_size: The `cell.state_size` associated with the state. + + skip_conditionals: Python bool, whether to skip using the conditional + + calculations. This is useful for `dynamic_rnn`, where the input tensor + + matches `max_sequence_length`, and using conditionals just slows + + everything down. + + + + Returns: + + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + + final_output is a `Tensor` matrix of shape [batch_size, output_size] + + final_state is either a single `Tensor` matrix, or a tuple of such + + matrices (matching length and shapes of input `state`). + + + + Raises: + + ValueError: If the cell returns a state tuple whose length does not match + + that returned by `state_size`. + + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. 
+ + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. + + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + + + + Args: + + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + + or nested tuples of tensors. + + lengths: A `Tensor` of dimension batch_size, containing lengths for each + + sequence in the batch. If "None" is specified, simply reverses + + the list. 
+ + + + Returns: + + time-reversed sequence + + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. +# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. 
+# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. +# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
+# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + + + + Performs fully dynamic unrolling of `inputs`. + + + + Example: + + + + ```python + + # create a BasicRNNCell + + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + + + + # defining initial state + + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + + + + # 'state' is a tensor of shape [batch_size, cell_state_size] + + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + + initial_state=initial_state, + + dtype=tf.float32) + + ``` + + + + ```python + + # create 2 LSTMCells + + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + + + + # create a RNN cell composed sequentially of a number of RNNCells + + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + + # 'state' is a N-tuple where N is the number of LSTMCells containing a + + # tf.contrib.rnn.LSTMStateTuple for each cell + + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + + inputs=data, + + dtype=tf.float32) + + ``` + + + + + + Args: + + cell: An instance of RNNCell. + + inputs: The RNN inputs. + + If `time_major == False` (default), this must be a `Tensor` of shape: + + `[batch_size, max_time, ...]`, or a nested tuple of such + + elements. + + If `time_major == True`, this must be a `Tensor` of shape: + + `[max_time, batch_size, ...]`, or a nested tuple of such + + elements. 
+ + This may also be a (possibly nested) tuple of Tensors satisfying + + this property. The first two dimensions must match across all the inputs, + + but otherwise the ranks and other shape components may differ. + + In this case, input to `cell` at each time-step will replicate the + + structure of these tuples, except for the time dimension (from which the + + time is taken). + + The input to `cell` at each time step will be a `Tensor` or (possibly + + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + + Used to copy-through state and zero-out outputs when past a batch + + element's sequence length. So it's more for correctness than performance. + + initial_state: (optional) An initial state for the RNN. + + If `cell.state_size` is an integer, this must be + + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + + If `cell.state_size` is a tuple, this should be a tuple of + + tensors having shapes `[batch_size, s] for s in cell.state_size`. + + dtype: (optional) The data type for the initial state and expected output. + + Required if initial_state is not provided or RNN state has a heterogeneous + + dtype. + + parallel_iterations: (Default: 32). The number of iterations to run in + + parallel. Those operations which do not have any temporal dependency + + and can be run in parallel, will be. This parameter trades off + + time for space. Values >> 1 use more memory but take less time, + + while smaller values use less memory but computations take longer. + + swap_memory: Transparently swap the tensors produced in forward inference + + but needed for back prop from GPU to CPU. This allows training RNNs + + which would typically not fit on a single GPU, with very minimal (or no) + + performance penalty. + + time_major: The shape format of the `inputs` and `outputs` Tensors. + + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + + Using `time_major = True` is a bit more efficient because it avoids + + transposes at the beginning and end of the RNN calculation. However, + + most TensorFlow data is batch-major, so by default this function + + accepts input and emits output in batch-major form. + + scope: VariableScope for the created subgraph; defaults to "rnn". + + + + Returns: + + A pair (outputs, state) where: + + + + outputs: The RNN output `Tensor`. + + + + If time_major == False (default), this will be a `Tensor` shaped: + + `[batch_size, max_time, cell.output_size]`. + + + + If time_major == True, this will be a `Tensor` shaped: + + `[max_time, batch_size, cell.output_size]`. + + + + Note, if `cell.output_size` is a (possibly nested) tuple of integers + + or `TensorShape` objects, then `outputs` will be a tuple having the + + same structure as `cell.output_size`, containing Tensors having shapes + + corresponding to the shape data in `cell.output_size`. + + + + state: The final state. If `cell.state_size` is an int, this + + will be shaped `[batch_size, cell.state_size]`. If it is a + + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + + be a tuple having the corresponding shapes. If cells are `LSTMCells` + + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + + + + Raises: + + TypeError: If `cell` is not an instance of RNNCell. 
+    ValueError: If inputs is None or an empty list.
+  """
+  if not _like_rnncell(cell):
+    raise TypeError("cell must be an instance of RNNCell")
+
+  # By default, time_major==False and inputs are batch-major: shaped
+  #   [batch, time, depth]
+  # For internal calculations, we transpose to [time, batch, depth]
+  flat_input = nest.flatten(inputs)
+
+  if not time_major:
+    # (B,T,D) => (T,B,D)
+    flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+    flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+  parallel_iterations = parallel_iterations or 32
+  if sequence_length is not None:
+    sequence_length = math_ops.to_int32(sequence_length)
+    if sequence_length.get_shape().ndims not in (None, 1):
+      raise ValueError(
+          "sequence_length must be a vector of length batch_size, "
+          "but saw shape: %s" % sequence_length.get_shape())
+    sequence_length = array_ops.identity(  # Just to find it in the graph.
+        sequence_length, name="sequence_length")
+
+  # Create a new scope in which the caching device is either
+  # determined by the parent scope, or is set to place the cached
+  # Variable using the same placement as for the rest of the RNN.
+  try:
+    reuse = tf.AUTO_REUSE
+  except AttributeError:
+    reuse = tf.compat.v1.AUTO_REUSE
+  with vs.variable_scope(scope or "rnn", reuse=reuse) as varscope:  # TODO: user-defined reuse
+    if varscope.caching_device is None:
+      varscope.set_caching_device(lambda op: op.device)
+    batch_size = _best_effort_input_batch_size(flat_input)
+
+    if initial_state is not None:
+      state = initial_state
+    else:
+      if not dtype:
+        raise ValueError("If there is no initial_state, you must give a dtype.")
+      state = cell.zero_state(batch_size, dtype)
+
+    def _assert_has_shape(x, shape):
+      x_shape = array_ops.shape(x)
+      packed_shape = array_ops.stack(shape)
+      return control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)),
+          ["Expected shape for Tensor %s is " % x.name,
+           packed_shape, " but saw shape: ", x_shape])
+
+    if sequence_length is not None:
+      # Perform some shape validation
+      with ops.control_dependencies(
+          [_assert_has_shape(sequence_length, [batch_size])]):
+        sequence_length = array_ops.identity(
+            sequence_length, name="CheckSeqLen")
+
+    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
+
+    (outputs, final_state) = _dynamic_rnn_loop(
+        cell,
+        inputs,
+        state,
+        parallel_iterations=parallel_iterations,
+        swap_memory=swap_memory,
+        att_scores=att_scores,
+        sequence_length=sequence_length,
+        dtype=dtype)
+
+    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
+    # If we are performing batch-major calculations, transpose output back
+    # to shape [batch, time, depth]
+    if not time_major:
+      # (T,B,D) => (B,T,D)
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
+
+    return (outputs, final_state)
+
+
+def _dynamic_rnn_loop(cell,
+                      inputs,
+                      initial_state,
+                      parallel_iterations,
+                      swap_memory,
+                      att_scores=None,
+                      sequence_length=None,
+                      dtype=None):
+  """Internal implementation of Dynamic RNN.
+
+  Args:
+    cell: An instance of RNNCell.
+    inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
+      tuple of such elements.
+    initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
+      `cell.state_size` is a tuple, then this should be a tuple of
+      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+    parallel_iterations: Positive Python int.
+    swap_memory: A Python boolean
+    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
+    dtype: (optional) Expected dtype of output. If not specified, inferred from
+      initial_state.
+
+  Returns:
+    Tuple `(final_outputs, final_state)`.
+    final_outputs:
+      A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
+      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
+      objects, then this returns a (possibly nested) tuple of Tensors matching
+      the corresponding shapes.
+    final_state:
+      A `Tensor`, or possibly nested tuple of Tensors, matching in length
+      and shapes to `initial_state`.
+
+  Raises:
+    ValueError: If the input depth cannot be inferred via shape inference
+      from the inputs.
+  """
+  state = initial_state
+  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"
+
+  state_size = cell.state_size
+
+  flat_input = nest.flatten(inputs)
+  flat_output_size = nest.flatten(cell.output_size)
+
+  # Construct an initial output
+  input_shape = array_ops.shape(flat_input[0])
+  time_steps = input_shape[0]
+  batch_size = _best_effort_input_batch_size(flat_input)
+
+  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
+                           for input_ in flat_input)
+
+  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
+
+  for shape in inputs_got_shape:
+    if not shape[2:].is_fully_defined():
+      raise ValueError(
+          "Input size (depth of inputs) must be accessible via shape inference,"
+          " but saw value None.")
+    got_time_steps = shape[0]
+    got_batch_size = shape[1]
+    if const_time_steps != got_time_steps:
+      raise ValueError(
+          "Time steps is not the same for all the elements in the input in a "
+          "batch.")
+    if const_batch_size != got_batch_size:
+      raise ValueError(
+          "Batch_size is not the same for all the elements in the input.")
+
+  # Prepare dynamic conditional copying of state & output
+  def _create_zero_arrays(size):
+    size = _concat(batch_size, size)
+    return array_ops.zeros(
+        array_ops.stack(size), _infer_state_dtype(dtype, state))
+
+  flat_zero_output = tuple(_create_zero_arrays(output)
+                           for output in flat_output_size)
+  zero_output = nest.pack_sequence_as(structure=cell.output_size,
+                                      flat_sequence=flat_zero_output)
+
+  if sequence_length is not None:
+    min_sequence_length = math_ops.reduce_min(sequence_length)
+    max_sequence_length = math_ops.reduce_max(sequence_length)
+
+  time = array_ops.constant(0, dtype=dtypes.int32, name="time")
+
+  with ops.name_scope("dynamic_rnn") as scope:
+    base_name = scope
+
+  def _create_ta(name, dtype):
+    return tensor_array_ops.TensorArray(dtype=dtype,
+                                        size=time_steps,
+                                        tensor_array_name=base_name + name)
+
+  output_ta = tuple(_create_ta("output_%d" % i,
+                               _infer_state_dtype(dtype, state))
+                    for i in range(len(flat_output_size)))
+  input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype)
+                   for i in range(len(flat_input)))
+
+  input_ta = tuple(ta.unstack(input_)
+                   for ta, input_ in zip(input_ta, flat_input))
+
+  def _time_step(time, output_ta_t, state, att_scores=None):
+    """Take a time step of the dynamic RNN.
+
+    Args:
+      time: int32 scalar Tensor.
+      output_ta_t: List of `TensorArray`s that represent the output.
+      state: nested tuple of vector tensors that represent the state.
+
+    Returns:
+      The tuple (time + 1, output_ta_t with updated flow, new_state).
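+      When `att_scores` is given, `att_scores[:, time, :]` is forwarded to
+      the cell as its `att_score` argument.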
+ + """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) diff --git a/modelzoo/FwFM/script/contrib/utils.py b/modelzoo/FwFM/script/contrib/utils.py new file mode 100644 index 00000000000..692f4ef6e89 --- /dev/null +++ b/modelzoo/FwFM/script/contrib/utils.py @@ -0,0 +1,378 @@ +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops.rnn_cell import * +from tensorflow.python.util import nest + +_BIAS_VARIABLE_NAME = "bias" + +_WEIGHTS_VARIABLE_NAME = "kernel" + + +class _Linear_(object): + """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. + + + + Args: + + args: a 2D Tensor or a list of 2D, batch x n, Tensors. + + output_size: int, second dimension of weight variable. + + dtype: data type for variables. + + build_bias: boolean, whether to build a bias variable. + + bias_initializer: starting value to initialize the bias + + (default is all zeros). + + kernel_initializer: starting value to initialize the weight. + + + + Raises: + + ValueError: if inputs_shape is wrong. 
+ + """ + + def __init__(self, + + args, + + output_size, + + build_bias, + + bias_initializer=None, + + kernel_initializer=None): + + self._build_bias = build_bias + + if args is None or (nest.is_sequence(args) and not args): + raise ValueError("`args` must be specified") + + if not nest.is_sequence(args): + + args = [args] + + self._is_sequence = False + + else: + + self._is_sequence = True + + # Calculate the total size of arguments on dimension 1. + + total_arg_size = 0 + + shapes = [a.get_shape() for a in args] + + for shape in shapes: + + if shape.ndims != 2: + raise ValueError( + "linear is expecting 2D arguments: %s" % shapes) + + if shape[1] is None: + + raise ValueError("linear expects shape[1] to be provided for shape %s, " + + "but saw %s" % (shape, shape[1])) + + else: + + total_arg_size += int(shape[1])#.value + + dtype = [a.dtype for a in args][0] + + scope = vs.get_variable_scope() + + with vs.variable_scope(scope) as outer_scope: + + self._weights = vs.get_variable( + + _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], + + dtype=dtype, + + initializer=kernel_initializer) + + if build_bias: + + with vs.variable_scope(outer_scope) as inner_scope: + + inner_scope.set_partitioner(None) + + if bias_initializer is None: + bias_initializer = init_ops.constant_initializer( + 0.0, dtype=dtype) + + self._biases = vs.get_variable( + + _BIAS_VARIABLE_NAME, [output_size], + + dtype=dtype, + + initializer=bias_initializer) + + def __call__(self, args): + + if not self._is_sequence: + args = [args] + + if len(args) == 1: + + res = math_ops.matmul(args[0], self._weights) + + else: + + res = math_ops.matmul(array_ops.concat(args, 1), self._weights) + + if self._build_bias: + res = nn_ops.bias_add(res, self._biases) + + return res + + +try: + from tensorflow.python.ops.rnn_cell_impl import _Linear +except: + _Linear = _Linear_ + + +class QAAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + + Args: + + num_units: int, The number of units in the GRU cell. + + activation: Nonlinearity to use. Default: `tanh`. + + reuse: (optional) Python boolean describing whether to reuse variables + + in an existing scope. If not `True`, and the existing scope already has + + the given variables, an error is raised. + + kernel_initializer: (optional) The initializer to use for the weight and + + projection matrices. + + bias_initializer: (optional) The initializer to use for the bias. + + """ + + def __init__(self, + + num_units, + + activation=None, + + reuse=None, + + kernel_initializer=None, + + bias_initializer=None): + + super(QAAttGRUCell, self).__init__(_reuse=reuse) + + self._num_units = num_units + + self._activation = activation or math_ops.tanh + + self._kernel_initializer = kernel_initializer + + self._bias_initializer = bias_initializer + + self._gate_linear = None + + self._candidate_linear = None + + @property + def state_size(self): + + return self._num_units + + @property + def output_size(self): + + return self._num_units + + def __call__(self, inputs, state, att_score): + + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + + if self._gate_linear is None: + + bias_ones = self._bias_initializer + + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer( + 1.0, dtype=inputs.dtype) + + with vs.variable_scope("gates"): # Reset gate and update gate. 
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + new_h = (1. - att_score) * state + att_score * c + + return new_h, new_h + + +class VecAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + + Args: + + num_units: int, The number of units in the GRU cell. + + activation: Nonlinearity to use. Default: `tanh`. + + reuse: (optional) Python boolean describing whether to reuse variables + + in an existing scope. If not `True`, and the existing scope already has + + the given variables, an error is raised. + + kernel_initializer: (optional) The initializer to use for the weight and + + projection matrices. + + bias_initializer: (optional) The initializer to use for the bias. + + """ + + def __init__(self, + + num_units, + + activation=None, + + reuse=None, + + kernel_initializer=None, + + bias_initializer=None): + + super(VecAttGRUCell, self).__init__(_reuse=reuse) + + self._num_units = num_units + + self._activation = activation or math_ops.tanh + + self._kernel_initializer = kernel_initializer + + self._bias_initializer = bias_initializer + + self._gate_linear = None + + self._candidate_linear = None + + @property + def state_size(self): + + return self._num_units + + @property + def output_size(self): + + return self._num_units + + def __call__(self, inputs, state, att_score): + + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + + if self._gate_linear is None: + + bias_ones = self._bias_initializer + + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer( + 1.0, dtype=inputs.dtype) + + with vs.variable_scope("gates"): # Reset gate and update gate. 
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + u = (1.0 - att_score) * u + + new_h = u * state + (1 - u) * c + + return new_h, new_h diff --git a/modelzoo/FwFM/script/estimator/__init__.py b/modelzoo/FwFM/script/estimator/__init__.py new file mode 100644 index 00000000000..cf4f59d6c09 --- /dev/null +++ b/modelzoo/FwFM/script/estimator/__init__.py @@ -0,0 +1 @@ +from .models import * \ No newline at end of file diff --git a/modelzoo/FwFM/script/estimator/feature_column.py b/modelzoo/FwFM/script/estimator/feature_column.py new file mode 100644 index 00000000000..c8d7a6cd013 --- /dev/null +++ b/modelzoo/FwFM/script/estimator/feature_column.py @@ -0,0 +1,52 @@ +import tensorflow as tf +from tensorflow.python.feature_column.feature_column import _EmbeddingColumn + +from .utils import LINEAR_SCOPE_NAME, variable_scope, get_collection, get_GraphKeys, input_layer, get_losses + + +def linear_model(features, linear_feature_columns): + if tf.__version__ >= '2.0.0': + linear_logits = tf.compat.v1.feature_column.linear_model(features, linear_feature_columns) + else: + linear_logits = tf.feature_column.linear_model(features, linear_feature_columns) + return linear_logits + + +def get_linear_logit(features, linear_feature_columns, l2_reg_linear=0): + with variable_scope(LINEAR_SCOPE_NAME): + if not linear_feature_columns: + linear_logits = tf.Variable([[0.0]], name='bias_weights') + else: + + linear_logits = linear_model(features, linear_feature_columns) + + if l2_reg_linear > 0: + for var in get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME)[:-1]: + get_losses().add_loss(l2_reg_linear * tf.nn.l2_loss(var, name=var.name.split(":")[0] + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + return linear_logits + + +def input_from_feature_columns(features, feature_columns, l2_reg_embedding=0.0): + dense_value_list = [] + sparse_emb_list = [] + for feat in feature_columns: + if is_embedding(feat): + sparse_emb = tf.expand_dims(input_layer(features, [feat]), axis=1) + sparse_emb_list.append(sparse_emb) + if l2_reg_embedding > 0: + get_losses().add_loss(l2_reg_embedding * tf.nn.l2_loss(sparse_emb, name=feat.name + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + + else: + dense_value_list.append(input_layer(features, [feat])) + + return sparse_emb_list, dense_value_list + + +def is_embedding(feature_column): + try: + from tensorflow.python.feature_column.feature_column_v2 import EmbeddingColumn + except ImportError: + EmbeddingColumn = _EmbeddingColumn + return isinstance(feature_column, (_EmbeddingColumn, EmbeddingColumn)) diff --git a/modelzoo/FwFM/script/estimator/inputs.py b/modelzoo/FwFM/script/estimator/inputs.py new file mode 100644 index 00000000000..2c175a9934e --- /dev/null +++ b/modelzoo/FwFM/script/estimator/inputs.py @@ -0,0 +1,52 @@ +import tensorflow as tf + + +def input_fn_pandas(df, features, label=None, batch_size=256, num_epochs=1, shuffle=False, 
queue_capacity_factor=10, + num_threads=1): + if label is not None: + y = df[label] + else: + y = None + if tf.__version__ >= "2.0.0": + return tf.compat.v1.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, + num_epochs=num_epochs, + shuffle=shuffle, + queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + return tf.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, num_epochs=num_epochs, + shuffle=shuffle, queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + +def input_fn_tfrecord(filenames, feature_description, label=None, batch_size=256, num_epochs=1, num_parallel_calls=8, + shuffle_factor=10, prefetch_factor=1, + ): + def _parse_examples(serial_exmp): + try: + features = tf.parse_single_example(serial_exmp, features=feature_description) + except AttributeError: + features = tf.io.parse_single_example(serial_exmp, features=feature_description) + if label is not None: + labels = features.pop(label) + return features, labels + return features + + def input_fn(): + dataset = tf.data.TFRecordDataset(filenames) + dataset = dataset.map(_parse_examples, num_parallel_calls=num_parallel_calls) + if shuffle_factor > 0: + dataset = dataset.shuffle(buffer_size=batch_size * shuffle_factor) + + dataset = dataset.repeat(num_epochs).batch(batch_size) + + if prefetch_factor > 0: + dataset = dataset.prefetch(buffer_size=batch_size * prefetch_factor) + try: + iterator = dataset.make_one_shot_iterator() + except AttributeError: + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + + return iterator.get_next() + + return input_fn diff --git a/modelzoo/FwFM/script/estimator/models/__init__.py b/modelzoo/FwFM/script/estimator/models/__init__.py new file mode 100644 index 00000000000..9bc1e120dbc --- /dev/null +++ b/modelzoo/FwFM/script/estimator/models/__init__.py @@ -0,0 +1,13 @@ +from .afm import AFMEstimator +from .autoint import AutoIntEstimator +from .ccpm import CCPMEstimator +from .dcn import DCNEstimator +from .deepfm import DeepFMEstimator +from .fwfm import FwFMEstimator +from .fibinet import FiBiNETEstimator +from .fnn import FNNEstimator +from .nfm import NFMEstimator +from .pnn import PNNEstimator +from .wdl import WDLEstimator +from .xdeepfm import xDeepFMEstimator +from .deepfefm import DeepFEFMEstimator diff --git a/modelzoo/FwFM/script/estimator/models/fwfm.py b/modelzoo/FwFM/script/estimator/models/fwfm.py new file mode 100644 index 00000000000..059331643de --- /dev/null +++ b/modelzoo/FwFM/script/estimator/models/fwfm.py @@ -0,0 +1,84 @@ +# -*- coding:utf-8 -*- +""" +Author: + Weichen Shen, weichenswc@163.com + Harshit Pande + +Reference: + [1] Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising + (https://arxiv.org/pdf/1806.03514.pdf) + +""" + +import tensorflow as tf + +from ..feature_column import get_linear_logit, input_from_feature_columns +from ..utils import DNN_SCOPE_NAME, deepctr_model_fn, variable_scope +from ...layers.core import DNN +from ...layers.interaction import FwFMLayer +from ...layers.utils import concat_func, add_func, combined_dnn_input + + +def FwFMEstimator(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128, 64), + l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_field_strength=0.00001, l2_reg_dnn=0, + seed=1024, dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False, task='binary', model_dir=None, + config=None, linear_optimizer='Ftrl', + dnn_optimizer='Adagrad', 
+                   training_chief_hooks=None):
+    """Instantiates the DeepFwFM Network architecture.
+
+    :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
+    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
+    :param dnn_hidden_units: list, list of positive integers (or an empty list to disable the DNN), the layer number
+        and units in each layer of the DNN
+    :param l2_reg_linear: float. L2 regularizer strength applied to the linear part
+    :param l2_reg_field_strength: float. L2 regularizer strength applied to the field pair strength parameters
+    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vectors
+    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN
+    :param seed: integer, to use as random seed.
+    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+    :param dnn_activation: Activation function to use in the DNN
+    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the DNN
+    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+    :param model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
+    :param config: tf.RunConfig object to configure the runtime settings.
+    :param linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the linear part of the model. Defaults to the FTRL optimizer.
+    :param dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the deep part of the model. Defaults to the Adagrad optimizer.
+    :param training_chief_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run on the chief worker during training.
+    :return: A Tensorflow Estimator instance.
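+
+    Example (a minimal sketch; the feature columns below are placeholders):
+
+        fc = tf.feature_column
+        linear_cols = [fc.numeric_column('I1')]
+        dnn_cols = [fc.embedding_column(
+            fc.categorical_column_with_identity('C1', num_buckets=100), 4)]
+        model = FwFMEstimator(linear_cols, dnn_cols,
+                              dnn_hidden_units=(64, 32), task='binary')
+        # model.train(input_fn=...); model.evaluate(input_fn=...)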
+ + """ + + def _model_fn(features, labels, mode, config): + train_flag = (mode == tf.estimator.ModeKeys.TRAIN) + + linear_logits = get_linear_logit(features, linear_feature_columns, l2_reg_linear=l2_reg_linear) + final_logit_components = [linear_logits] + with variable_scope(DNN_SCOPE_NAME): + sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, + l2_reg_embedding=l2_reg_embedding) + + fwfm_logit = FwFMLayer(num_fields=len(sparse_embedding_list), regularizer=l2_reg_field_strength)( + concat_func(sparse_embedding_list, axis=1)) + + final_logit_components.append(fwfm_logit) + + if dnn_hidden_units: + dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list) + + dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(dnn_input, training=train_flag) + dnn_logit = tf.keras.layers.Dense( + 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(dnn_output) + final_logit_components.append(dnn_logit) + + logits = add_func(final_logit_components) + + return deepctr_model_fn(features, mode, logits, labels, task, linear_optimizer, dnn_optimizer, + training_chief_hooks=training_chief_hooks) + + return tf.estimator.Estimator(_model_fn, model_dir=model_dir, config=config) diff --git a/modelzoo/FwFM/script/estimator/utils.py b/modelzoo/FwFM/script/estimator/utils.py new file mode 100644 index 00000000000..5d722515f6b --- /dev/null +++ b/modelzoo/FwFM/script/estimator/utils.py @@ -0,0 +1,217 @@ +import tensorflow as tf +from tensorflow.python.estimator.canned.head import _Head +from tensorflow.python.estimator.canned.optimizers import get_optimizer_instance + +LINEAR_SCOPE_NAME = 'linear' +DNN_SCOPE_NAME = 'dnn' + + +def _summary_key(head_name, val): + return '%s/%s' % (val, head_name) if head_name else val + + +class Head(_Head): + + def __init__(self, task, + name=None): + self._task = task + self._name = name + + @property + def name(self): + return self._name + + @property + def logits_dimension(self): + return 1 + + def _eval_metric_ops(self, + labels, + logits, + predictions, + unweighted_loss, + weights=None): + + labels = to_float(labels) + predictions = to_float(predictions) + + # with name_scope(None, 'metrics', (labels, logits, predictions, + # unweighted_loss, weights)): + metrics = get_metrics() + losses = get_losses() + + metric_ops = { + _summary_key(self._name, "prediction/mean"): metrics.mean(predictions, weights=weights), + _summary_key(self._name, "label/mean"): metrics.mean(labels, weights=weights), + } + + summary_scalar("prediction/mean", metric_ops[_summary_key(self._name, "prediction/mean")][1]) + summary_scalar("label/mean", metric_ops[_summary_key(self._name, "label/mean")][1]) + + + mean_loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.MEAN) + + if self._task == "binary": + metric_ops[_summary_key(self._name, "LogLoss")] = metrics.mean(mean_loss, weights=weights, ) + summary_scalar("LogLoss", mean_loss) + + metric_ops[_summary_key(self._name, "AUC")] = metrics.auc(labels, predictions, weights=weights) + summary_scalar("AUC", metric_ops[_summary_key(self._name, "AUC")][1]) + else: + + metric_ops[_summary_key(self._name, "MSE")] = metrics.mean_squared_error(labels, predictions, + weights=weights) + summary_scalar("MSE", mean_loss) + + metric_ops[_summary_key(self._name, "MAE")] = metrics.mean_absolute_error(labels, predictions, + weights=weights) + summary_scalar("MAE", metric_ops[_summary_key(self._name, 
"MAE")][1]) + + return metric_ops + + def create_loss(self, features, mode, logits, labels): + del mode, features # Unused for this head. + losses = get_losses() + if self._task == "binary": + loss = losses.sigmoid_cross_entropy(labels, logits, reduction=losses.Reduction.NONE) + else: + loss = losses.mean_squared_error(labels, logits, reduction=losses.Reduction.NONE) + return loss + + def create_estimator_spec( + self, features, mode, logits, labels=None, train_op_fn=None, training_chief_hooks=None): + # with name_scope('head'): + logits = tf.reshape(logits, [-1, 1]) + if self._task == 'binary': + pred = tf.sigmoid(logits) + else: + pred = logits + + predictions = {"pred": pred, "logits": logits} + export_outputs = {"predict": tf.estimator.export.PredictOutput(predictions)} + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + export_outputs=export_outputs) + + labels = tf.reshape(labels, [-1, 1]) + + unweighted_loss = self.create_loss(features, mode, logits, labels) + + losses = get_losses() + loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.SUM) + reg_loss = losses.get_regularization_loss() + + training_loss = loss + reg_loss + + eval_metric_ops = self._eval_metric_ops(labels, logits, pred, unweighted_loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=training_loss, + train_op=train_op_fn(training_loss), + eval_metric_ops=eval_metric_ops, + training_chief_hooks=training_chief_hooks) + + +def deepctr_model_fn(features, mode, logits, labels, task, linear_optimizer, dnn_optimizer, training_chief_hooks): + linear_optimizer = get_optimizer_instance(linear_optimizer, 0.005) + dnn_optimizer = get_optimizer_instance(dnn_optimizer, 0.01) + train_op_fn = get_train_op_fn(linear_optimizer, dnn_optimizer) + + head = Head(task) + return head.create_estimator_spec(features=features, + mode=mode, + labels=labels, + train_op_fn=train_op_fn, + logits=logits, training_chief_hooks=training_chief_hooks) + + +def get_train_op_fn(linear_optimizer, dnn_optimizer): + def _train_op_fn(loss): + train_ops = [] + try: + global_step = tf.train.get_global_step() + except AttributeError: + global_step = tf.compat.v1.train.get_global_step() + linear_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME) + dnn_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, DNN_SCOPE_NAME) + + if len(dnn_var_list) > 0: + train_ops.append( + dnn_optimizer.minimize( + loss, + var_list=dnn_var_list)) + if len(linear_var_list) > 0: + train_ops.append( + linear_optimizer.minimize( + loss, + var_list=linear_var_list)) + + train_op = tf.group(*train_ops) + with tf.control_dependencies([train_op]): + try: + return tf.assign_add(global_step, 1).op + except AttributeError: + return tf.compat.v1.assign_add(global_step, 1).op + + return _train_op_fn + + +def variable_scope(name_or_scope): + try: + return tf.variable_scope(name_or_scope) + except AttributeError: + return tf.compat.v1.variable_scope(name_or_scope) + +def get_collection(key, scope=None): + try: + return tf.get_collection(key, scope=scope) + except AttributeError: + return tf.compat.v1.get_collection(key, scope=scope) + + +def get_GraphKeys(): + try: + return tf.GraphKeys + except AttributeError: + return tf.compat.v1.GraphKeys + + +def get_losses(): + try: + return tf.compat.v1.losses + except AttributeError: + return tf.losses + + +def input_layer(features, feature_columns): + try: + 
return tf.feature_column.input_layer(features, feature_columns) + except AttributeError: + return tf.compat.v1.feature_column.input_layer(features, feature_columns) + + +def get_metrics(): + try: + return tf.compat.v1.metrics + except AttributeError: + return tf.metrics + + +def to_float(x, name="ToFloat"): + try: + return tf.to_float(x, name) + except AttributeError: + return tf.compat.v1.to_float(x, name) + + +def summary_scalar(name, data): + try: + tf.summary.scalar(name, data) + except AttributeError: # tf version 2.5.0+:AttributeError: module 'tensorflow._api.v2.summary' has no attribute 'scalar' + tf.compat.v1.summary.scalar(name, data) \ No newline at end of file diff --git a/modelzoo/FwFM/script/feature_column.py b/modelzoo/FwFM/script/feature_column.py new file mode 100644 index 00000000000..3b778360b33 --- /dev/null +++ b/modelzoo/FwFM/script/feature_column.py @@ -0,0 +1,249 @@ +import tensorflow as tf +from collections import namedtuple, OrderedDict +from copy import copy +from itertools import chain + +from tensorflow.python.keras.initializers import RandomNormal, Zeros +from tensorflow.python.keras.layers import Input, Lambda + +from .inputs import create_embedding_matrix, embedding_lookup, get_dense_input, varlen_embedding_lookup, \ + get_varlen_pooling_list, mergeDict +from .layers import Linear +from .layers.utils import concat_func +#from keras import backend as K +import pandas as pd +import numpy as np + +fi = open('../../deep_ctr_master/data/fm.model.txt','r') + +first = True +feat_weights={} +k=0 +for line in fi: + s = line.strip().split() + if first: + first = False + w_0 = float(s[0]) + feat_num = int(s[1]) + k = int(s[2]) + 1 # w and v + + else: + feat = int(s[0]) + weights = [float(s[1 + i]) for i in range(k)] + feat_weights[feat] = weights + +list1 =[] +for col,val in feat_weights.items(): + list1.append(val) + +# def my_init(shape,dtype=None): +# weight = np.array(list1) +# +# return weight.reshape(shape) + + +DEFAULT_GROUP_NAME = "default_group" + + +class SparseFeat(namedtuple('SparseFeat', + ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer', + 'embedding_name', + 'group_name', 'trainable'])): + __slots__ = () + + def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None, + embedding_name=None, + group_name=DEFAULT_GROUP_NAME, trainable=True): + + if embedding_dim == "auto": + embedding_dim = 6 * int(pow(vocabulary_size, 0.25)) + if embeddings_initializer is None: + embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020) + # if embeddings_initializer=='fm': + # embeddings_initializer = my_init(shape=(vocabulary_size,embedding_dim)) + + + + if embedding_name is None: + embedding_name = name + + return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, + embeddings_initializer, + embedding_name, group_name, trainable) + + def __hash__(self): + return self.name.__hash__() + + +class VarLenSparseFeat(namedtuple('VarLenSparseFeat', + ['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])): + __slots__ = () + + def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True): + return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, combiner, length_name, weight_name, + weight_norm) + + @property + def name(self): + return self.sparsefeat.name + + @property + def 
vocabulary_size(self): + return self.sparsefeat.vocabulary_size + + @property + def embedding_dim(self): + return self.sparsefeat.embedding_dim + + @property + def use_hash(self): + return self.sparsefeat.use_hash + + @property + def vocabulary_path(self): + return self.sparsefeat.vocabulary_path + + @property + def dtype(self): + return self.sparsefeat.dtype + + @property + def embeddings_initializer(self): + return self.sparsefeat.embeddings_initializer + + @property + def embedding_name(self): + return self.sparsefeat.embedding_name + + @property + def group_name(self): + return self.sparsefeat.group_name + + @property + def trainable(self): + return self.sparsefeat.trainable + + def __hash__(self): + return self.name.__hash__() + + +class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype', 'transform_fn'])): + """ Dense feature + Args: + name: feature name, + dimension: dimension of the feature, default = 1. + dtype: dtype of the feature, default="float32". + transform_fn: If not `None` , a function that can be used to transform + values of the feature. the function takes the input Tensor as its + argument, and returns the output Tensor. + (e.g. lambda x: (x - 3.0) / 4.2). + """ + __slots__ = () + + def __new__(cls, name, dimension=1, dtype="float32", transform_fn=None): + return super(DenseFeat, cls).__new__(cls, name, dimension, dtype, transform_fn) + + def __hash__(self): + return self.name.__hash__() + + # def __eq__(self, other): + # if self.name == other.name: + # return True + # return False + + # def __repr__(self): + # return 'DenseFeat:'+self.name + + +def get_feature_names(feature_columns): + features = build_input_features(feature_columns) + return list(features.keys()) + + +def build_input_features(feature_columns, prefix=''): + input_features = OrderedDict() + for fc in feature_columns: + if isinstance(fc, SparseFeat): + input_features[fc.name] = Input( + shape=(1,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, DenseFeat): + input_features[fc.name] = Input( + shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, VarLenSparseFeat): + input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name, + dtype=fc.dtype) + if fc.weight_name is not None: + input_features[fc.weight_name] = Input(shape=(fc.maxlen, 1), name=prefix + fc.weight_name, + dtype="float32") + if fc.length_name is not None: + input_features[fc.length_name] = Input((1,), name=prefix + fc.length_name, dtype='int32') + + else: + raise TypeError("Invalid feature column type,got", type(fc)) + + return input_features + + +def get_linear_logit(features, feature_columns, units=1, use_bias=False, seed=1024, prefix='linear', + l2_reg=0, sparse_feat_refine_weight=None): + linear_feature_columns = copy(feature_columns) + for i in range(len(linear_feature_columns)): + if isinstance(linear_feature_columns[i], SparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace(embedding_dim=1, + embeddings_initializer=Zeros()) + if isinstance(linear_feature_columns[i], VarLenSparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace( + sparsefeat=linear_feature_columns[i].sparsefeat._replace(embedding_dim=1, + embeddings_initializer=Zeros())) + + linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, + prefix=prefix + str(i))[0] for i in range(units)] + _, dense_input_list = input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix) + + linear_logit_list = 
[] + for i in range(units): + + if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0: + sparse_input = concat_func(linear_emb_list[i]) + dense_input = concat_func(dense_input_list) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias, seed=seed)([sparse_input, dense_input]) + elif len(linear_emb_list[i]) > 0: + sparse_input = concat_func(linear_emb_list[i]) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias, seed=seed)(sparse_input) + elif len(dense_input_list) > 0: + dense_input = concat_func(dense_input_list) + linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias, seed=seed)(dense_input) + else: #empty feature_columns + return Lambda(lambda x: tf.constant([[0.0]]))(list(features.values())[0]) + linear_logit_list.append(linear_logit) + + return concat_func(linear_logit_list) + + +def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, + support_dense=True, support_group=False): + sparse_feature_columns = list( + filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] + + embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, + seq_mask_zero=seq_mask_zero) + group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) + dense_value_list = get_dense_input(features, feature_columns) + if not support_dense and len(dense_value_list) > 0: + raise ValueError("DenseFeat is not supported in dnn_feature_columns") + + sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, varlen_sparse_feature_columns) + group_varlen_sparse_embedding_dict = get_varlen_pooling_list(sequence_embed_dict, features, + varlen_sparse_feature_columns) + group_embedding_dict = mergeDict(group_sparse_embedding_dict, group_varlen_sparse_embedding_dict) + if not support_group: + group_embedding_dict = list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict, dense_value_list diff --git a/modelzoo/FwFM/script/inputs.py b/modelzoo/FwFM/script/inputs.py new file mode 100644 index 00000000000..d567f846265 --- /dev/null +++ b/modelzoo/FwFM/script/inputs.py @@ -0,0 +1,155 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +from collections import defaultdict +from itertools import chain + +from tensorflow.python.keras.layers import Embedding, Lambda +from tensorflow.python.keras.regularizers import l2 + +from .layers.sequence import SequencePoolingLayer, WeightedSequenceLayer +from .layers.utils import Hash + + +def get_inputs_list(inputs): + return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) + + +def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, + prefix='sparse_', seq_mask_zero=True): + sparse_embedding = {} + for feat in sparse_feature_columns: + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2(l2_reg), + name=prefix + '_emb_' + 
feat.embedding_name) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + + if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: + for feat in varlen_sparse_feature_columns: + # if feat.name not in sparse_embedding: + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2( + l2_reg), + name=prefix + '_seq_emb_' + feat.name, + mask_zero=seq_mask_zero) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + return sparse_embedding + + +def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()): + embedding_vec_list = [] + for fg in sparse_feature_columns: + feat_name = fg.name + if len(return_feat_list) == 0 or feat_name in return_feat_list: + if fg.use_hash: + lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name]) + else: + lookup_idx = input_dict[feat_name] + + embedding_vec_list.append(embedding_dict[feat_name](lookup_idx)) + + return embedding_vec_list + + +def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): + from . import feature_column as fc_lib + + sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.VarLenSparseFeat), feature_columns)) if feature_columns else [] + sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, + l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) + return sparse_emb_dict + + +def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), + mask_feat_list=(), to_list=False): + group_embedding_dict = defaultdict(list) + for fc in sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if (len(return_feat_list) == 0 or feature_name in return_feat_list): + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)( + sparse_input_dict[feature_name]) + else: + lookup_idx = sparse_input_dict[feature_name] + + group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx)) + if to_list: + return list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict + + +def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns): + varlen_embedding_vec_dict = {} + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name]) + else: + lookup_idx = sequence_input_dict[feature_name] + varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx) + return varlen_embedding_vec_dict + + +def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False): + pooling_vec_list = defaultdict(list) + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + combiner = fc.combiner + feature_length_name = fc.length_name + if feature_length_name is not None: + if fc.weight_name is not None: + seq_input = 
WeightedSequenceLayer(weight_normalization=fc.weight_norm)( + [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=False)( + [seq_input, features[feature_length_name]]) + else: + if fc.weight_name is not None: + seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)( + [embedding_dict[feature_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=True)( + seq_input) + pooling_vec_list[fc.group_name].append(vec) + if to_list: + return chain.from_iterable(pooling_vec_list.values()) + return pooling_vec_list + + +def get_dense_input(features, feature_columns): + from . import feature_column as fc_lib + dense_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.DenseFeat), feature_columns)) if feature_columns else [] + dense_input_list = [] + for fc in dense_feature_columns: + if fc.transform_fn is None: + dense_input_list.append(features[fc.name]) + else: + transform_result = Lambda(fc.transform_fn)(features[fc.name]) + dense_input_list.append(transform_result) + return dense_input_list + + +def mergeDict(a, b): + c = defaultdict(list) + for k, v in a.items(): + c[k].extend(v) + for k, v in b.items(): + c[k].extend(v) + return c diff --git a/modelzoo/FwFM/script/layers/__init__.py b/modelzoo/FwFM/script/layers/__init__.py new file mode 100644 index 00000000000..1bfd40effe7 --- /dev/null +++ b/modelzoo/FwFM/script/layers/__init__.py @@ -0,0 +1,52 @@ +import tensorflow as tf + +from .activation import Dice +from .core import DNN, LocalActivationUnit, PredictionLayer +from .interaction import (CIN, FM, AFMLayer, BiInteractionPooling, CrossNet, CrossNetMix, + InnerProductLayer, InteractingLayer, + OutterProductLayer, FGCNNLayer, SENETLayer, BilinearInteraction, + FieldWiseBiInteraction, FwFMLayer, FEFMLayer) +from .normalization import LayerNormalization +from .sequence import (AttentionSequencePoolingLayer, BiasEncoding, BiLSTM, + KMaxPooling, SequencePoolingLayer, WeightedSequenceLayer, + Transformer, DynamicGRU,PositionEncoding) + +from .utils import NoMask, Hash, Linear, _Add, combined_dnn_input, softmax, reduce_sum + +custom_objects = {'tf': tf, + 'InnerProductLayer': InnerProductLayer, + 'OutterProductLayer': OutterProductLayer, + 'DNN': DNN, + 'PredictionLayer': PredictionLayer, + 'FM': FM, + 'AFMLayer': AFMLayer, + 'CrossNet': CrossNet, + 'CrossNetMix': CrossNetMix, + 'BiInteractionPooling': BiInteractionPooling, + 'LocalActivationUnit': LocalActivationUnit, + 'Dice': Dice, + 'SequencePoolingLayer': SequencePoolingLayer, + 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, + 'CIN': CIN, + 'InteractingLayer': InteractingLayer, + 'LayerNormalization': LayerNormalization, + 'BiLSTM': BiLSTM, + 'Transformer': Transformer, + 'NoMask': NoMask, + 'BiasEncoding': BiasEncoding, + 'KMaxPooling': KMaxPooling, + 'FGCNNLayer': FGCNNLayer, + 'Hash': Hash, + 'Linear': Linear, + 'DynamicGRU': DynamicGRU, + 'SENETLayer': SENETLayer, + 'BilinearInteraction': BilinearInteraction, + 'WeightedSequenceLayer': WeightedSequenceLayer, + '_Add': _Add, + 'FieldWiseBiInteraction': FieldWiseBiInteraction, + 'FwFMLayer': FwFMLayer, + 'softmax': softmax, + 'FEFMLayer': FEFMLayer, + 'reduce_sum': reduce_sum, + 'PositionEncoding':PositionEncoding + } diff --git a/modelzoo/FwFM/script/layers/activation.py 
b/modelzoo/FwFM/script/layers/activation.py new file mode 100644 index 00000000000..1b953bff8bc --- /dev/null +++ b/modelzoo/FwFM/script/layers/activation.py @@ -0,0 +1,85 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf + +try: + from tensorflow.python.ops.init_ops import Zeros +except ImportError: + from tensorflow.python.ops.init_ops_v2 import Zeros +from tensorflow.python.keras.layers import Layer, Activation + +try: + from tensorflow.python.keras.layers import BatchNormalization +except ImportError: + BatchNormalization = tf.keras.layers.BatchNormalization + +try: + unicode +except NameError: + unicode = str + + +class Dice(Layer): + """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. + + Input shape + - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. + + Output shape + - Same shape as the input. + + Arguments + - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). + + - **epsilon** : Small float added to variance to avoid dividing by zero. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, axis=-1, epsilon=1e-9, **kwargs): + self.axis = axis + self.epsilon = epsilon + super(Dice, self).__init__(**kwargs) + + def build(self, input_shape): + self.bn = BatchNormalization( + axis=self.axis, epsilon=self.epsilon, center=False, scale=False) + self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( + ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name + super(Dice, self).build(input_shape) # Be sure to call this somewhere! + self.uses_learning_phase = True + + def call(self, inputs, training=None, **kwargs): + inputs_normed = self.bn(inputs, training=training) + # tf.layers.batch_normalization( + # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) + x_p = tf.sigmoid(inputs_normed) + return self.alphas * (1.0 - x_p) * inputs + x_p * inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'axis': self.axis, 'epsilon': self.epsilon} + base_config = super(Dice, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def activation_layer(activation): + if activation in ("dice", "Dice"): + act_layer = Dice() + elif isinstance(activation, (str, unicode)): + act_layer = Activation(activation) + elif issubclass(activation, Layer): + act_layer = activation() + else: + raise ValueError( + "Invalid activation,found %s.You should use a str or a Activation Layer Class." 
% (activation)) + return act_layer diff --git a/modelzoo/FwFM/script/layers/core.py b/modelzoo/FwFM/script/layers/core.py new file mode 100644 index 00000000000..668348d2eb7 --- /dev/null +++ b/modelzoo/FwFM/script/layers/core.py @@ -0,0 +1,267 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf +from tensorflow.python.keras import backend as K + +try: + from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal +except ImportError: + from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal + +from tensorflow.python.keras.layers import Layer, Dropout + +try: + from tensorflow.python.keras.layers import BatchNormalization +except ImportError: + BatchNormalization = tf.keras.layers.BatchNormalization +from tensorflow.python.keras.regularizers import l2 + +from .activation import activation_layer + + +class LocalActivationUnit(Layer): + """The LocalActivationUnit used in DIN with which the representation of + user interests varies adaptively given different candidate items. + + Input shape + - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` + + Output shape + - 3D tensor with shape: ``(batch_size, T, 1)``. + + Arguments + - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **activation**: Activation function to use in attention net. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. + + - **seed**: A Python integer to use as random seed. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, + **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.seed = seed + super(LocalActivationUnit, self).__init__(**kwargs) + self.supports_masking = True + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) != 2: + raise ValueError('A `LocalActivationUnit` layer should be called ' + 'on a list of 2 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: + raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( + len(input_shape[0]), len(input_shape[1]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: + raise ValueError('A `LocalActivationUnit` layer requires ' + 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' + 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) + size = 4 * \ + int(input_shape[0][-1] + ) if len(self.hidden_units) == 0 else self.hidden_units[-1] + self.kernel = self.add_weight(shape=(size, 1), + initializer=glorot_normal( + seed=self.seed), + name="kernel") + self.bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="bias") + self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed) + + super(LocalActivationUnit, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + query, keys = inputs + + keys_len = keys.get_shape()[1] + queries = K.repeat_elements(query, keys_len, 1) + + att_input = tf.concat( + [queries, keys, queries - keys, queries * keys], axis=-1) + + att_out = self.dnn(att_input, training=training) + + attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias) + + return attention_score + + def compute_output_shape(self, input_shape): + return input_shape[1][:2] + (1,) + + def compute_mask(self, inputs, mask): + return mask + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} + base_config = super(LocalActivationUnit, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DNN(Layer): + """The Multi Layer Percetron + + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. + + Output shape + - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. + + Arguments + - **hidden_units**:list of positive integer, the layer number and units in each layer. + + - **activation**: Activation function to use. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not. + + - **output_activation**: Activation function to use in the last layer.If ``None``,it will be same as ``activation``. 
+ + - **seed**: A Python integer to use as random seed. + """ + + def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, output_activation=None, + seed=1024, **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.output_activation = output_activation + self.seed = seed + + super(DNN, self).__init__(**kwargs) + + def build(self, input_shape): + # if len(self.hidden_units) == 0: + # raise ValueError("hidden_units is empty") + input_size = input_shape[-1] + hidden_units = [int(input_size)] + list(self.hidden_units) + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=( + hidden_units[i], hidden_units[i + 1]), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(len(self.hidden_units))] + self.bias = [self.add_weight(name='bias' + str(i), + shape=(self.hidden_units[i],), + initializer=Zeros(), + trainable=True) for i in range(len(self.hidden_units))] + if self.use_bn: + self.bn_layers = [BatchNormalization() for _ in range(len(self.hidden_units))] + + self.dropout_layers = [Dropout(self.dropout_rate, seed=self.seed + i) for i in + range(len(self.hidden_units))] + + self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] + + if self.output_activation: + self.activation_layers[-1] = activation_layer(self.output_activation) + + super(DNN, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + deep_input = inputs + + for i in range(len(self.hidden_units)): + fc = tf.nn.bias_add(tf.tensordot( + deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) + + if self.use_bn: + fc = self.bn_layers[i](fc, training=training) + try: + fc = self.activation_layers[i](fc, training=training) + except TypeError as e: # TypeError: call() got an unexpected keyword argument 'training' + print("make sure the activation function use training flag properly", e) + fc = self.activation_layers[i](fc) + + fc = self.dropout_layers[i](fc, training=training) + deep_input = fc + + return deep_input + + def compute_output_shape(self, input_shape): + if len(self.hidden_units) > 0: + shape = input_shape[:-1] + (self.hidden_units[-1],) + else: + shape = input_shape + + return tuple(shape) + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, + 'output_activation': self.output_activation, 'seed': self.seed} + base_config = super(DNN, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PredictionLayer(Layer): + """ + Arguments + - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss + + - **use_bias**: bool.Whether add bias term or not. + """ + + def __init__(self, task='binary', use_bias=True, **kwargs): + if task not in ["binary", "multiclass", "regression"]: + raise ValueError("task must be binary,multiclass or regression") + self.task = task + self.use_bias = use_bias + super(PredictionLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if self.use_bias: + self.global_bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="global_bias") + + # Be sure to call this somewhere! 
+        super(PredictionLayer, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        x = inputs
+        if self.use_bias:
+            x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC')
+        if self.task == "binary":
+            x = tf.sigmoid(x)
+
+        output = tf.reshape(x, (-1, 1))
+
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (None, 1)
+
+    def get_config(self, ):
+        config = {'task': self.task, 'use_bias': self.use_bias}
+        base_config = super(PredictionLayer, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/modelzoo/FwFM/script/layers/interaction.py b/modelzoo/FwFM/script/layers/interaction.py
new file mode 100644
index 00000000000..f19be14be9c
--- /dev/null
+++ b/modelzoo/FwFM/script/layers/interaction.py
@@ -0,0 +1,1492 @@
+# -*- coding:utf-8 -*-
+"""
+
+Authors:
+    Weichen Shen, weichenswc@163.com,
+    Harshit Pande
+
+"""
+
+import itertools
+
+import tensorflow as tf
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.backend import batch_dot
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, Ones, Constant, TruncatedNormal, \
+        glorot_normal_initializer as glorot_normal, \
+        glorot_uniform_initializer as glorot_uniform
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, Ones, Constant, TruncatedNormal, glorot_normal, glorot_uniform
+
+from tensorflow.python.keras.layers import Layer, MaxPooling2D, Conv2D, Dropout, Lambda, Dense, Flatten
+from tensorflow.python.keras.regularizers import l2
+from tensorflow.python.layers import utils
+
+from .activation import activation_layer
+from .utils import concat_func, reduce_sum, softmax, reduce_mean
+
+
+class AFMLayer(Layer):
+    """Attentional Factorization Machine models pairwise (order-2) feature
+    interactions without linear term and bias.
+
+    Input shape
+        - A list of 3D tensors with shape: ``(batch_size,1,embedding_size)``.
+
+    Output shape
+        - 2D tensor with shape: ``(batch_size, 1)``.
+
+    Arguments
+        - **attention_factor** : Positive integer, dimensionality of the
+          attention network output space.
+
+        - **l2_reg_w** : float between 0 and 1. L2 regularizer strength
+          applied to the attention network.
+
+        - **dropout_rate** : float in [0,1). Fraction of the attention net output units to dropout.
+
+        - **seed** : A Python integer to use as random seed.
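+
+    The attention weight of each pairwise interaction ``v_i * v_j``
+    (element-wise product) is computed in ``call`` below as
+    ``softmax(h^T relu(W (v_i * v_j) + b))`` over all pairs, where ``W``,
+    ``b`` and ``h`` are the ``attention_W``, ``attention_b`` and
+    ``projection_h`` weights created in ``build``.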
+ + References + - [Attentional Factorization Machines : Learning the Weight of Feature + Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf) + """ + + def __init__(self, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, **kwargs): + self.attention_factor = attention_factor + self.l2_reg_w = l2_reg_w + self.dropout_rate = dropout_rate + self.seed = seed + super(AFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + # input_shape = input_shape[0] + # if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + shape_set = set() + reduced_input_shape = [shape.as_list() for shape in input_shape] + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_input_shape[i])) + + if len(shape_set) > 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs of a list with same shape tensor like\ + (None, 1, embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + + embedding_size = int(input_shape[0][-1]) + + self.attention_W = self.add_weight(shape=(embedding_size, + self.attention_factor), initializer=glorot_normal(seed=self.seed), + regularizer=l2(self.l2_reg_w), name="attention_W") + self.attention_b = self.add_weight( + shape=(self.attention_factor,), initializer=Zeros(), name="attention_b") + self.projection_h = self.add_weight(shape=(self.attention_factor, 1), + initializer=glorot_normal(seed=self.seed), name="projection_h") + self.projection_p = self.add_weight(shape=( + embedding_size, 1), initializer=glorot_normal(seed=self.seed), name="projection_p") + self.dropout = Dropout( + self.dropout_rate, seed=self.seed) + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! 
+ super(AFMLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embeds_vec_list = inputs + row = [] + col = [] + + for r, c in itertools.combinations(embeds_vec_list, 2): + row.append(r) + col.append(c) + + p = tf.concat(row, axis=1) + q = tf.concat(col, axis=1) + inner_product = p * q + + bi_interaction = inner_product + attention_temp = tf.nn.relu(tf.nn.bias_add(tf.tensordot( + bi_interaction, self.attention_W, axes=(-1, 0)), self.attention_b)) + # Dense(self.attention_factor,'relu',kernel_regularizer=l2(self.l2_reg_w))(bi_interaction) + self.normalized_att_score = softmax(tf.tensordot( + attention_temp, self.projection_h, axes=(-1, 0)), dim=1) + attention_output = reduce_sum( + self.normalized_att_score * bi_interaction, axis=1) + + attention_output = self.dropout(attention_output, training=training) # training + + afm_out = self.tensordot([attention_output, self.projection_p]) + return afm_out + + def compute_output_shape(self, input_shape): + + if not isinstance(input_shape, list): + raise ValueError('A `AFMLayer` layer should be called ' + 'on a list of inputs.') + return (None, 1) + + def get_config(self, ): + config = {'attention_factor': self.attention_factor, + 'l2_reg_w': self.l2_reg_w, 'dropout_rate': self.dropout_rate, 'seed': self.seed} + base_config = super(AFMLayer, self).get_config() + base_config.update(config) + return base_config + + +class BiInteractionPooling(Layer): + """Bi-Interaction Layer used in Neural FM,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + References + - [He X, Chua T S. Neural factorization machines for sparse predictive analytics[C]//Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017: 355-364.](http://arxiv.org/abs/1708.05027) + """ + + def __init__(self, **kwargs): + + super(BiInteractionPooling, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + + super(BiInteractionPooling, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + concated_embeds_value = inputs + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = 0.5 * (square_of_sum - sum_of_square) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1, input_shape[-1]) + + +class CIN(Layer): + """Compressed Interaction Network used in xDeepFM.This implemention is + adapted from code that the author of the paper published on https://github.com/Leavingseason/xDeepFM. + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, featuremap_num)`` ``featuremap_num = sum(self.layer_size[:-1]) // 2 + self.layer_size[-1]`` if ``split_half=True``,else ``sum(layer_size)`` . 
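+      For example, with the default ``layer_size=(128, 128)`` and
+      ``split_half=True``, ``featuremap_num = 128 // 2 + 128 = 192``.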
+
+    Arguments
+        - **layer_size** : list of int. Feature maps in each layer.
+
+        - **activation** : activation function used on feature maps.
+
+        - **split_half** : bool. If set to True, half of the feature maps in each hidden layer connect to the
+          output unit and the other half feed the next layer; if False, every feature map does both.
+
+        - **seed** : A Python integer to use as random seed.
+
+    References
+        - [Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. arXiv preprint arXiv:1803.05170, 2018.](https://arxiv.org/pdf/1803.05170.pdf)
+    """
+
+    def __init__(self, layer_size=(128, 128), activation='relu', split_half=True, l2_reg=1e-5, seed=1024, **kwargs):
+        if len(layer_size) == 0:
+            raise ValueError(
+                "layer_size must be a list(tuple) of length at least 1")
+        self.layer_size = layer_size
+        self.split_half = split_half
+        self.activation = activation
+        self.l2_reg = l2_reg
+        self.seed = seed
+        super(CIN, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        if len(input_shape) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape)))
+
+        self.field_nums = [int(input_shape[1])]
+        self.filters = []
+        self.bias = []
+        for i, size in enumerate(self.layer_size):
+
+            self.filters.append(self.add_weight(name='filter' + str(i),
+                                                shape=[1, self.field_nums[-1]
+                                                       * self.field_nums[0], size],
+                                                dtype=tf.float32, initializer=glorot_uniform(
+                                                    seed=self.seed + i),
+                                                regularizer=l2(self.l2_reg)))
+
+            self.bias.append(self.add_weight(name='bias' + str(i), shape=[size], dtype=tf.float32,
+                                             initializer=Zeros()))
+
+            if self.split_half:
+                if i != len(self.layer_size) - 1 and size % 2 > 0:
+                    raise ValueError(
+                        "layer_size must be an even number except for the last layer when split_half=True")
+
+                self.field_nums.append(size // 2)
+            else:
+                self.field_nums.append(size)
+
+        self.activation_layers = [activation_layer(
+            self.activation) for _ in self.layer_size]
+
+        super(CIN, self).build(input_shape)  # Be sure to call this somewhere!
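+
+    # Sketch of what `call` below computes: for each of the `dim` embedding
+    # slots, it forms the outer product between the original field map
+    # (split_tensor0) and the previous hidden map, then compresses the result
+    # back to `layer_size[idx]` feature maps with the 1-D convolution filters
+    # created in `build`.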
+ + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + dim = int(inputs.get_shape()[-1]) + hidden_nn_layers = [inputs] + final_result = [] + + split_tensor0 = tf.split(hidden_nn_layers[0], dim * [1], 2) + for idx, layer_size in enumerate(self.layer_size): + split_tensor = tf.split(hidden_nn_layers[-1], dim * [1], 2) + + dot_result_m = tf.matmul( + split_tensor0, split_tensor, transpose_b=True) + + dot_result_o = tf.reshape( + dot_result_m, shape=[dim, -1, self.field_nums[0] * self.field_nums[idx]]) + + dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) + + curr_out = tf.nn.conv1d( + dot_result, filters=self.filters[idx], stride=1, padding='VALID') + + curr_out = tf.nn.bias_add(curr_out, self.bias[idx]) + + curr_out = self.activation_layers[idx](curr_out) + + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + if self.split_half: + if idx != len(self.layer_size) - 1: + next_hidden, direct_connect = tf.split( + curr_out, 2 * [layer_size // 2], 1) + else: + direct_connect = curr_out + next_hidden = 0 + else: + direct_connect = curr_out + next_hidden = curr_out + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + + result = tf.concat(final_result, axis=1) + result = reduce_sum(result, -1, keep_dims=False) + + return result + + def compute_output_shape(self, input_shape): + if self.split_half: + featuremap_num = sum( + self.layer_size[:-1]) // 2 + self.layer_size[-1] + else: + featuremap_num = sum(self.layer_size) + return (None, featuremap_num) + + def get_config(self, ): + + config = {'layer_size': self.layer_size, 'split_half': self.split_half, 'activation': self.activation, + 'seed': self.seed} + base_config = super(CIN, self).get_config() + base_config.update(config) + return base_config + + +class CrossNet(Layer): + """The Cross Network part of Deep&Cross Network model, + which leans both low and high degree cross feature. + + Input shape + - 2D tensor with shape: ``(batch_size, units)``. + + Output shape + - 2D tensor with shape: ``(batch_size, units)``. + + Arguments + - **layer_num**: Positive integer, the cross layer number + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix + + - **parameterization**: string, ``"vector"`` or ``"matrix"`` , way to parameterize the cross network. + + - **seed**: A Python integer to use as random seed. + + References + - [Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]//Proceedings of the ADKDD'17. 
ACM, 2017: 12.](https://arxiv.org/abs/1708.05123) + """ + + def __init__(self, layer_num=2, parameterization='vector', l2_reg=0, seed=1024, **kwargs): + self.layer_num = layer_num + self.parameterization = parameterization + self.l2_reg = l2_reg + self.seed = seed + print('CrossNet parameterization:', self.parameterization) + super(CrossNet, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + if self.parameterization == 'vector': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, 1), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + elif self.parameterization == 'matrix': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, dim), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNet, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + if self.parameterization == 'vector': + xl_w = tf.tensordot(x_l, self.kernels[i], axes=(1, 0)) + dot_ = tf.matmul(x_0, xl_w) + x_l = dot_ + self.bias[i] + x_l + elif self.parameterization == 'matrix': + xl_w = tf.einsum('ij,bjk->bik', self.kernels[i], x_l) # W * xi (bs, dim, 1) + dot_ = xl_w + self.bias[i] # W * xi + b + x_l = x_0 * dot_ + x_l # x0 · (W * xi + b) +xl Hadamard-product + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'layer_num': self.layer_num, 'parameterization': self.parameterization, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNet, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class CrossNetMix(Layer): + """The Cross Network part of DCN-Mix model, which improves DCN-M by: + 1 add MOE to learn feature interactions in different subspaces + 2 add nonlinear transformations in low-dimensional space + + Input shape + - 2D tensor with shape: ``(batch_size, units)``. + + Output shape + - 2D tensor with shape: ``(batch_size, units)``. + + Arguments + - **low_rank** : Positive integer, dimensionality of low-rank sapce. + + - **num_experts** : Positive integer, number of experts. + + - **layer_num**: Positive integer, the cross layer number + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix + + - **seed**: A Python integer to use as random seed. + + References + - [Wang R, Shivanna R, Cheng D Z, et al. DCN-M: Improved Deep & Cross Network for Feature Cross Learning in Web-scale Learning to Rank Systems[J]. 
2020.](https://arxiv.org/abs/2008.13535) + """ + + def __init__(self, low_rank=32, num_experts=4, layer_num=2, l2_reg=0, seed=1024, **kwargs): + self.low_rank = low_rank + self.num_experts = num_experts + self.layer_num = layer_num + self.l2_reg = l2_reg + self.seed = seed + super(CrossNetMix, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + + # U: (dim, low_rank) + self.U_list = [self.add_weight(name='U_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # V: (dim, low_rank) + self.V_list = [self.add_weight(name='V_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # C: (low_rank, low_rank) + self.C_list = [self.add_weight(name='C_list' + str(i), + shape=(self.num_experts, self.low_rank, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + + self.gating = [Dense(1, use_bias=False) for i in range(self.num_experts)] + + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNetMix, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + output_of_experts = [] + gating_score_of_experts = [] + for expert_id in range(self.num_experts): + # (1) G(x_l) + # compute the gating score by x_l + gating_score_of_experts.append(self.gating[expert_id](tf.squeeze(x_l, axis=2))) + + # (2) E(x_l) + # project the input x_l to $\mathbb{R}^{r}$ + v_x = tf.einsum('ij,bjk->bik', tf.transpose(self.V_list[i][expert_id]), x_l) # (bs, low_rank, 1) + + # nonlinear activation in low rank space + v_x = tf.nn.tanh(v_x) + v_x = tf.einsum('ij,bjk->bik', self.C_list[i][expert_id], v_x) # (bs, low_rank, 1) + v_x = tf.nn.tanh(v_x) + + # project back to $\mathbb{R}^{d}$ + uv_x = tf.einsum('ij,bjk->bik', self.U_list[i][expert_id], v_x) # (bs, dim, 1) + + dot_ = uv_x + self.bias[i] + dot_ = x_0 * dot_ # Hadamard-product + + output_of_experts.append(tf.squeeze(dot_, axis=2)) + + # (3) mixture of low-rank experts + output_of_experts = tf.stack(output_of_experts, 2) # (bs, dim, num_experts) + gating_score_of_experts = tf.stack(gating_score_of_experts, 1) # (bs, num_experts, 1) + moe_out = tf.matmul(output_of_experts, tf.nn.softmax(gating_score_of_experts, 1)) + x_l = moe_out + x_l # (bs, dim, 1) + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'low_rank': self.low_rank, 'num_experts': self.num_experts, 'layer_num': self.layer_num, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNetMix, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class FM(Layer): + """Factorization Machine models pairwise (order-2) feature interactions + without linear term and bias. 
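+
+    The pairwise interaction term is computed in ``call`` below with the
+    usual O(N*k) reformulation, "square of sum minus sum of squares" over
+    the field axis: ``0.5 * sum_k((sum_i v_{i,k})**2 - sum_i v_{i,k}**2)``.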
+ + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. + + References + - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) + """ + + def __init__(self, **kwargs): + + super(FM, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + super(FM, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + concated_embeds_value = inputs + + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = square_of_sum - sum_of_square + cross_term = 0.5 * reduce_sum(cross_term, axis=2, keep_dims=False) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1) + + + +class InnerProductLayer(Layer): + """InnerProduct Layer used in PNN that compute the element-wise + product or inner product between feature vectors. + + Input shape + - a list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size, N*(N-1)/2 ,1)`` if use reduce_sum. or 3D tensor with shape: ``(batch_size, N*(N-1)/2, embedding_size )`` if not use reduce_sum. + + Arguments + - **reduce_sum**: bool. Whether return inner product or element-wise product + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf) + """ + + def __init__(self, reduce_sum=True, **kwargs): + self.reduce_sum = reduce_sum + super(InnerProductLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `InnerProductLayer` layer should be called ' + 'on a list of at least 2 inputs') + + reduced_inputs_shapes = [shape.as_list() for shape in input_shape] + shape_set = set() + + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_inputs_shapes[i])) + + if len(shape_set) > 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs of a list with same shape tensor like (None,1,embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + super(InnerProductLayer, self).build( + input_shape) # Be sure to call this somewhere! 
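+
+    # `call` below enumerates all N*(N-1)/2 field pairs (i, j), concatenates
+    # the left/right embeddings into p and q, and multiplies them
+    # element-wise; with reduce_sum=True the embedding axis is then summed,
+    # yielding one inner product per pair.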
+ + def call(self, inputs, **kwargs): + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embed_list = inputs + row = [] + col = [] + num_inputs = len(embed_list) + + for i in range(num_inputs - 1): + for j in range(i + 1, num_inputs): + row.append(i) + col.append(j) + p = tf.concat([embed_list[idx] + for idx in row], axis=1) # batch num_pairs k + q = tf.concat([embed_list[idx] + for idx in col], axis=1) + + inner_product = p * q + if self.reduce_sum: + inner_product = reduce_sum( + inner_product, axis=2, keep_dims=True) + return inner_product + + def compute_output_shape(self, input_shape): + num_inputs = len(input_shape) + num_pairs = int(num_inputs * (num_inputs - 1) / 2) + input_shape = input_shape[0] + embed_size = input_shape[-1] + if self.reduce_sum: + return (input_shape[0], num_pairs, 1) + else: + return (input_shape[0], num_pairs, embed_size) + + def get_config(self, ): + config = {'reduce_sum': self.reduce_sum, } + base_config = super(InnerProductLayer, self).get_config() + base_config.update(config) + return base_config + + +class InteractingLayer(Layer): + """A Layer used in AutoInt that model the correlations between different feature fields by multi-head self-attention mechanism. + + Input shape + - A 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape:``(batch_size,field_size,att_embedding_size * head_num)``. + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **use_res**: bool.Whether or not use standard residual connections before output. + - **seed**: A Python integer to use as random seed. + + References + - [Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.](https://arxiv.org/abs/1810.11921) + """ + + def __init__(self, att_embedding_size=8, head_num=2, use_res=True, scaling=False, seed=1024, **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.use_res = use_res + self.seed = seed + self.scaling = scaling + super(InteractingLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + embedding_size = int(input_shape[-1]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.use_res: + self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + + # Be sure to call this somewhere! 
+ super(InteractingLayer, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + querys = tf.tensordot(inputs, self.W_Query, + axes=(-1, 0)) # None F D*head_num + keys = tf.tensordot(inputs, self.W_key, axes=(-1, 0)) + values = tf.tensordot(inputs, self.W_Value, axes=(-1, 0)) + + # head_num None F D + querys = tf.stack(tf.split(querys, self.head_num, axis=2)) + keys = tf.stack(tf.split(keys, self.head_num, axis=2)) + values = tf.stack(tf.split(values, self.head_num, axis=2)) + + inner_product = tf.matmul( + querys, keys, transpose_b=True) # head_num None F F + if self.scaling: + inner_product /= self.att_embedding_size ** 0.5 + self.normalized_att_scores = softmax(inner_product) + + result = tf.matmul(self.normalized_att_scores, + values) # head_num None F D + result = tf.concat(tf.split(result, self.head_num, ), axis=-1) + result = tf.squeeze(result, axis=0) # None F D*head_num + + if self.use_res: + result += tf.tensordot(inputs, self.W_Res, axes=(-1, 0)) + result = tf.nn.relu(result) + + return result + + def compute_output_shape(self, input_shape): + + return (None, input_shape[1], self.att_embedding_size * self.head_num) + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, 'use_res': self.use_res, + 'seed': self.seed} + base_config = super(InteractingLayer, self).get_config() + base_config.update(config) + return base_config + + +class OutterProductLayer(Layer): + """OutterProduct Layer used in PNN.This implemention is + adapted from code that the author of the paper published on https://github.com/Atomu2014/product-nets. + + Input shape + - A list of N 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 2D tensor with shape:``(batch_size,N*(N-1)/2 )``. + + Arguments + - **kernel_type**: str. The kernel weight matrix type to use,can be mat,vec or num + + - **seed**: A Python integer to use as random seed. + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. 
IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf) + """ + + def __init__(self, kernel_type='mat', seed=1024, **kwargs): + if kernel_type not in ['mat', 'vec', 'num']: + raise ValueError("kernel_type must be mat,vec or num") + self.kernel_type = kernel_type + self.seed = seed + super(OutterProductLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `OutterProductLayer` layer should be called ' + 'on a list of at least 2 inputs') + + reduced_inputs_shapes = [shape.as_list() for shape in input_shape] + shape_set = set() + + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_inputs_shapes[i])) + + if len(shape_set) > 1: + raise ValueError('A `OutterProductLayer` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `OutterProductLayer` layer requires ' + 'inputs of a list with same shape tensor like (None,1,embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + num_inputs = len(input_shape) + num_pairs = int(num_inputs * (num_inputs - 1) / 2) + input_shape = input_shape[0] + embed_size = int(input_shape[-1]) + if self.kernel_type == 'mat': + + self.kernel = self.add_weight(shape=(embed_size, num_pairs, embed_size), + initializer=glorot_uniform( + seed=self.seed), + name='kernel') + elif self.kernel_type == 'vec': + self.kernel = self.add_weight(shape=(num_pairs, embed_size,), initializer=glorot_uniform(self.seed), + name='kernel' + ) + elif self.kernel_type == 'num': + self.kernel = self.add_weight( + shape=(num_pairs, 1), initializer=glorot_uniform(self.seed), name='kernel') + + super(OutterProductLayer, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embed_list = inputs + row = [] + col = [] + num_inputs = len(embed_list) + for i in range(num_inputs - 1): + for j in range(i + 1, num_inputs): + row.append(i) + col.append(j) + p = tf.concat([embed_list[idx] + for idx in row], axis=1) # batch num_pairs k + # Reshape([num_pairs, self.embedding_size]) + q = tf.concat([embed_list[idx] for idx in col], axis=1) + + # ------------------------- + if self.kernel_type == 'mat': + p = tf.expand_dims(p, 1) + # k k* pair* k + # batch * pair + kp = reduce_sum( + + # batch * pair * k + + tf.multiply( + + # batch * pair * k + + tf.transpose( + + # batch * k * pair + + reduce_sum( + + # batch * k * pair * k + + tf.multiply( + + p, self.kernel), + + -1), + + [0, 2, 1]), + + q), + + -1) + else: + # 1 * pair * (k or 1) + + k = tf.expand_dims(self.kernel, 0) + + # batch * pair + + kp = reduce_sum(p * q * k, -1) + + # p q # b * p * k + + return kp + + def compute_output_shape(self, input_shape): + num_inputs = len(input_shape) + num_pairs = int(num_inputs * (num_inputs - 1) / 2) + return (None, num_pairs) + + def get_config(self, ): + config = {'kernel_type': self.kernel_type, 'seed': self.seed} + base_config = super(OutterProductLayer, self).get_config() + base_config.update(config) + return base_config + + +class FGCNNLayer(Layer): + """Feature Generation Layer used in FGCNN,including Convolution,MaxPooling and Recombination. + + Input shape + - A 3D tensor with shape:``(batch_size,field_size,embedding_size)``. 
+
+    Output shape
+      - 3D tensor with shape: ``(batch_size,new_feature_num,embedding_size)``.
+
+    References
+      - [Liu B, Tang R, Chen Y, et al. Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction[J]. arXiv preprint arXiv:1904.04447, 2019.](https://arxiv.org/pdf/1904.04447)
+
+    """
+
+    def __init__(self, filters=(14, 16,), kernel_width=(7, 7,), new_maps=(3, 3,), pooling_width=(2, 2),
+                 **kwargs):
+        if not (len(filters) == len(kernel_width) == len(new_maps) == len(pooling_width)):
+            raise ValueError("filters, kernel_width, new_maps and pooling_width must have the same length")
+        self.filters = filters
+        self.kernel_width = kernel_width
+        self.new_maps = new_maps
+        self.pooling_width = pooling_width
+
+        super(FGCNNLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+
+        if len(input_shape) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape)))
+        self.conv_layers = []
+        self.pooling_layers = []
+        self.dense_layers = []
+        pooling_shape = input_shape.as_list() + [1, ]
+        embedding_size = int(input_shape[-1])
+        for i in range(1, len(self.filters) + 1):
+            filters = self.filters[i - 1]
+            width = self.kernel_width[i - 1]
+            new_filters = self.new_maps[i - 1]
+            pooling_width = self.pooling_width[i - 1]
+            conv_output_shape = self._conv_output_shape(
+                pooling_shape, (width, 1))
+            pooling_shape = self._pooling_output_shape(
+                conv_output_shape, (pooling_width, 1))
+            self.conv_layers.append(Conv2D(filters=filters, kernel_size=(width, 1), strides=(1, 1),
+                                           padding='same',
+                                           activation='tanh', use_bias=True, ))
+            self.pooling_layers.append(
+                MaxPooling2D(pool_size=(pooling_width, 1)))
+            self.dense_layers.append(Dense(pooling_shape[1] * embedding_size * new_filters,
+                                           activation='tanh', use_bias=True))
+
+        self.flatten = Flatten()
+
+        super(FGCNNLayer, self).build(
+            input_shape)  # Be sure to call this somewhere!
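+
+    # Shape bookkeeping sketch (assumes the default arguments and a
+    # hypothetical (None, 20, 8) input): the 'same'-padded conv keeps 20
+    # rows, MaxPooling2D((2, 1)) halves them to 10, so the first Dense
+    # emits 10 * 8 * 3 = 240 units that call() reshapes into 30 new
+    # features of size 8; stage two pools 10 -> 5 and adds 15 more,
+    # matching the arithmetic in compute_output_shape below.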
+ + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embedding_size = int(inputs.shape[-1]) + pooling_result = tf.expand_dims(inputs, axis=3) + + new_feature_list = [] + + for i in range(1, len(self.filters) + 1): + new_filters = self.new_maps[i - 1] + + conv_result = self.conv_layers[i - 1](pooling_result) + + pooling_result = self.pooling_layers[i - 1](conv_result) + + flatten_result = self.flatten(pooling_result) + + new_result = self.dense_layers[i - 1](flatten_result) + + new_feature_list.append( + tf.reshape(new_result, (-1, int(pooling_result.shape[1]) * new_filters, embedding_size))) + + new_features = concat_func(new_feature_list, axis=1) + return new_features + + def compute_output_shape(self, input_shape): + + new_features_num = 0 + features_num = input_shape[1] + + for i in range(0, len(self.kernel_width)): + pooled_features_num = features_num // self.pooling_width[i] + new_features_num += self.new_maps[i] * pooled_features_num + features_num = pooled_features_num + + return (None, new_features_num, input_shape[-1]) + + def get_config(self, ): + config = {'kernel_width': self.kernel_width, 'filters': self.filters, 'new_maps': self.new_maps, + 'pooling_width': self.pooling_width} + base_config = super(FGCNNLayer, self).get_config() + base_config.update(config) + return base_config + + def _conv_output_shape(self, input_shape, kernel_size): + # channels_last + space = input_shape[1:-1] + new_space = [] + for i in range(len(space)): + new_dim = utils.conv_output_length( + space[i], + kernel_size[i], + padding='same', + stride=1, + dilation=1) + new_space.append(new_dim) + return ([input_shape[0]] + new_space + [self.filters]) + + def _pooling_output_shape(self, input_shape, pool_size): + # channels_last + + rows = input_shape[1] + cols = input_shape[2] + rows = utils.conv_output_length(rows, pool_size[0], 'valid', + pool_size[0]) + cols = utils.conv_output_length(cols, pool_size[1], 'valid', + pool_size[1]) + return [input_shape[0], rows, cols, input_shape[3]] + + +class SENETLayer(Layer): + """SENETLayer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Arguments + - **reduction_ratio** : Positive integer, dimensionality of the + attention network output space. + + - **seed** : A Python integer to use as random seed. 
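+
+        The layer squeezes each field embedding to its mean, passes the
+        resulting field-level vector through two ReLU projections (W_1,
+        then W_2), and rescales every field embedding by its attention
+        weight, exactly the Z / A_1 / A_2 / V steps in ``call`` below.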
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + """ + + def __init__(self, reduction_ratio=3, seed=1024, **kwargs): + self.reduction_ratio = reduction_ratio + + self.seed = seed + super(SENETLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + self.filed_size = len(input_shape) + self.embedding_size = input_shape[0][-1] + reduction_size = max(1, self.filed_size // self.reduction_ratio) + + self.W_1 = self.add_weight(shape=( + self.filed_size, reduction_size), initializer=glorot_normal(seed=self.seed), name="W_1") + self.W_2 = self.add_weight(shape=( + reduction_size, self.filed_size), initializer=glorot_normal(seed=self.seed), name="W_2") + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! + super(SENETLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + inputs = concat_func(inputs, axis=1) + Z = reduce_mean(inputs, axis=-1, ) + + A_1 = tf.nn.relu(self.tensordot([Z, self.W_1])) + A_2 = tf.nn.relu(self.tensordot([A_1, self.W_2])) + V = tf.multiply(inputs, tf.expand_dims(A_2, axis=2)) + + return tf.split(V, self.filed_size, axis=1) + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return [None] * self.filed_size + + def get_config(self, ): + config = {'reduction_ratio': self.reduction_ratio, 'seed': self.seed} + base_config = super(SENETLayer, self).get_config() + base_config.update(config) + return base_config + + +class BilinearInteraction(Layer): + """BilinearInteraction Layer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. Its length is ``filed_size``. + + Output shape + - 3D tensor with shape: ``(batch_size,filed_size*(filed_size-1)/2,embedding_size)``. + + Arguments + - **bilinear_type** : String, types of bilinear functions used in this layer. + + - **seed** : A Python integer to use as random seed. 
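+
+        With ``all`` a single weight W is shared across every field pair,
+        ``each`` learns one W_i per field, and ``interaction`` learns one
+        W_ij per field pair; in every case each pair contributes the
+        element-wise product ``(v_i . W) * v_j``, as in ``call`` below.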
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + + """ + + def __init__(self, bilinear_type="interaction", seed=1024, **kwargs): + self.bilinear_type = bilinear_type + self.seed = seed + + super(BilinearInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + embedding_size = int(input_shape[0][-1]) + + if self.bilinear_type == "all": + self.W = self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight") + elif self.bilinear_type == "each": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i)) for i in range(len(input_shape) - 1)] + elif self.bilinear_type == "interaction": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i) + '_' + str(j)) for i, j in + itertools.combinations(range(len(input_shape)), 2)] + else: + raise NotImplementedError + + super(BilinearInteraction, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + n = len(inputs) + if self.bilinear_type == "all": + vidots = [tf.tensordot(inputs[i], self.W, axes=(-1, 0)) for i in range(n)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "each": + vidots = [tf.tensordot(inputs[i], self.W_list[i], axes=(-1, 0)) for i in range(n - 1)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "interaction": + p = [tf.multiply(tf.tensordot(v[0], w, axes=(-1, 0)), v[1]) + for v, w in zip(itertools.combinations(inputs, 2), self.W_list)] + else: + raise NotImplementedError + output = concat_func(p, axis=1) + return output + + def compute_output_shape(self, input_shape): + filed_size = len(input_shape) + embedding_size = input_shape[0][-1] + + return (None, filed_size * (filed_size - 1) // 2, embedding_size) + + def get_config(self, ): + config = {'bilinear_type': self.bilinear_type, 'seed': self.seed} + base_config = super(BilinearInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FieldWiseBiInteraction(Layer): + """Field-Wise Bi-Interaction Layer used in FLEN,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A list of 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size,embedding_size)``. + + Arguments + - **use_bias** : Boolean, if use bias. + - **seed** : A Python integer to use as random seed. 
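+
+        The output is the sum of two parts: an MF-style term built from
+        pairwise products of the per-field summed vectors (weighted by
+        ``kernel_mf``) and an FM-style square-of-sum minus sum-of-square
+        term computed within each field (weighted by ``kernel_fm``); see
+        the MF/FM modules in ``call`` below.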
+ + References + - [FLEN: Leveraging Field for Scalable CTR Prediction](https://arxiv.org/pdf/1911.04690) + + """ + + def __init__(self, use_bias=True, seed=1024, **kwargs): + self.use_bias = use_bias + self.seed = seed + + super(FieldWiseBiInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError( + 'A `Field-Wise Bi-Interaction` layer should be called ' + 'on a list of at least 2 inputs') + + self.num_fields = len(input_shape) + embedding_size = input_shape[0][-1] + + self.kernel_mf = self.add_weight( + name='kernel_mf', + shape=(int(self.num_fields * (self.num_fields - 1) / 2), 1), + initializer=Ones(), + regularizer=None, + trainable=True) + + self.kernel_fm = self.add_weight( + name='kernel_fm', + shape=(self.num_fields, 1), + initializer=Constant(value=0.5), + regularizer=None, + trainable=True) + if self.use_bias: + self.bias_mf = self.add_weight(name='bias_mf', + shape=(embedding_size), + initializer=Zeros()) + self.bias_fm = self.add_weight(name='bias_fm', + shape=(embedding_size), + initializer=Zeros()) + + super(FieldWiseBiInteraction, + self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % + (K.ndim(inputs))) + + field_wise_embeds_list = inputs + + # MF module + field_wise_vectors = tf.concat([ + reduce_sum(field_i_vectors, axis=1, keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ], 1) + + left = [] + right = [] + + for i, j in itertools.combinations(list(range(self.num_fields)), 2): + left.append(i) + right.append(j) + + embeddings_left = tf.gather(params=field_wise_vectors, + indices=left, + axis=1) + embeddings_right = tf.gather(params=field_wise_vectors, + indices=right, + axis=1) + + embeddings_prod = embeddings_left * embeddings_right + field_weighted_embedding = embeddings_prod * self.kernel_mf + h_mf = reduce_sum(field_weighted_embedding, axis=1) + if self.use_bias: + h_mf = tf.nn.bias_add(h_mf, self.bias_mf) + + # FM module + square_of_sum_list = [ + tf.square(reduce_sum(field_i_vectors, axis=1, keep_dims=True)) + for field_i_vectors in field_wise_embeds_list + ] + sum_of_square_list = [ + reduce_sum(field_i_vectors * field_i_vectors, + axis=1, + keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ] + + field_fm = tf.concat([ + square_of_sum - sum_of_square for square_of_sum, sum_of_square in + zip(square_of_sum_list, sum_of_square_list) + ], 1) + + h_fm = reduce_sum(field_fm * self.kernel_fm, axis=1) + if self.use_bias: + h_fm = tf.nn.bias_add(h_fm, self.bias_fm) + + return h_mf + h_fm + + def compute_output_shape(self, input_shape): + return (None, input_shape[0][-1]) + + def get_config(self, ): + config = {'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(FieldWiseBiInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FwFMLayer(Layer): + """Field-weighted Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. 
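+
+    FwFM scores every field pair with a learned scalar strength
+    ``r_{i,j}``; the logit is ``sum_{i<j} r_{i,j} * <v_i, v_j>``, which is
+    what the pairwise loop in ``call`` below computes.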
+ + Arguments + - **num_fields** : integer for number of fields + - **regularizer** : L2 regularizer weight for the field strength parameters of PNN + + References + - [Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising] + https://arxiv.org/pdf/1806.03514.pdf + """ + + def __init__(self, num_fields=4, regularizer=0.000001, **kwargs): + self.num_fields = num_fields + self.regularizer = regularizer + super(FwFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + if input_shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, input_shape[1])) + + self.field_strengths = self.add_weight(name='field_pair_strengths', + shape=(self.num_fields, self.num_fields), + initializer=TruncatedNormal(), + regularizer=l2(self.regularizer), + trainable=True) + + super(FwFMLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + if inputs.shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, inputs.shape[1])) + + pairwise_inner_prods = [] + for fi, fj in itertools.combinations(range(self.num_fields), 2): + # get field strength for pair fi and fj + r_ij = self.field_strengths[fi, fj] + + # get embeddings for the features of both the fields + feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1) + feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1) + + f = tf.scalar_mul(r_ij, batch_dot(feat_embed_i, feat_embed_j, axes=1)) + pairwise_inner_prods.append(f) + + sum_ = tf.add_n(pairwise_inner_prods) + return sum_ + + def compute_output_shape(self, input_shape): + return (None, 1) + + def get_config(self): + config = super(FwFMLayer, self).get_config().copy() + config.update({ + 'num_fields': self.num_fields, + 'regularizer': self.regularizer + }) + return config + + +class FEFMLayer(Layer): + """Field-Embedded Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. 
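+
+    Each field pair (i, j) owns a matrix embedding ``W_ij``; the layer
+    emits one interaction ``v_i^T (W_ij + W_ij^T) v_j`` per pair, matching
+    the symmetrized matmul in ``call`` below.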
+
+    Output shape
+      - 2D tensor with shape:
+      ``(batch_size, (num_fields * (num_fields-1))/2)`` # concatenated FEFM interaction embeddings
+
+    Arguments
+        - **regularizer** : L2 regularizer weight for the field pair matrix embeddings parameters of FEFM
+
+    References
+        - [Field-Embedded Factorization Machines for Click-through Rate Prediction](https://arxiv.org/pdf/2009.09931.pdf)
+    """
+
+    def __init__(self, regularizer, **kwargs):
+        self.regularizer = regularizer
+        super(FEFMLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        if len(input_shape) != 3:
+            raise ValueError("Unexpected inputs dimensions %d, "
+                             "expect to be 3 dimensions" % (len(input_shape)))
+
+        self.num_fields = int(input_shape[1])
+        embedding_size = int(input_shape[2])
+
+        self.field_embeddings = {}
+        for fi, fj in itertools.combinations(range(self.num_fields), 2):
+            field_pair_id = str(fi) + "-" + str(fj)
+            self.field_embeddings[field_pair_id] = self.add_weight(name='field_embeddings' + field_pair_id,
+                                                                   shape=(embedding_size, embedding_size),
+                                                                   initializer=TruncatedNormal(),
+                                                                   regularizer=l2(self.regularizer),
+                                                                   trainable=True)
+
+        super(FEFMLayer, self).build(input_shape)  # Be sure to call this somewhere!
+
+    def call(self, inputs, **kwargs):
+        if K.ndim(inputs) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions"
+                % (K.ndim(inputs)))
+
+        pairwise_inner_prods = []
+        for fi, fj in itertools.combinations(range(self.num_fields), 2):
+            field_pair_id = str(fi) + "-" + str(fj)
+            feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1)
+            feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1)
+            field_pair_embed_ij = self.field_embeddings[field_pair_id]
+
+            feat_embed_i_tr = tf.matmul(feat_embed_i, field_pair_embed_ij + tf.transpose(field_pair_embed_ij))
+
+            f = batch_dot(feat_embed_i_tr, feat_embed_j, axes=1)
+            pairwise_inner_prods.append(f)
+
+        concat_vec = tf.concat(pairwise_inner_prods, axis=1)
+        return concat_vec
+
+    def compute_output_shape(self, input_shape):
+        num_fields = int(input_shape[1])
+        # integer division so the inferred dimension is an int, not a float
+        return (None, (num_fields * (num_fields - 1)) // 2)
+
+    def get_config(self):
+        config = super(FEFMLayer, self).get_config().copy()
+        config.update({
+            'regularizer': self.regularizer,
+        })
+        return config
diff --git a/modelzoo/FwFM/script/layers/normalization.py b/modelzoo/FwFM/script/layers/normalization.py
new file mode 100644
index 00000000000..3fceb1257d8
--- /dev/null
+++ b/modelzoo/FwFM/script/layers/normalization.py
@@ -0,0 +1,51 @@
+# -*- coding:utf-8 -*-
+"""
+
+Author:
+    Weichen Shen,weichenswc@163.com
+
+"""
+
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.layers import Layer
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, Ones
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, Ones
+
+
+class LayerNormalization(Layer):
+    def __init__(self, axis=-1, eps=1e-9, center=True,
+                 scale=True, **kwargs):
+        self.axis = axis
+        self.eps = eps
+        self.center = center
+        self.scale = scale
+        super(LayerNormalization, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
+                                     initializer=Ones(), trainable=True)
+        self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
+                                    initializer=Zeros(), trainable=True)
+        super(LayerNormalization, self).build(input_shape)
+
+    def call(self, inputs):
+        mean = K.mean(inputs, axis=self.axis, keepdims=True)
+        # use self.axis for the variance too (it was hard-coded to -1)
+        variance = K.mean(K.square(inputs - mean), axis=self.axis, keepdims=True)
+        std = K.sqrt(variance + self.eps)
+        outputs = (inputs - mean) / std
+        if self.scale:
+            outputs *= self.gamma
+        if self.center:
+            outputs += self.beta
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+    def get_config(self, ):
+        config = {'axis': self.axis, 'eps': self.eps, 'center': self.center, 'scale': self.scale}
+        base_config = super(LayerNormalization, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/modelzoo/FwFM/script/layers/sequence.py b/modelzoo/FwFM/script/layers/sequence.py
new file mode 100644
index 00000000000..45a65915c22
--- /dev/null
+++ b/modelzoo/FwFM/script/layers/sequence.py
@@ -0,0 +1,901 @@
+# -*- coding:utf-8 -*-
+"""
+
+Author:
+    Weichen Shen,weichenswc@163.com
+
+"""
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.keras import backend as K
+
+try:
+    from tensorflow.python.ops.init_ops import TruncatedNormal, glorot_uniform_initializer as glorot_uniform, \
+        identity_initializer as identity
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import TruncatedNormal, glorot_uniform, identity
+
+from tensorflow.python.keras.layers import LSTM, Lambda, Layer, Dropout
+
+from .core import LocalActivationUnit
+from .normalization import LayerNormalization
+
+if tf.__version__ >= '2.0.0':
+    from ..contrib.rnn_v2 import dynamic_rnn
+else:
+    from ..contrib.rnn import dynamic_rnn
+from ..contrib.utils import QAAttGRUCell, VecAttGRUCell
+from .utils import reduce_sum, reduce_max, div, softmax, reduce_mean
+
+
+class SequencePoolingLayer(Layer):
+    """The SequencePoolingLayer is used to apply a pooling operation (sum, mean or max) on variable-length sequence features/multi-value features.
+
+      Input shape
+        - A list of two tensors [seq_value, seq_len]
+
+        - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)``
+
+        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``, indicating the valid length of each sequence.
+
+      Output shape
+        - 3D tensor with shape: ``(batch_size, 1, embedding_size)``.
+
+      Arguments
+        - **mode**: str. Pooling operation to be used, can be sum, mean or max.
+
+        - **supports_masking**: If True, the input needs to support masking.
+    """
+
+    def __init__(self, mode='mean', supports_masking=False, **kwargs):
+
+        if mode not in ['sum', 'mean', 'max']:
+            raise ValueError("mode must be sum, mean or max")
+        self.mode = mode
+        self.eps = tf.constant(1e-8, tf.float32)
+        super(SequencePoolingLayer, self).__init__(**kwargs)
+
+        self.supports_masking = supports_masking
+
+    def build(self, input_shape):
+        if not self.supports_masking:
+            self.seq_len_max = int(input_shape[0][1])
+        super(SequencePoolingLayer, self).build(
+            input_shape)  # Be sure to call this somewhere!
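+
+    # Minimal usage sketch (illustrative shapes): pool a padded behavior
+    # sequence of up to T=10 items into a single vector.
+    #
+    #   seq = tf.keras.layers.Input(shape=(10, 8))   # (batch_size, T, emb)
+    #   seq_len = tf.keras.layers.Input(shape=(1,))  # valid length per sample
+    #   pooled = SequencePoolingLayer(mode='mean')([seq, seq_len])
+    #   # pooled shape: (batch_size, 1, 8)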
+ + def call(self, seq_value_len_list, mask=None, **kwargs): + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + uiseq_embed_list = seq_value_len_list + mask = tf.cast(mask, tf.float32) # tf.to_float(mask) + user_behavior_length = reduce_sum(mask, axis=-1, keep_dims=True) + mask = tf.expand_dims(mask, axis=2) + else: + uiseq_embed_list, user_behavior_length = seq_value_len_list + + mask = tf.sequence_mask(user_behavior_length, + self.seq_len_max, dtype=tf.float32) + mask = tf.transpose(mask, (0, 2, 1)) + + embedding_size = uiseq_embed_list.shape[-1] + + mask = tf.tile(mask, [1, 1, embedding_size]) + + if self.mode == "max": + hist = uiseq_embed_list - (1 - mask) * 1e9 + return reduce_max(hist, 1, keep_dims=True) + + hist = reduce_sum(uiseq_embed_list * mask, 1, keep_dims=False) + + if self.mode == "mean": + hist = div(hist, tf.cast(user_behavior_length, tf.float32) + self.eps) + + hist = tf.expand_dims(hist, axis=1) + return hist + + def compute_output_shape(self, input_shape): + if self.supports_masking: + return (None, 1, input_shape[-1]) + else: + return (None, 1, input_shape[0][-1]) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + config = {'mode': self.mode, 'supports_masking': self.supports_masking} + base_config = super(SequencePoolingLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class WeightedSequenceLayer(Layer): + """The WeightedSequenceLayer is used to apply weight score on variable-length sequence feature/multi-value feature. + + Input shape + - A list of two tensor [seq_value,seq_len,seq_weight] + + - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence. + + - seq_weight is a 3D tensor with shape: ``(batch_size, T, 1)`` + + Output shape + - 3D tensor with shape: ``(batch_size, T, embedding_size)``. + + Arguments + - **weight_normalization**: bool.Whether normalize the weight score before applying to sequence. + + - **supports_masking**:If True,the input need to support masking. + """ + + def __init__(self, weight_normalization=True, supports_masking=False, **kwargs): + super(WeightedSequenceLayer, self).__init__(**kwargs) + self.weight_normalization = weight_normalization + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + self.seq_len_max = int(input_shape[0][1]) + super(WeightedSequenceLayer, self).build( + input_shape) # Be sure to call this somewhere! 
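+
+    # Minimal usage sketch (illustrative shapes): rescale each valid step
+    # by a per-step score before any downstream pooling.
+    #
+    #   seq = tf.keras.layers.Input(shape=(10, 8))
+    #   seq_len = tf.keras.layers.Input(shape=(1,))
+    #   score = tf.keras.layers.Input(shape=(10, 1))
+    #   out = WeightedSequenceLayer()([seq, seq_len, score])
+    #   # out shape: (batch_size, 10, 8)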
+ + def call(self, input_list, mask=None, **kwargs): + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + key_input, value_input = input_list + mask = tf.expand_dims(mask[0], axis=2) + else: + key_input, key_length_input, value_input = input_list + mask = tf.sequence_mask(key_length_input, + self.seq_len_max, dtype=tf.bool) + mask = tf.transpose(mask, (0, 2, 1)) + + embedding_size = key_input.shape[-1] + + if self.weight_normalization: + paddings = tf.ones_like(value_input) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(value_input) + value_input = tf.where(mask, value_input, paddings) + + if self.weight_normalization: + value_input = softmax(value_input, dim=1) + + if len(value_input.shape) == 2: + value_input = tf.expand_dims(value_input, axis=2) + value_input = tf.tile(value_input, [1, 1, embedding_size]) + + return tf.multiply(key_input, value_input) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + def compute_mask(self, inputs, mask): + if self.supports_masking: + return mask[0] + else: + return None + + def get_config(self, ): + config = {'weight_normalization': self.weight_normalization, 'supports_masking': self.supports_masking} + base_config = super(WeightedSequenceLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class AttentionSequencePoolingLayer(Layer): + """The Attentional sequence pooling operation used in DIN. + + Input shape + - A list of three tensor: [query,keys,keys_length] + + - query is a 3D tensor with shape: ``(batch_size, 1, embedding_size)`` + + - keys is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - keys_length is a 2D tensor with shape: ``(batch_size, 1)`` + + Output shape + - 3D tensor with shape: ``(batch_size, 1, embedding_size)``. + + Arguments + - **att_hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **att_activation**: Activation function to use in attention net. + + - **weight_normalization**: bool.Whether normalize the attention score of local activation unit. + + - **supports_masking**:If True,the input need to support masking. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, att_hidden_units=(80, 40), att_activation='sigmoid', weight_normalization=False, + return_score=False, + supports_masking=False, **kwargs): + + self.att_hidden_units = att_hidden_units + self.att_activation = att_activation + self.weight_normalization = weight_normalization + self.return_score = return_score + super(AttentionSequencePoolingLayer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + if not isinstance(input_shape, list) or len(input_shape) != 3: + raise ValueError('A `AttentionSequencePoolingLayer` layer should be called ' + 'on a list of 3 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3 or len(input_shape[2]) != 2: + raise ValueError( + "Unexpected inputs dimensions,the 3 tensor dimensions are %d,%d and %d , expect to be 3,3 and 2" % ( + len(input_shape[0]), len(input_shape[1]), len(input_shape[2]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1 or input_shape[2][1] != 1: + raise ValueError('A `AttentionSequencePoolingLayer` layer requires ' + 'inputs of a 3 tensor with shape (None,1,embedding_size),(None,T,embedding_size) and (None,1)' + 'Got different shapes: %s' % (input_shape)) + else: + pass + self.local_att = LocalActivationUnit( + self.att_hidden_units, self.att_activation, l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, ) + super(AttentionSequencePoolingLayer, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, mask=None, training=None, **kwargs): + + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + queries, keys = inputs + key_masks = tf.expand_dims(mask[-1], axis=1) + + else: + + queries, keys, keys_length = inputs + hist_len = keys.get_shape()[1] + key_masks = tf.sequence_mask(keys_length, hist_len) + + attention_score = self.local_att([queries, keys], training=training) + + outputs = tf.transpose(attention_score, (0, 2, 1)) + + if self.weight_normalization: + paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(outputs) + + outputs = tf.where(key_masks, outputs, paddings) + + if self.weight_normalization: + outputs = softmax(outputs) + + if not self.return_score: + outputs = tf.matmul(outputs, keys) + + if tf.__version__ < '1.13.0': + outputs._uses_learning_phase = attention_score._uses_learning_phase + else: + outputs._uses_learning_phase = training is not None + + return outputs + + def compute_output_shape(self, input_shape): + if self.return_score: + return (None, 1, input_shape[1][1]) + else: + return (None, 1, input_shape[0][-1]) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + + config = {'att_hidden_units': self.att_hidden_units, 'att_activation': self.att_activation, + 'weight_normalization': self.weight_normalization, 'return_score': self.return_score, + 'supports_masking': self.supports_masking} + base_config = super(AttentionSequencePoolingLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiLSTM(Layer): + """A multiple layer Bidirectional Residual LSTM Layer. + + Input shape + - 3D tensor with shape ``(batch_size, timesteps, input_dim)``. + + Output shape + - 3D tensor with shape: ``(batch_size, timesteps, units)``. + + Arguments + - **units**: Positive integer, dimensionality of the output space. 
+ + - **layers**:Positive integer, number of LSTM layers to stacked. + + - **res_layers**: Positive integer, number of residual connection to used in last ``res_layers``. + + - **dropout_rate**: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs. + + - **merge_mode**: merge_mode: Mode by which outputs of the forward and backward RNNs will be combined. One of { ``'fw'`` , ``'bw'`` , ``'sum'`` , ``'mul'`` , ``'concat'`` , ``'ave'`` , ``None`` }. If None, the outputs will not be combined, they will be returned as a list. + + + """ + + def __init__(self, units, layers=2, res_layers=0, dropout_rate=0.2, merge_mode='ave', **kwargs): + + if merge_mode not in ['fw', 'bw', 'sum', 'mul', 'ave', 'concat', None]: + raise ValueError('Invalid merge mode. ' + 'Merge mode should be one of ' + '{"fw","bw","sum", "mul", "ave", "concat", None}') + + self.units = units + self.layers = layers + self.res_layers = res_layers + self.dropout_rate = dropout_rate + self.merge_mode = merge_mode + + super(BiLSTM, self).__init__(**kwargs) + self.supports_masking = True + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + self.fw_lstm = [] + self.bw_lstm = [] + for _ in range(self.layers): + self.fw_lstm.append( + LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True, + unroll=True)) + self.bw_lstm.append( + LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True, + go_backwards=True, unroll=True)) + + super(BiLSTM, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, mask=None, **kwargs): + + input_fw = inputs + input_bw = inputs + for i in range(self.layers): + output_fw = self.fw_lstm[i](input_fw) + output_bw = self.bw_lstm[i](input_bw) + output_bw = Lambda(lambda x: K.reverse( + x, 1), mask=lambda inputs, mask: mask)(output_bw) + + if i >= self.layers - self.res_layers: + output_fw += input_fw + output_bw += input_bw + input_fw = output_fw + input_bw = output_bw + + output_fw = input_fw + output_bw = input_bw + + if self.merge_mode == "fw": + output = output_fw + elif self.merge_mode == "bw": + output = output_bw + elif self.merge_mode == 'concat': + output = K.concatenate([output_fw, output_bw]) + elif self.merge_mode == 'sum': + output = output_fw + output_bw + elif self.merge_mode == 'ave': + output = (output_fw + output_bw) / 2 + elif self.merge_mode == 'mul': + output = output_fw * output_bw + elif self.merge_mode is None: + output = [output_fw, output_bw] + + return output + + def compute_output_shape(self, input_shape): + print(self.merge_mode) + if self.merge_mode is None: + return [input_shape, input_shape] + elif self.merge_mode == 'concat': + return input_shape[:-1] + (input_shape[-1] * 2,) + else: + return input_shape + + def compute_mask(self, inputs, mask): + return mask + + def get_config(self, ): + + config = {'units': self.units, 'layers': self.layers, + 'res_layers': self.res_layers, 'dropout_rate': self.dropout_rate, 'merge_mode': self.merge_mode} + base_config = super(BiLSTM, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Transformer(Layer): + """ Simplified version of Transformer proposed in 《Attention is all you need》 + + Input shape + - a list of two 3D tensor with shape ``(batch_size, timesteps, input_dim)`` if ``supports_masking=True`` . 
+ - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if ``supports_masking=False`` . + + + Output shape + - 3D tensor with shape: ``(batch_size, 1, input_dim)`` if ``output_type='mean'`` or ``output_type='sum'`` , else ``(batch_size, timesteps, input_dim)`` . + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **dropout_rate**: float between 0 and 1. Fraction of the units to drop. + - **use_positional_encoding**: bool. Whether or not use positional_encoding + - **use_res**: bool. Whether or not use standard residual connections before output. + - **use_feed_forward**: bool. Whether or not use pointwise feed foward network. + - **use_layer_norm**: bool. Whether or not use Layer Normalization. + - **blinding**: bool. Whether or not use blinding. + - **seed**: A Python integer to use as random seed. + - **supports_masking**:bool. Whether or not support masking. + - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }. + - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output. + + References + - [Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf) + """ + + def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_positional_encoding=True, use_res=True, + use_feed_forward=True, use_layer_norm=False, blinding=True, seed=1024, supports_masking=False, + attention_type="scaled_dot_product", output_type="mean", **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.num_units = att_embedding_size * head_num + self.use_res = use_res + self.use_feed_forward = use_feed_forward + self.seed = seed + self.use_positional_encoding = use_positional_encoding + self.dropout_rate = dropout_rate + self.use_layer_norm = use_layer_norm + self.blinding = blinding + self.attention_type = attention_type + self.output_type = output_type + super(Transformer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + embedding_size = int(input_shape[0][-1]) + if self.num_units != embedding_size: + raise ValueError( + "att_embedding_size * head_num must equal the last dimension size of inputs,got %d * %d != %d" % ( + self.att_embedding_size, self.head_num, embedding_size)) + self.seq_len_max = int(input_shape[0][-2]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.attention_type == "additive": + self.b = self.add_weight('b', shape=[self.att_embedding_size], dtype=tf.float32, + initializer=glorot_uniform(seed=self.seed)) + self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32, + 
initializer=glorot_uniform(seed=self.seed))
+        # if self.use_res:
+        #     self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32,
+        #                                  initializer=TruncatedNormal(seed=self.seed))
+        if self.use_feed_forward:
+            self.fw1 = self.add_weight('fw1', shape=[self.num_units, 4 * self.num_units], dtype=tf.float32,
+                                       initializer=glorot_uniform(seed=self.seed))
+            self.fw2 = self.add_weight('fw2', shape=[4 * self.num_units, self.num_units], dtype=tf.float32,
+                                       initializer=glorot_uniform(seed=self.seed))
+
+        self.dropout = Dropout(
+            self.dropout_rate, seed=self.seed)
+        self.ln = LayerNormalization()
+        if self.use_positional_encoding:
+            self.query_pe = PositionEncoding()
+            self.key_pe = PositionEncoding()
+        # Be sure to call this somewhere!
+        super(Transformer, self).build(input_shape)
+
+    def call(self, inputs, mask=None, training=None, **kwargs):
+
+        if self.supports_masking:
+            queries, keys = inputs
+            query_masks, key_masks = mask
+            query_masks = tf.cast(query_masks, tf.float32)
+            key_masks = tf.cast(key_masks, tf.float32)
+        else:
+            queries, keys, query_masks, key_masks = inputs
+
+            query_masks = tf.sequence_mask(
+                query_masks, self.seq_len_max, dtype=tf.float32)
+            key_masks = tf.sequence_mask(
+                key_masks, self.seq_len_max, dtype=tf.float32)
+            query_masks = tf.squeeze(query_masks, axis=1)
+            key_masks = tf.squeeze(key_masks, axis=1)
+
+        if self.use_positional_encoding:
+            queries = self.query_pe(queries)
+            keys = self.key_pe(keys)  # encode the keys, not the queries
+
+        querys = tf.tensordot(queries, self.W_Query,
+                              axes=(-1, 0))  # None T_q D*head_num
+        # project the values from the raw keys before `keys` is overwritten
+        # by its own projection
+        values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))
+        keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
+
+        # head_num*None T_q D
+        querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
+        keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
+        values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)
+
+        if self.attention_type == "scaled_dot_product":
+            # head_num*None T_q T_k
+            outputs = tf.matmul(querys, keys, transpose_b=True)
+
+            outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
+        elif self.attention_type == "additive":
+            querys_reshaped = tf.expand_dims(querys, axis=-2)
+            keys_reshaped = tf.expand_dims(keys, axis=-3)
+            outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b))
+            outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1)
+        else:
+            raise ValueError("attention_type must be scaled_dot_product or additive")
+
+        key_masks = tf.tile(key_masks, [self.head_num, 1])
+
+        # (h*N, T_q, T_k)
+        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
+                            [1, tf.shape(queries)[1], 1])
+
+        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
+
+        # (h*N, T_q, T_k)
+
+        outputs = tf.where(tf.equal(key_masks, 1), outputs, paddings, )
+        if self.blinding:
+            try:
+                outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
+                                                      :, :, 0] * (-2 ** 32 + 1))
+            except AttributeError:
+                outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
+                                                                :, :, 0] * (-2 ** 32 + 1))
+
+        outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
+        outputs = softmax(outputs)
+        query_masks = tf.tile(query_masks, [self.head_num, 1])  # (h*N, T_q)
+        # (h*N, T_q, T_k)
+        query_masks = tf.tile(tf.expand_dims(
+            query_masks, -1), [1, 1, tf.shape(keys)[1]])
+
+        outputs *= query_masks
+
+        outputs = self.dropout(outputs, training=training)
+        # Weighted sum
+        # ( h*N, T_q, C/h)
+        result = tf.matmul(outputs, values)
+        result = tf.concat(tf.split(result,
self.head_num, axis=0), axis=2) + + if self.use_res: + # tf.tensordot(queries, self.W_Res, axes=(-1, 0)) + result += queries + if self.use_layer_norm: + result = self.ln(result) + + if self.use_feed_forward: + fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0])) + fw1 = self.dropout(fw1, training=training) + fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0]) + if self.use_res: + result += fw2 + if self.use_layer_norm: + result = self.ln(result) + + if self.output_type == "mean": + return reduce_mean(result, axis=1, keep_dims=True) + elif self.output_type == "sum": + return reduce_sum(result, axis=1, keep_dims=True) + else: + return result + + def compute_output_shape(self, input_shape): + + return (None, 1, self.att_embedding_size * self.head_num) + + def compute_mask(self, inputs, mask=None): + return None + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, + 'dropout_rate': self.dropout_rate, 'use_res': self.use_res, + 'use_positional_encoding': self.use_positional_encoding, 'use_feed_forward': self.use_feed_forward, + 'use_layer_norm': self.use_layer_norm, 'seed': self.seed, 'supports_masking': self.supports_masking, + 'blinding': self.blinding, 'attention_type': self.attention_type, 'output_type': self.output_type} + base_config = super(Transformer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PositionEncoding(Layer): + def __init__(self, pos_embedding_trainable=True, + zero_pad=False, + scale=True, **kwargs): + self.pos_embedding_trainable = pos_embedding_trainable + self.zero_pad = zero_pad + self.scale = scale + super(PositionEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + _, T, num_units = input_shape.as_list() # inputs.get_shape().as_list() + # First part of the PE function: sin and cos argument + position_enc = np.array([ + [pos / np.power(10000, 2. * (i // 2) / num_units) for i in range(num_units)] + for pos in range(T)]) + + # Second part, apply the cosine to even columns and sin to odds. + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + if self.zero_pad: + position_enc[0, :] = np.zeros(num_units) + self.lookup_table = self.add_weight("lookup_table", (T, num_units), + initializer=identity(position_enc), + trainable=self.pos_embedding_trainable) + + # Be sure to call this somewhere! + super(PositionEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + _, T, num_units = inputs.get_shape().as_list() + position_ind = tf.expand_dims(tf.range(T), 0) + outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind) + if self.scale: + outputs = outputs * num_units ** 0.5 + return outputs + inputs + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'pos_embedding_trainable': self.pos_embedding_trainable, 'zero_pad': self.zero_pad, + 'scale': self.scale} + base_config = super(PositionEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiasEncoding(Layer): + def __init__(self, sess_max_count, seed=1024, **kwargs): + self.sess_max_count = sess_max_count + self.seed = seed + super(BiasEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. 
+ + if self.sess_max_count == 1: + embed_size = input_shape[2].value + seq_len_max = input_shape[1].value + else: + try: + embed_size = input_shape[0][2].value + seq_len_max = input_shape[0][1].value + except AttributeError: + embed_size = input_shape[0][2] + seq_len_max = input_shape[0][1] + + self.sess_bias_embedding = self.add_weight('sess_bias_embedding', shape=(self.sess_max_count, 1, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.seq_bias_embedding = self.add_weight('seq_bias_embedding', shape=(1, seq_len_max, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.item_bias_embedding = self.add_weight('item_bias_embedding', shape=(1, 1, embed_size), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + + # Be sure to call this somewhere! + super(BiasEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + """ + :param concated_embeds_value: None * field_size * embedding_size + :return: None*1 + """ + transformer_out = [] + for i in range(self.sess_max_count): + transformer_out.append( + inputs[i] + self.item_bias_embedding + self.seq_bias_embedding + self.sess_bias_embedding[i]) + return transformer_out + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'sess_max_count': self.sess_max_count, 'seed': self.seed, } + base_config = super(BiasEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DynamicGRU(Layer): + def __init__(self, num_units=None, gru_type='GRU', return_sequence=True, **kwargs): + + self.num_units = num_units + self.return_sequence = return_sequence + self.gru_type = gru_type + super(DynamicGRU, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + input_seq_shape = input_shape[0] + if self.num_units is None: + self.num_units = input_seq_shape.as_list()[-1] + if self.gru_type == "AGRU": + self.gru_cell = QAAttGRUCell(self.num_units) + elif self.gru_type == "AUGRU": + self.gru_cell = VecAttGRUCell(self.num_units) + else: + try: + self.gru_cell = tf.nn.rnn_cell.GRUCell(self.num_units) # GRUCell + except AttributeError: + self.gru_cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.num_units) + + # Be sure to call this somewhere! 
+ super(DynamicGRU, self).build(input_shape) + + def call(self, input_list): + """ + :param concated_embeds_value: None * field_size * embedding_size + :return: None*1 + """ + if self.gru_type == "GRU" or self.gru_type == "AIGRU": + rnn_input, sequence_length = input_list + att_score = None + else: + rnn_input, sequence_length, att_score = input_list + + rnn_output, hidden_state = dynamic_rnn(self.gru_cell, inputs=rnn_input, att_scores=att_score, + sequence_length=tf.squeeze(sequence_length, + ), dtype=tf.float32, scope=self.name) + if self.return_sequence: + return rnn_output + else: + return tf.expand_dims(hidden_state, axis=1) + + def compute_output_shape(self, input_shape): + rnn_input_shape = input_shape[0] + if self.return_sequence: + return rnn_input_shape + else: + return (None, 1, rnn_input_shape[2]) + + def get_config(self, ): + config = {'num_units': self.num_units, 'gru_type': self.gru_type, 'return_sequence': self.return_sequence} + base_config = super(DynamicGRU, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class KMaxPooling(Layer): + """K Max pooling that selects the k biggest value along the specific axis. + + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. + + Output shape + - nD tensor with shape: ``(batch_size, ..., output_dim)``. + + Arguments + - **k**: positive integer, number of top elements to look for along the ``axis`` dimension. + + - **axis**: positive integer, the dimension to look for elements. + + """ + + def __init__(self, k=1, axis=-1, **kwargs): + + self.k = k + self.axis = axis + super(KMaxPooling, self).__init__(**kwargs) + + def build(self, input_shape): + + if self.axis < 1 or self.axis > len(input_shape): + raise ValueError("axis must be 1~%d,now is %d" % + (len(input_shape), self.axis)) + + if self.k < 1 or self.k > input_shape[self.axis]: + raise ValueError("k must be in 1 ~ %d,now k is %d" % + (input_shape[self.axis], self.k)) + self.dims = len(input_shape) + # Be sure to call this somewhere! + super(KMaxPooling, self).build(input_shape) + + def call(self, inputs): + + # swap the last and the axis dimensions since top_k will be applied along the last dimension + perm = list(range(self.dims)) + perm[-1], perm[self.axis] = perm[self.axis], perm[-1] + shifted_input = tf.transpose(inputs, perm) + + # extract top_k, returns two tensors [values, indices] + top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0] + output = tf.transpose(top_k, perm) + + return output + + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[self.axis] = self.k + return tuple(output_shape) + + def get_config(self, ): + config = {'k': self.k, 'axis': self.axis} + base_config = super(KMaxPooling, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + +# def positional_encoding(inputs, +# pos_embedding_trainable=True, +# zero_pad=False, +# scale=True, +# ): +# '''Sinusoidal Positional_Encoding. +# +# Args: +# +# - inputs: A 2d Tensor with shape of (N, T). +# - num_units: Output dimensionality +# - zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero +# - scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper) +# - scope: Optional scope for `variable_scope`. +# - reuse: Boolean, whether to reuse the weights of a previous layer by the same name. 
+#
+# Returns:
+#
+#     - A 'Tensor' with one more rank than the input's, whose last dimensionality should be 'num_units'
+# '''
+#
+#     _, T, num_units = inputs.get_shape().as_list()
+#     # with tf.variable_scope(scope, reuse=reuse):
+#     position_ind = tf.expand_dims(tf.range(T), 0)
+#     # First part of the PE function: sin and cos argument
+#     position_enc = np.array([
+#         [pos / np.power(10000, 2. * i / num_units)
+#          for i in range(num_units)]
+#         for pos in range(T)])
+#
+#     # Second part, apply the cosine to even columns and sin to odds.
+#     position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
+#     position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
+#
+#     # Convert to a tensor
+#
+#     if pos_embedding_trainable:
+#         lookup_table = K.variable(position_enc, dtype=tf.float32)
+#
+#     if zero_pad:
+#         lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
+#                                   lookup_table[1:, :]), 0)
+#
+#     outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
+#
+#     if scale:
+#         outputs = outputs * num_units ** 0.5
+#     return outputs + inputs
diff --git a/modelzoo/FwFM/script/layers/utils.py b/modelzoo/FwFM/script/layers/utils.py
new file mode 100644
index 00000000000..2be8f3fe5ef
--- /dev/null
+++ b/modelzoo/FwFM/script/layers/utils.py
@@ -0,0 +1,302 @@
+# -*- coding:utf-8 -*-
+"""
+
+Author:
+    Weichen Shen, weichenswc@163.com
+
+"""
+import tensorflow as tf
+from tensorflow.python.keras.layers import Flatten, Concatenate, Layer, Add
+from tensorflow.python.ops.lookup_ops import TextFileInitializer
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal
+
+from tensorflow.python.keras.regularizers import l2
+
+try:
+    from tensorflow.python.ops.lookup_ops import StaticHashTable
+except ImportError:
+    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable
+
+
+class NoMask(Layer):
+    def __init__(self, **kwargs):
+        super(NoMask, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # Be sure to call this somewhere!
+        super(NoMask, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        return x
+
+    def compute_mask(self, inputs, mask):
+        return None
+
+
+class Hash(Layer):
+    """Looks up keys in a table when `vocabulary_path` is set, and outputs the corresponding values.
+    If `vocabulary_path` is not set, `Hash` will hash the input to [0,num_buckets). When `mask_zero` = True,
+    input value `0` or `0.0` will be set to `0`, and other values will be set in range [1,num_buckets).
+
+    The following snippet initializes a `Hash` with a `vocabulary_path` file whose first column holds the
+    values and whose second column holds the keys:
+
+    * `1,emerson`
+    * `2,lake`
+    * `3,palmer`
+
+    >>> hash = Hash(
+    ...     num_buckets=3+1,
+    ...     vocabulary_path=filename,
+    ...     default_value=0)
+    >>> hash(tf.constant('lake')).numpy()
+    2
+    >>> hash(tf.constant('lakeemerson')).numpy()
+    0
+
+    Args:
+        num_buckets: An `int` that is >= 1. The number of buckets, or the vocabulary size + 1
+            when `vocabulary_path` is set.
+        mask_zero: default is False. The `Hash` value will hash input `0` or `0.0` to value `0` when
+            `mask_zero` is `True`. `mask_zero` is not used when `vocabulary_path` is set.
+        vocabulary_path: default `None`. The `CSV` text file path of the vocabulary hash, which contains
+            two columns separated by delimiter `comma`; the first column is the value and the second is
+            the key. The key data type is `string`, the value data type is `int`.
The path must + be accessible from wherever `Hash` is initialized. + default_value: default '0'. The default value if a key is missing in the table. + **kwargs: Additional keyword arguments. + """ + + def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs): + self.num_buckets = num_buckets + self.mask_zero = mask_zero + self.vocabulary_path = vocabulary_path + self.default_value = default_value + if self.vocabulary_path: + initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',') + self.hash_table = StaticHashTable(initializer, default_value=self.default_value) + super(Hash, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! + super(Hash, self).build(input_shape) + + def call(self, x, mask=None, **kwargs): + + if x.dtype != tf.string: + zero = tf.as_string(tf.zeros([1], dtype=x.dtype)) + x = tf.as_string(x, ) + else: + zero = tf.as_string(tf.zeros([1], dtype='int32')) + + if self.vocabulary_path: + hash_x = self.hash_table.lookup(x) + return hash_x + + num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1 + try: + hash_x = tf.string_to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + except AttributeError: + hash_x = tf.strings.to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + if self.mask_zero: + mask = tf.cast(tf.not_equal(x, zero), dtype='int64') + hash_x = (hash_x + 1) * mask + + return hash_x + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, + 'default_value': self.default_value} + base_config = super(Hash, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Linear(Layer): + + def __init__(self, l2_reg=0.0, mode=0, use_bias=False, seed=1024, **kwargs): + + self.l2_reg = l2_reg + # self.l2_reg = tf.contrib.layers.l2_regularizer(float(l2_reg_linear)) + if mode not in [0, 1, 2]: + raise ValueError("mode must be 0,1 or 2") + self.mode = mode + self.use_bias = use_bias + self.seed = seed + super(Linear, self).__init__(**kwargs) + + def build(self, input_shape): + if self.use_bias: + self.bias = self.add_weight(name='linear_bias', + shape=(1,), + initializer=Zeros(), + trainable=True) + if self.mode == 1: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + elif self.mode == 2: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[1][-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + + super(Linear, self).build(input_shape) # Be sure to call this somewhere! 
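+    # Mode semantics, matching build() above and call() below: mode 0 sums a
+    # sparse embedding tensor, mode 1 applies a kernel to dense features, and
+    # mode 2 expects a [sparse, dense] pair and adds both logits. A minimal
+    # sketch (tensor names are illustrative assumptions only):
+    #   logit0 = Linear(mode=0)(sparse_embeds)
+    #   logit1 = Linear(mode=1, l2_reg=1e-5)(dense_values)
+    #   logit2 = Linear(mode=2)([sparse_embeds, dense_values])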
+ + def call(self, inputs, **kwargs): + if self.mode == 0: + sparse_input = inputs + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=True) + elif self.mode == 1: + dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = fc + else: + sparse_input, dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=False) + fc + if self.use_bias: + linear_logit += self.bias + + return linear_logit + + def compute_output_shape(self, input_shape): + return (None, 1) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + config = {'mode': self.mode, 'l2_reg': self.l2_reg, 'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(Linear, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def concat_func(inputs, axis=-1, mask=False): + if not mask: + inputs = list(map(NoMask(), inputs)) + if len(inputs) == 1: + return inputs[0] + else: + return Concatenate(axis=axis)(inputs) + + +def reduce_mean(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_mean(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_mean(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_sum(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_sum(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_sum(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_max(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_max(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_max(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def div(x, y, name=None): + try: + return tf.div(x, y, name=name) + except AttributeError: + return tf.divide(x, y, name=name) + + +def softmax(logits, dim=-1, name=None): + try: + return tf.nn.softmax(logits, dim=dim, name=name) + except TypeError: + return tf.nn.softmax(logits, axis=dim, name=name) + + +class _Add(Layer): + def __init__(self, **kwargs): + super(_Add, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! 
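+        # _Add sums a list of logit tensors, falling back to a constant [[0.0]]
+        # for an empty list; add_func below additionally passes single tensors
+        # and non-list inputs straight through, so callers can always write,
+        # e.g. (names are illustrative): add_func([linear_logit, fwfm_logit]).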
+        super(_Add, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        # if not isinstance(inputs, list):
+        #     return inputs
+        # if len(inputs) == 1:
+        #     return inputs[0]
+        if len(inputs) == 0:
+            return tf.constant([[0.0]])
+
+        return Add()(inputs)
+
+
+def add_func(inputs):
+    if not isinstance(inputs, list):
+        return inputs
+    if len(inputs) == 1:
+        return inputs[0]
+    return _Add()(inputs)
+
+
+def combined_dnn_input(sparse_embedding_list, dense_value_list):
+    if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
+        sparse_dnn_input = Flatten()(concat_func(sparse_embedding_list))
+        dense_dnn_input = Flatten()(concat_func(dense_value_list))
+        return concat_func([sparse_dnn_input, dense_dnn_input])
+    elif len(sparse_embedding_list) > 0:
+        return Flatten()(concat_func(sparse_embedding_list))
+    elif len(dense_value_list) > 0:
+        return Flatten()(concat_func(dense_value_list))
+    else:
+        raise NotImplementedError("dnn_feature_columns can not be an empty list")
diff --git a/modelzoo/FwFM/script/models/__init__.py b/modelzoo/FwFM/script/models/__init__.py
new file mode 100644
index 00000000000..87868a82459
--- /dev/null
+++ b/modelzoo/FwFM/script/models/__init__.py
@@ -0,0 +1,3 @@
+from .fwfm import FwFM
+
+__all__ = ["FwFM"]
diff --git a/modelzoo/FwFM/script/models/fwfm.py b/modelzoo/FwFM/script/models/fwfm.py
new file mode 100644
index 00000000000..3646d8f1b0c
--- /dev/null
+++ b/modelzoo/FwFM/script/models/fwfm.py
@@ -0,0 +1,72 @@
+# -*- coding:utf-8 -*-
+"""
+Author:
+    Harshit Pande
+
+Reference:
+    [1] Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising
+    (https://arxiv.org/pdf/1806.03514.pdf)
+
+"""
+
+from itertools import chain
+
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.layers import Dense
+
+from ..feature_column import build_input_features, get_linear_logit, DEFAULT_GROUP_NAME, input_from_feature_columns
+from ..layers.core import PredictionLayer, DNN
+from ..layers.interaction import FwFMLayer
+from ..layers.utils import concat_func, add_func, combined_dnn_input
+
+
+def FwFM(linear_feature_columns, dnn_feature_columns, fm_group=(DEFAULT_GROUP_NAME,), dnn_hidden_units=(256, 128, 64),
+         l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_field_strength=0.00001, l2_reg_dnn=0,
+         seed=1024, dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False, task='binary'):
+    """Instantiates the FwFM Network architecture.
+
+    :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
+    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
+    :param fm_group: list, group_name of features that will be used to do feature interactions.
+    :param dnn_hidden_units: list of positive integers (or an empty list to disable the DNN), the layer number
+        and units in each layer of the DNN
+    :param l2_reg_linear: float. L2 regularizer strength applied to the linear part
+    :param l2_reg_field_strength: float. L2 regularizer strength applied to the field pair strength parameters
+    :param l2_reg_embedding: float. L2 regularizer strength applied to the embedding vectors
+    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN
+    :param seed: integer, to use as random seed.
+    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+    :param dnn_activation: Activation function to use in the DNN
+    :param dnn_use_bn: bool.
Whether use BatchNormalization before activation or not in DNN + :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss + :return: A Keras model instance. + """ + + features = build_input_features(linear_feature_columns + dnn_feature_columns) + + inputs_list = list(features.values()) + + linear_logit = get_linear_logit(features, linear_feature_columns, seed=seed, prefix='linear', + l2_reg=l2_reg_linear) + + group_embedding_dict, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, + l2_reg_embedding, seed, + support_group=True) + + fwfm_logit = add_func([FwFMLayer(num_fields=len(v), regularizer=l2_reg_field_strength) + (concat_func(v, axis=1)) for k, v in group_embedding_dict.items() if k in fm_group]) + + final_logit_components = [linear_logit, fwfm_logit] + + if dnn_hidden_units: + dnn_input = combined_dnn_input(list(chain.from_iterable( + group_embedding_dict.values())), dense_value_list) + dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(dnn_input) + dnn_logit = Dense(1, use_bias=False)(dnn_output) + final_logit_components.append(dnn_logit) + + final_logit = add_func(final_logit_components) + + output = PredictionLayer(task)(final_logit) + model = Model(inputs=inputs_list, outputs=output) + return model diff --git a/modelzoo/FwFM/script/utils.py b/modelzoo/FwFM/script/utils.py new file mode 100644 index 00000000000..7fe3b25a518 --- /dev/null +++ b/modelzoo/FwFM/script/utils.py @@ -0,0 +1,46 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import json +import logging +from threading import Thread + +import requests + +try: + from packaging.version import parse +except ImportError: + from pip._vendor.packaging.version import parse + + +def check_version(version): + """Return version of package on pypi.python.org using json.""" + + def check(version): + try: + url_pattern = 'https://pypi.python.org/pypi/deepctr/json' + req = requests.get(url_pattern) + latest_version = parse('0') + version = parse(version) + if req.status_code == requests.codes.ok: + j = json.loads(req.text.encode('utf-8')) + releases = j.get('releases', []) + for release in releases: + ver = parse(release) + if ver.is_prerelease or ver.is_postrelease: + continue + latest_version = max(latest_version, ver) + if latest_version > version: + logging.warning( + '\nDeepCTR version {0} detected. 
Your version is {1}.\nUse `pip install -U deepctr` to upgrade. Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format(
+                        latest_version, version))
+        except Exception:
+            print("Please check the latest version manually on https://pypi.org/project/deepctr/#history")
+            return
+
+    Thread(target=check, args=(version,)).start()
diff --git a/modelzoo/FwFM/train.py b/modelzoo/FwFM/train.py
new file mode 100644
index 00000000000..bdfb56e983c
--- /dev/null
+++ b/modelzoo/FwFM/train.py
@@ -0,0 +1,255 @@
+import os
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.losses import binary_crossentropy
+from script.models.fwfm import FwFM
+from script.feature_column import SparseFeat, DenseFeat, get_feature_names, VarLenSparseFeat
+
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY']
+LABEL_COLUMN = ['CLICKED']
+TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS
+
+EMBEDDING_DIM = 8
+
+def split(x):
+    key_ans = x.split(',')
+    for key in key_ans:
+        if key not in key2index:
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda x: key2index[x], key_ans))
+
+
+# Bin continuous variables into quantile-based buckets
+def BinMap(data, acc):
+    if acc >= 1 or acc <= 0:
+        print('acc must be greater than 0 and less than 1')
+        return
+    max = data.max()
+    min = data.min()
+    rangelist = [i + 1 for i in range(int(1 / acc))]
+    length = len(data) - 1
+    data1 = data.sort_index()
+    bin_res = np.array([0] * data.shape[-1], dtype=int)
+    for r in rangelist:
+        if r == 1:
+            lower = min
+        else:
+            lower = data1[int(length * ((r - 1) * acc))]
+        rank = r * acc
+        i = int(length * rank)
+        # x = data[np.where(data>=lower) + np.where(data=lower) & (data<=max)].index
+        else:
+            mask = data.loc[(data >= lower) & (data

Date: Wed, 12 Oct 2022 15:57:26 +0800
Subject: [PATCH 8/8] [ModelZoo] Support PNN

---
 modelzoo/PNN/README.md | 85 +
 modelzoo/PNN/data/README.md | 4 +
 modelzoo/PNN/data/prepare_data.sh | 15 +
 modelzoo/PNN/data/script/data2labelencode.py | 54 +
 modelzoo/PNN/data/script/generate_neg.py | 63 +
 modelzoo/PNN/data/script/generate_voc.py | 66 +
 .../PNN/data/script/history_behavior_list.py | 41 +
 modelzoo/PNN/data/script/item_map.py | 29 +
 modelzoo/PNN/data/script/local_aggretor.py | 47 +
 modelzoo/PNN/data/script/pick2txt.py | 14 +
 modelzoo/PNN/data/script/process_data.py | 108 ++
 modelzoo/PNN/data/script/split_by_user.py | 18 +
 modelzoo/PNN/result/README.md | 2 +
 modelzoo/PNN/script/__init__.py | 0
 modelzoo/PNN/script/contrib/__init__.py | 0
 modelzoo/PNN/script/contrib/rnn.py | 1153 +++++++++++++
 modelzoo/PNN/script/contrib/rnn_v2.py | 1452 ++++++++++++++++
 modelzoo/PNN/script/contrib/utils.py | 378 +++++
 modelzoo/PNN/script/estimator/__init__.py | 1 +
 .../PNN/script/estimator/feature_column.py | 52 +
 modelzoo/PNN/script/estimator/inputs.py | 52 +
 .../PNN/script/estimator/models/__init__.py | 13 +
 modelzoo/PNN/script/estimator/models/pnn.py | 93 +
 modelzoo/PNN/script/estimator/utils.py | 217 +++
 modelzoo/PNN/script/feature_column.py | 249 +++
 modelzoo/PNN/script/inputs.py | 155 ++
 modelzoo/PNN/script/layers/__init__.py | 52 +
 modelzoo/PNN/script/layers/activation.py | 85 +
 modelzoo/PNN/script/layers/core.py | 267 +++
 modelzoo/PNN/script/layers/interaction.py | 1492 +++++++++++++++++
 modelzoo/PNN/script/layers/normalization.py | 51 +
 modelzoo/PNN/script/layers/sequence.py | 901 ++++++++++
 modelzoo/PNN/script/layers/utils.py | 302 ++++
 modelzoo/PNN/script/models/__init__.py | 3 +
 modelzoo/PNN/script/models/pnn.py | 72 +
 modelzoo/PNN/script/utils.py | 46 +
 modelzoo/PNN/train.py | 259 +++
 37 files changed, 7891 insertions(+)
 create mode 100644 modelzoo/PNN/README.md
 create mode 100644 modelzoo/PNN/data/README.md
 create mode 100644 modelzoo/PNN/data/prepare_data.sh
 create mode 100644 modelzoo/PNN/data/script/data2labelencode.py
 create mode 100644 modelzoo/PNN/data/script/generate_neg.py
 create mode 100644 modelzoo/PNN/data/script/generate_voc.py
 create mode 100644 modelzoo/PNN/data/script/history_behavior_list.py
 create mode 100644 modelzoo/PNN/data/script/item_map.py
 create mode 100644 modelzoo/PNN/data/script/local_aggretor.py
 create mode 100644 modelzoo/PNN/data/script/pick2txt.py
 create mode 100644 modelzoo/PNN/data/script/process_data.py
 create mode 100644 modelzoo/PNN/data/script/split_by_user.py
 create mode 100644 modelzoo/PNN/result/README.md
 create mode 100644 modelzoo/PNN/script/__init__.py
 create mode 100644 modelzoo/PNN/script/contrib/__init__.py
 create mode 100644 modelzoo/PNN/script/contrib/rnn.py
 create mode 100644 modelzoo/PNN/script/contrib/rnn_v2.py
 create mode 100644 modelzoo/PNN/script/contrib/utils.py
 create mode 100644 modelzoo/PNN/script/estimator/__init__.py
 create mode 100644 modelzoo/PNN/script/estimator/feature_column.py
 create mode 100644 modelzoo/PNN/script/estimator/inputs.py
 create mode 100644 modelzoo/PNN/script/estimator/models/__init__.py
 create mode 100644 modelzoo/PNN/script/estimator/models/pnn.py
 create mode 100644 modelzoo/PNN/script/estimator/utils.py
 create mode 100644 modelzoo/PNN/script/feature_column.py
 create mode 100644 modelzoo/PNN/script/inputs.py
 create mode 100644 modelzoo/PNN/script/layers/__init__.py
 create mode 100644 modelzoo/PNN/script/layers/activation.py
 create mode 100644 modelzoo/PNN/script/layers/core.py
 create mode 100644 modelzoo/PNN/script/layers/interaction.py
 create mode 100644 modelzoo/PNN/script/layers/normalization.py
 create mode 100644 modelzoo/PNN/script/layers/sequence.py
 create mode 100644 modelzoo/PNN/script/layers/utils.py
 create mode 100644 modelzoo/PNN/script/models/__init__.py
 create mode 100644 modelzoo/PNN/script/models/pnn.py
 create mode 100644 modelzoo/PNN/script/utils.py
 create mode 100644 modelzoo/PNN/train.py

diff --git a/modelzoo/PNN/README.md b/modelzoo/PNN/README.md
new file mode 100644
index 00000000000..0d02cde5540
--- /dev/null
+++ b/modelzoo/PNN/README.md
@@ -0,0 +1,85 @@
+# PNN
+
+The following is a brief directory structure and description for this example:
+
+```
+├── data                                  # Data set directory
+│   ├── prepare_data.sh                   # Shell script to download and process dataset
+│   ├── README.md                         # Documentation describing how to prepare dataset
+│   └── script                            # Directory containing scripts to process dataset
+│       ├── data2labelencode.py           # Convert data to csv file
+│       ├── generate_neg.py               # Create negative samples
+│       ├── generate_voc.py               # Create a list of features
+│       ├── history_behavior_list.py      # Count user's history behaviors
+│       ├── item_map.py                   # Create a map between item id and cate
+│       ├── local_aggretor.py             # Generate sample data
+│       ├── pick2txt.py                   # Convert voc's format
+│       ├── process_data.py               # Parse raw json data
+│       └── split_by_user.py              # Divide the dataset
+├── script                                # Model set directory
+│   ├── contrib                           # Directory containing rnn implementations
+│   ├── estimator                         # Directory containing the estimator pipeline
+│   ├── layers                            # Directory containing layers of the model
+│   ├── models                            # Directory containing the PNN model
+│   ├── feature_column.py                 # Feature marker
+│   ├── inputs.py                         # Construction of the input layer
+│   └── utils.py
+├── train.py                              # Training script
+└── README.md                             # Documentation
+```
+
+## Content
+
+[TOC]
+
+## Model Structure
+
+Implementation of the paper "Product-based Neural Networks for User Response Prediction".
+
+## Usage
+
+### Stand-alone Training
+
+1. Please prepare the data set and the DeepRec environment.
+
+   1. Manually
+
+      - Follow [dataset preparation](https://github.com/alibaba/DeepRec/tree/main/modelzoo/DIEN#prepare) to prepare the data set.
+      - Download the code by `git clone https://github.com/alibaba/DeepRec`
+      - Follow [How to Build](https://github.com/alibaba/DeepRec#how-to-build) to build the DeepRec whl package and install it by `pip install $DEEPREC_WHL`.
+
+   2. Docker (Recommended)
+
+      ```
+      docker pull alideeprec/deeprec-release-modelzoo:latest
+      docker run -it alideeprec/deeprec-release-modelzoo:latest /bin/bash
+
+      # In docker container
+      cd /root/modelzoo/PNN
+      ```
+
+2. Train.
+
+   ```
+   python train.py
+   ```
+
+## Dataset
+
+The Amazon Books dataset is used as the benchmark dataset.
+
+### Prepare
+
+For details of data download, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data)
diff --git a/modelzoo/PNN/data/README.md b/modelzoo/PNN/data/README.md
new file mode 100644
index 00000000000..15a0bc61c8d
--- /dev/null
+++ b/modelzoo/PNN/data/README.md
@@ -0,0 +1,4 @@
+make-ipinyou-data
+=================
+
+For details of data download, see [Data Preparation](https://github.com/Atomu2014/make-ipinyou-data)
diff --git a/modelzoo/PNN/data/prepare_data.sh b/modelzoo/PNN/data/prepare_data.sh
new file mode 100644
index 00000000000..49fdb9a0da1
--- /dev/null
+++ b/modelzoo/PNN/data/prepare_data.sh
@@ -0,0 +1,15 @@
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books.json.gz
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Books.json.gz
+gunzip reviews_Books.json.gz
+gunzip meta_Books.json.gz
+
+python script/process_data.py meta_Books.json reviews_Books.json
+python script/local_aggretor.py
+python script/split_by_user.py
+python script/generate_voc.py
+
+python script/item_map.py
+python script/history_behavior_list.py
+python script/generate_neg.py
+
+python script/data2labelencode.py
\ No newline at end of file
diff --git a/modelzoo/PNN/data/script/data2labelencode.py b/modelzoo/PNN/data/script/data2labelencode.py
new file mode 100644
index 00000000000..04daba5e28a
--- /dev/null
+++ b/modelzoo/PNN/data/script/data2labelencode.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+import pickle
+
+UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY']
+HIS_COLUMNS = ['HISTORY_ITEM', 'HISTORY_CATEGORY']
+SEQ_COLUMNS = HIS_COLUMNS
+LABEL_COLUMN = ['CLICKED']
+TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS + SEQ_COLUMNS
+
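+# Label encoding maps each raw id string to a dense integer index ordered by
+# frequency, which is what encoder_dict below implements. A standalone sketch
+# of the idea (values here are illustrative only):
+#   counts = pd.Series(['a', 'b', 'a']).value_counts()
+#   mapping = pd.Series(np.arange(len(counts)), index=counts.index).to_dict()
+#   # -> {'a': 0, 'b': 1}
+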
+def inputs_to_labelencode(filename):
+    def encoder_dict(data, category_col):
+        category_dict = data[category_col].value_counts()
+        category_dict = pd.Series(np.arange(0, len(category_dict)), index=category_dict.index).to_dict()
+        data[category_col + '_encode'] = data[category_col].map(category_dict).astype('int32')
+        return data
+
+    uid_file = '../CAN/data/uid_voc.txt'
+    mid_file = '../CAN/data/mid_voc.txt'
+    cat_file = '../CAN/data/cat_voc.txt'
+
+    uid_data = pd.read_csv(uid_file, encoding="utf-8", header=None, names=['UID'])
+    mid_data = pd.read_csv(mid_file, encoding="utf-8", header=None, names=['ITEM'])
+    cat_data = pd.read_csv(cat_file, encoding="utf-8", header=None, names=['CATEGORY'])
+
+    uid_data = encoder_dict(uid_data, 'UID')
+    mid_data = encoder_dict(mid_data, 'ITEM')
+    cat_data = encoder_dict(cat_data, 'CATEGORY')
+
+    dataset = pd.read_csv(filename, encoding="utf-8",
+                          header=None, names=TRAIN_DATA_COLUMNS, sep="\t", low_memory=False)
+    for key in ['UID', 'ITEM', 'CATEGORY']:
+        if key == 'UID':
+            dataset = pd.merge(dataset, uid_data, on=key, how='inner')
+        elif key == 'ITEM':
+            dataset = pd.merge(dataset, mid_data, on=key, how='inner')
+        else:
+            dataset = pd.merge(dataset, cat_data, on=key, how='inner')
+
+    dataset = dataset.drop(UNSEQ_COLUMNS + SEQ_COLUMNS, axis=1)
+
+    dataset.to_csv(filename + '_to_labelencode.txt', index=False, header=False)
+    uid_data.to_csv('dataset/uid_labelencode.csv', index=False)
+    mid_data.to_csv('dataset/mid_labelencode.csv', index=False)
+    cat_data.to_csv('dataset/cat_labelencode.csv', index=False)
+
+
+if __name__ == '__main__':
+    inputs_to_labelencode('../CAN/data/local_train_splitByUser')
+    inputs_to_labelencode('../CAN/data/local_test_splitByUser')
diff --git a/modelzoo/PNN/data/script/generate_neg.py b/modelzoo/PNN/data/script/generate_neg.py
new file mode 100644
index 00000000000..a10ef919e13
--- /dev/null
+++ b/modelzoo/PNN/data/script/generate_neg.py
@@ -0,0 +1,63 @@
+import random
+
+NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM = 1
+
+
+def createNegData(file):
+    with open(file, 'r') as f_raw:
+        with open(file + '_neg', 'w') as f_out:
+            FirstLine = True
+            for line in f_raw:
+                linelist = line.strip().split('\t')
+                uid = linelist[1]
+
+                if uid not in user_history_behavior:
+                    neg_line = '\t'
+                else:
+                    his_items = linelist[4].split('\x02')
+                    neg_items_str = ''
+                    neg_cates_str = ''
+                    for pos in his_items:
+                        tmp_items_str = ''
+                        tmp_cates_str = ''
+                        tmp_items = []
+                        tmp_cates = []
+                        neg_length = 0
+                        while (True):
+                            index = random.randint(
+                                0,
+                                len(user_history_behavior[uid][0]) - 1)
+                            if user_history_behavior[uid][0][index] != pos:
+                                tmp_items.append(
+                                    user_history_behavior[uid][0][index])
+                                tmp_cates.append(
+                                    user_history_behavior[uid][1][index])
+                                neg_length += 1
+                            if neg_length >= NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM:
+                                break
+                        for item in tmp_items:
+                            tmp_items_str += (item + '\x03')
+                        for cate in tmp_cates:
+                            tmp_cates_str += (cate + '\x03')
+                        neg_items_str += (tmp_items_str[:-1] + '\x02')
+                        neg_cates_str += (tmp_cates_str[:-1] + '\x02')
+                    neg_line = neg_items_str[:-1] + '\t' + neg_cates_str[:-1]
+                if FirstLine:
+                    f_out.write(neg_line)
+                    FirstLine = False
+                else:
+                    f_out.write('\n' + neg_line)
+
+
+user_history_behavior = {}
+with open('user_history_behavior.txt', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        uid = linelist[0]
+        items = linelist[1].split('\x02')
+        cates = linelist[2].split('\x02')
+        user_history_behavior[uid] = [items, cates]
+
+data_file = ['local_test_splitByUser', 'local_train_splitByUser']
+for file in data_file:
+    createNegData(file)
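+# The emitted *_neg files mirror the sample files line by line: for each history
+# position they carry NEG_SEQ_LENGTH_FOR_EACH_HISTORY_ITEM negative items/cates
+# sampled from the same user's history, excluding the positive item at that slot.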
diff --git a/modelzoo/PNN/data/script/generate_voc.py b/modelzoo/PNN/data/script/generate_voc.py
new file mode 100644
index 00000000000..447fe6393b7
--- /dev/null
+++ b/modelzoo/PNN/data/script/generate_voc.py
@@ -0,0 +1,66 @@
+# import cPickle
+import pickle as cPickle
+
+f_train = open("local_train_splitByUser", "r")
+uid_dict = {}
+mid_dict = {}
+cat_dict = {}
+
+iddd = 0
+for line in f_train:
+    arr = line.strip("\n").split("\t")
+    clk = arr[0]
+    uid = arr[1]
+    mid = arr[2]
+    cat = arr[3]
+    mid_list = arr[4]
+    cat_list = arr[5]
+    if uid not in uid_dict:
+        uid_dict[uid] = 0
+    uid_dict[uid] += 1
+    if mid not in mid_dict:
+        mid_dict[mid] = 0
+    mid_dict[mid] += 1
+    if cat not in cat_dict:
+        cat_dict[cat] = 0
+    cat_dict[cat] += 1
+    if len(mid_list) == 0:
+        continue
+    for m in mid_list.split("\x02"):
+        if m not in mid_dict:
+            mid_dict[m] = 0
+        mid_dict[m] += 1
+    #print iddd
+    iddd += 1
+    for c in cat_list.split("\x02"):
+        if c not in cat_dict:
+            cat_dict[c] = 0
+        cat_dict[c] += 1
+
+sorted_uid_dict = sorted(uid_dict.items(), key=lambda x: x[1], reverse=True)
+sorted_mid_dict = sorted(mid_dict.items(), key=lambda x: x[1], reverse=True)
+sorted_cat_dict = sorted(cat_dict.items(), key=lambda x: x[1], reverse=True)
+
+uid_voc = {}
+index = 0
+for key, value in sorted_uid_dict:
+    uid_voc[key] = index
+    index += 1
+
+mid_voc = {}
+mid_voc["default_mid"] = 0
+index = 1
+for key, value in sorted_mid_dict:
+    mid_voc[key] = index
+    index += 1
+
+cat_voc = {}
+cat_voc["default_cat"] = 0
+index = 1
+for key, value in sorted_cat_dict:
+    cat_voc[key] = index
+    index += 1
+
+cPickle.dump(uid_voc, open("uid_voc.pkl", "wb"))
+cPickle.dump(mid_voc, open("mid_voc.pkl", "wb"))
+cPickle.dump(cat_voc, open("cat_voc.pkl", "wb"))
diff --git a/modelzoo/PNN/data/script/history_behavior_list.py b/modelzoo/PNN/data/script/history_behavior_list.py
new file mode 100644
index 00000000000..6adaf398cef
--- /dev/null
+++ b/modelzoo/PNN/data/script/history_behavior_list.py
@@ -0,0 +1,41 @@
+item_to_cate_map = {}
+with open('item2catmap.txt', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        item = linelist[0]
+        cate = linelist[1]
+        item_to_cate_map[item] = cate
+
+user_history_behavior = {}
+with open('reviews-info', 'r') as f:
+    for line in f:
+        linelist = line.strip().split('\t')
+        uid = linelist[0]
+        item = linelist[1]
+        if uid not in user_history_behavior:
+            user_history_behavior[uid] = [item]
+        else:
+            if item not in user_history_behavior[uid]:
+                user_history_behavior[uid].append(item)
+
+FirstLine = True
+with open('user_history_behavior.txt', 'w') as f:
+    for uid, items in user_history_behavior.items():
+        itemstr = ''
+        catestr = ''
+        for i in items:
+            if i in item_to_cate_map:
+                c = item_to_cate_map[i]
+            else:
+                c = 'Unknown'
+            if not itemstr:
+                itemstr += i
+                catestr += c
+            else:
+                itemstr += ('\x02' + i)
+                catestr += ('\x02' + c)
+        if FirstLine:
+            f.write(uid + '\t' + itemstr + '\t' + catestr)
+            FirstLine = False
+        else:
+            f.write('\n' + uid + '\t' + itemstr + '\t' + catestr)
diff --git a/modelzoo/PNN/data/script/item_map.py b/modelzoo/PNN/data/script/item_map.py
new file mode 100644
index 00000000000..94bebee5184
--- /dev/null
+++ b/modelzoo/PNN/data/script/item_map.py
@@ -0,0 +1,29 @@
+import sys
+from tqdm import tqdm
+
+data_file = ['local_test_splitByUser', 'local_train_splitByUser']
+
+item_to_cate_map = {}
+# 367983
+for file_name in data_file:
+    with open(file_name, 'r') as f:
+        for line in f:
+            linelist = line.strip().split('\t')
+            items = linelist[4].split('\x02')
+            cates = linelist[5].split('\x02')
+            items.append(linelist[2])
+            cates.append(linelist[3])
+            # print(items)
+            # print(cates)
+            for index, item in enumerate(items):
+                if item not in item_to_cate_map:
+                    item_to_cate_map[item] = cates[index]
+
+with open('item2catmap.txt', 'w') as f:
+    firstline = True
+    for item, cate in item_to_cate_map.items():
+        if firstline:
+            f.write(item + '\t' + cate)
+            firstline = False
+        else:
+            f.write('\n' + item + '\t' + cate)
diff --git a/modelzoo/PNN/data/script/local_aggretor.py b/modelzoo/PNN/data/script/local_aggretor.py
new file mode 100644
index 00000000000..1fd8aceb32c
--- /dev/null
+++ b/modelzoo/PNN/data/script/local_aggretor.py
@@ -0,0 +1,47 @@
+import sys
+import hashlib
+import random
+
+fin = open("jointed-new-split-info", "r")
+ftrain = open("local_train", "w")
+ftest = open("local_test", "w")
+
+last_user = "0"
+common_fea = ""
+line_idx = 0
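+# Each input line is one (ds, clk, user, movie_id, cat1) event, grouped by user.
+# For every event after a user's first, the loop below emits a sample whose
+# history is the user's previously clicked items, so histories grow
+# incrementally; clicked events (clk == 1) are appended to the running history.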
+for line in fin:
+    items = line.strip().split("\t")
+    ds = items[0]
+    clk = int(items[1])
+    user = items[2]
+    movie_id = items[3]
+    dt = items[5]
+    cat1 = items[6]
+
+    if ds == "20180118":
+        fo = ftrain
+    else:
+        fo = ftest
+    if user != last_user:
+        movie_id_list = []
+        cate1_list = []
+        #print >> fo, items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 + "\t" + "" + "\t" + ""
+    else:
+        history_clk_num = len(movie_id_list)
+        cat_str = ""
+        mid_str = ""
+        for c1 in cate1_list:
+            cat_str += c1 + "\x02"
+        for mid in movie_id_list:
+            mid_str += mid + "\x02"
+        if len(cat_str) > 0: cat_str = cat_str[:-1]
+        if len(mid_str) > 0: mid_str = mid_str[:-1]
+        if history_clk_num >= 1:  # 8 is the average length of user behavior
+            print(items[1] + "\t" + user + "\t" + movie_id + "\t" + cat1 +
+                  "\t" + mid_str + "\t" + cat_str,
+                  file=fo)
+    last_user = user
+    if clk:
+        movie_id_list.append(movie_id)
+        cate1_list.append(cat1)
+    line_idx += 1
diff --git a/modelzoo/PNN/data/script/pick2txt.py b/modelzoo/PNN/data/script/pick2txt.py
new file mode 100644
index 00000000000..b7c129ffbe0
--- /dev/null
+++ b/modelzoo/PNN/data/script/pick2txt.py
@@ -0,0 +1,14 @@
+import pickle
+
+def pkl2txt(filename):
+    pklfile = pickle.load(open(filename + '.pkl', 'rb'))
+    with open(filename + '.txt', 'w') as f:
+        f.write('\n'.join(pklfile))
+
+
+if __name__ == '__main__':
+    pkl2txt('uid_voc')
+    pkl2txt('mid_voc')
+    pkl2txt('cat_voc')
\ No newline at end of file
diff --git a/modelzoo/PNN/data/script/process_data.py b/modelzoo/PNN/data/script/process_data.py
new file mode 100644
index 00000000000..0bff64f30bd
--- /dev/null
+++ b/modelzoo/PNN/data/script/process_data.py
@@ -0,0 +1,108 @@
+import sys
+import random
+import time
+
+
+def process_meta(file):
+    fi = open(file, "r")
+    fo = open("item-info", "w")
+    for line in fi:
+        obj = eval(line)
+        cat = obj["categories"][0][-1]
+        print(obj["asin"] + "\t" + cat, file=fo)
+
+
+def process_reviews(file):
+    fi = open(file, "r")
+    user_map = {}
+    fo = open("reviews-info", "w")
+    for line in fi:
+        obj = eval(line)
+        userID = obj["reviewerID"]
+        itemID = obj["asin"]
+        rating = obj["overall"]
+        time = obj["unixReviewTime"]
+        print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(time),
+              file=fo)
+
+
+def manual_join():
+    f_rev = open("reviews-info", "r")
+    user_map = {}
+    item_list = []
+    for line in f_rev:
+        line = line.strip()
+        items = line.split("\t")
+        #loctime = time.localtime(float(items[-1]))
+        #items[-1] = time.strftime('%Y-%m-%d', loctime)
+        if items[0] not in user_map:
+            user_map[items[0]] = []
+        user_map[items[0]].append(("\t".join(items), float(items[-1])))
+        item_list.append(items[1])
+    f_meta = open("item-info", "r")
+    meta_map = {}
+    for line in f_meta:
+        arr = line.strip().split("\t")
+        if arr[0] not in meta_map:
+            meta_map[arr[0]] = arr[1]
+            arr = line.strip().split("\t")
+    fo = open("jointed-new", "w")
+    for key in user_map:
+        sorted_user_bh = sorted(user_map[key], key=lambda x: x[1])
+        for line, t in sorted_user_bh:
+            items = line.split("\t")
+            asin = items[1]
+            j = 0
+            while True:
+                asin_neg_index = random.randint(0, len(item_list) - 1)
+                asin_neg = item_list[asin_neg_index]
+                if asin_neg == asin:
+                    continue
+                items[1] = asin_neg
+                print("0" + "\t" + "\t".join(items) + "\t" +
+                      meta_map[asin_neg],
+                      file=fo)
+                j += 1
+                if j == 1:  # negative sampling frequency
+                    break
+            if asin in meta_map:
+                print("1" + "\t" + line + "\t" + meta_map[asin], file=fo)
+            else:
+                print("1" + "\t" + line + "\t" + "default_cat", file=fo)
+
+
+def split_test():
+    fi = open("jointed-new", "r")
+    fo = 
open("jointed-new-split-info", "w") + user_count = {} + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user not in user_count: + user_count[user] = 0 + user_count[user] += 1 + fi.seek(0) + i = 0 + last_user = "A26ZDKC53OP6JD" + for line in fi: + line = line.strip() + user = line.split("\t")[1] + if user == last_user: + if i < user_count[user] - 2: # 1 + negative samples + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + else: + last_user = user + i = 0 + if i < user_count[user] - 2: + print("20180118" + "\t" + line, file=fo) + else: + print("20190119" + "\t" + line, file=fo) + i += 1 + + +process_meta(sys.argv[1]) +process_reviews(sys.argv[2]) +manual_join() +split_test() diff --git a/modelzoo/PNN/data/script/split_by_user.py b/modelzoo/PNN/data/script/split_by_user.py new file mode 100644 index 00000000000..cc7988c6601 --- /dev/null +++ b/modelzoo/PNN/data/script/split_by_user.py @@ -0,0 +1,18 @@ +import random + +fi = open("local_test", "r") +ftrain = open("local_train_splitByUser", "w") +ftest = open("local_test_splitByUser", "w") + +while True: + rand_int = random.randint(1, 10) + noclk_line = fi.readline().strip() + clk_line = fi.readline().strip() + if noclk_line == "" or clk_line == "": + break + if rand_int == 2: + print(noclk_line, file=ftest) + print(clk_line, file=ftest) + else: + print(noclk_line, file=ftrain) + print(clk_line, file=ftrain) diff --git a/modelzoo/PNN/result/README.md b/modelzoo/PNN/result/README.md new file mode 100644 index 00000000000..ccec44eb9a5 --- /dev/null +++ b/modelzoo/PNN/result/README.md @@ -0,0 +1,2 @@ +# Result +Checkpoint & timeline file are default saved in this folder. diff --git a/modelzoo/PNN/script/__init__.py b/modelzoo/PNN/script/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modelzoo/PNN/script/contrib/__init__.py b/modelzoo/PNN/script/contrib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modelzoo/PNN/script/contrib/rnn.py b/modelzoo/PNN/script/contrib/rnn.py new file mode 100644 index 00000000000..b3554993063 --- /dev/null +++ b/modelzoo/PNN/script/contrib/rnn.py @@ -0,0 +1,1153 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +# + +# Licensed under the Apache License, Version 2.0 (the "License"); + +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at + +# + +# http://www.apache.org/licenses/LICENSE-2.0 + +# + +# Unless required by applicable law or agreed to in writing, software + +# distributed under the License is distributed on an "AS IS" BASIS, + +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. 
+@@bidirectional_dynamic_rnn +@@dynamic_rnn +@@raw_rnn +@@static_rnn +@@static_state_saving_rnn +@@static_bidirectional_rnn +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.util import nest +import tensorflow as tf + + +def _like_rnncell_(cell): + """Checks that a given object is an RNNCell by using duck typing.""" + + conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"), + + hasattr(cell, "zero_state"), callable(cell)] + + return all(conditions) + + +# pylint: disable=protected-access + +_concat = rnn_cell_impl._concat +try: + _like_rnncell = rnn_cell_impl._like_rnncell +except Exception as e: + _like_rnncell = _like_rnncell_ + + +# pylint: enable=protected-access + + +def _transpose_batch_time(x): + """Transpose the batch and time dimensions of a Tensor. + Retains as much of the static shape information as possible. + Args: + x: A tensor of rank 2 or higher. + Returns: + x transposed along the first two dimensions. + Raises: + ValueError: if `x` is rank 1 or lower. + """ + + x_static_shape = x.get_shape() + + if x_static_shape.ndims is not None and x_static_shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2, but saw shape: %s" % + + (x, x_static_shape)) + + x_rank = array_ops.rank(x) + + x_t = array_ops.transpose( + + x, array_ops.concat( + + ([1, 0], math_ops.range(2, x_rank)), axis=0)) + + x_t.set_shape( + + tensor_shape.TensorShape([ + + x_static_shape[1].value, x_static_shape[0].value + + ]).concatenate(x_static_shape[2:])) + + return x_t + + +def _best_effort_input_batch_size(flat_input): + """Get static input batch size if available, with fallback to the dynamic one. + Args: + flat_input: An iterable of time major input Tensors of shape [max_time, + batch_size, ...]. All inputs should have compatible batch sizes. + Returns: + The batch size in Python integer if available, or a scalar Tensor otherwise. + Raises: + ValueError: if there is any input with an invalid shape. + """ + + for input_ in flat_input: + + shape = input_.shape + + if shape.ndims is None: + continue + + if shape.ndims < 2: + raise ValueError( + + "Expected input tensor %s to have rank at least 2" % input_) + + batch_size = shape[1].value + + if batch_size is not None: + return batch_size + + # Fallback to the dynamic batch size of the first input. + + return array_ops.shape(flat_input[0])[1] + + +def _infer_state_dtype(explicit_dtype, state): + """Infer the dtype of an RNN state. + Args: + explicit_dtype: explicitly declared dtype or None. + state: RNN's hidden state. Must be a Tensor or a nested iterable containing + Tensors. + Returns: + dtype: inferred dtype of hidden state. + Raises: + ValueError: if `state` has heterogeneous dtypes or is empty. 
+ """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + Returns an (output, state) pair conditioned on the sequence_lengths. + When skip_conditionals=False, the pseudocode is something like: + if t >= max_sequence_length: + return (zero_output, state) + if t < min_sequence_length: + return call_cell() + # Selectively output zeros or output, old state or new state depending + # on if we've finished calculating each row. + new_output, new_state = call_cell() + final_output = np.vstack([ + zero_output if time >= sequence_lengths[r] else new_output_r + for r, new_output_r in enumerate(new_output) + ]) + final_state = np.vstack([ + state[r] if time >= sequence_lengths[r] else new_state_r + for r, new_state_r in enumerate(new_state) + ]) + return (final_output, final_state) + Args: + time: Python int, the current time step + sequence_length: int32 `Tensor` vector of size [batch_size] + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + zero_output: `Tensor` vector of shape [output_size] + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + or a list/tuple of such tensors. + call_cell: lambda returning tuple of (new_output, new_state) where + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + state_size: The `cell.state_size` associated with the state. + skip_conditionals: Python bool, whether to skip using the conditional + calculations. This is useful for `dynamic_rnn`, where the input tensor + matches `max_sequence_length`, and using conditionals just slows + everything down. + Returns: + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + final_output is a `Tensor` matrix of shape [batch_size, output_size] + final_state is either a single `Tensor` matrix, or a tuple of such + matrices (matching length and shapes of input `state`). + Raises: + ValueError: If the cell returns a state tuple whose length does not match + that returned by `state_size`. + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. + + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. 
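+        # For example, with sequence_length = [2, 5] and time = 3, batch row 0 is
+        # already past its length, so it keeps the zero output / old state, while
+        # batch row 1 still receives the freshly computed output and state.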
+ + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + Args: + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + or nested tuples of tensors. + lengths: A `Tensor` of dimension batch_size, containing lengths for each + sequence in the batch. If "None" is specified, simply reverses + the list. 
+ Returns: + time-reversed sequence + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. +# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. 
+# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. +# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
+# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + Performs fully dynamic unrolling of `inputs`. + Example: + ```python + # create a BasicRNNCell + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + # defining initial state + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + # 'state' is a tensor of shape [batch_size, cell_state_size] + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + initial_state=initial_state, + dtype=tf.float32) + ``` + ```python + # create 2 LSTMCells + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + # create a RNN cell composed sequentially of a number of RNNCells + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + # 'state' is a N-tuple where N is the number of LSTMCells containing a + # tf.contrib.rnn.LSTMStateTuple for each cell + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + inputs=data, + dtype=tf.float32) + ``` + Args: + cell: An instance of RNNCell. + inputs: The RNN inputs. + If `time_major == False` (default), this must be a `Tensor` of shape: + `[batch_size, max_time, ...]`, or a nested tuple of such + elements. + If `time_major == True`, this must be a `Tensor` of shape: + `[max_time, batch_size, ...]`, or a nested tuple of such + elements. + This may also be a (possibly nested) tuple of Tensors satisfying + this property. 
The first two dimensions must match across all the inputs, + but otherwise the ranks and other shape components may differ. + In this case, input to `cell` at each time-step will replicate the + structure of these tuples, except for the time dimension (from which the + time is taken). + The input to `cell` at each time step will be a `Tensor` or (possibly + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + Used to copy-through state and zero-out outputs when past a batch + element's sequence length. So it's more for correctness than performance. + initial_state: (optional) An initial state for the RNN. + If `cell.state_size` is an integer, this must be + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + If `cell.state_size` is a tuple, this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + dtype: (optional) The data type for the initial state and expected output. + Required if initial_state is not provided or RNN state has a heterogeneous + dtype. + parallel_iterations: (Default: 32). The number of iterations to run in + parallel. Those operations which do not have any temporal dependency + and can be run in parallel, will be. This parameter trades off + time for space. Values >> 1 use more memory but take less time, + while smaller values use less memory but computations take longer. + swap_memory: Transparently swap the tensors produced in forward inference + but needed for back prop from GPU to CPU. This allows training RNNs + which would typically not fit on a single GPU, with very minimal (or no) + performance penalty. + time_major: The shape format of the `inputs` and `outputs` Tensors. + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + Using `time_major = True` is a bit more efficient because it avoids + transposes at the beginning and end of the RNN calculation. However, + most TensorFlow data is batch-major, so by default this function + accepts input and emits output in batch-major form. + scope: VariableScope for the created subgraph; defaults to "rnn". + Returns: + A pair (outputs, state) where: + outputs: The RNN output `Tensor`. + If time_major == False (default), this will be a `Tensor` shaped: + `[batch_size, max_time, cell.output_size]`. + If time_major == True, this will be a `Tensor` shaped: + `[max_time, batch_size, cell.output_size]`. + Note, if `cell.output_size` is a (possibly nested) tuple of integers + or `TensorShape` objects, then `outputs` will be a tuple having the + same structure as `cell.output_size`, containing Tensors having shapes + corresponding to the shape data in `cell.output_size`. + state: The final state. If `cell.state_size` is an int, this + will be shaped `[batch_size, cell.state_size]`. If it is a + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + be a tuple having the corresponding shapes. If cells are `LSTMCells` + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + Raises: + TypeError: If `cell` is not an instance of RNNCell. + ValueError: If inputs is None or an empty list. 
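+
+  Example (attention-aware variant):
+    `att_scores` is this repo's addition to the stock TensorFlow signature; a
+    minimal sketch of driving it, where `seq_emb`, `seq_len` and `att_scores`
+    are illustrative tensors and the cell is assumed to be one that accepts a
+    third `att_score` argument (e.g. the `VecAttGRUCell` shipped with this
+    model zoo):
+    ```python
+    cell = VecAttGRUCell(hidden_size)
+    # att_scores: [batch_size, max_time, 1] attention weights
+    outputs, state = dynamic_rnn(cell, inputs=seq_emb,
+                                 att_scores=att_scores,
+                                 sequence_length=seq_len,
+                                 dtype=tf.float32)
+    ```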
+ """ + + if not _like_rnncell(cell): + raise TypeError("cell must be an instance of RNNCell") + + # By default, time_major==False and inputs are batch-major: shaped + + # [batch, time, depth] + + # For internal calculations, we transpose to [time, batch, depth] + + flat_input = nest.flatten(inputs) + + if not time_major: + # (B,T,D) => (T,B,D) + + flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input] + + flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input) + + parallel_iterations = parallel_iterations or 32 + + if sequence_length is not None: + + sequence_length = math_ops.to_int32(sequence_length) + + if sequence_length.get_shape().ndims not in (None, 1): + raise ValueError( + + "sequence_length must be a vector of length batch_size, " + + "but saw shape: %s" % sequence_length.get_shape()) + + sequence_length = array_ops.identity( # Just to find it in the graph. + + sequence_length, name="sequence_length") + + # Create a new scope in which the caching device is either + + # determined by the parent scope, or is set to place the cached + + # Variable using the same placement as for the rest of the RNN. + + with vs.variable_scope(scope or "rnn",reuse=tf.AUTO_REUSE) as varscope:#TODO:user defined reuse + + if varscope.caching_device is None: + varscope.set_caching_device(lambda op: op.device) + + batch_size = _best_effort_input_batch_size(flat_input) + + if initial_state is not None: + + state = initial_state + + else: + + if not dtype: + raise ValueError("If there is no initial_state, you must give a dtype.") + + state = cell.zero_state(batch_size, dtype) + + def _assert_has_shape(x, shape): + + x_shape = array_ops.shape(x) + + packed_shape = array_ops.stack(shape) + + return control_flow_ops.Assert( + + math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), + + ["Expected shape for Tensor %s is " % x.name, + + packed_shape, " but saw shape: ", x_shape]) + + if sequence_length is not None: + # Perform some shape validation + + with ops.control_dependencies( + + [_assert_has_shape(sequence_length, [batch_size])]): + sequence_length = array_ops.identity( + + sequence_length, name="CheckSeqLen") + + inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input) + + (outputs, final_state) = _dynamic_rnn_loop( + + cell, + + inputs, + + state, + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory, + + att_scores=att_scores, + + sequence_length=sequence_length, + + dtype=dtype) + + # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth]. + + # If we are performing batch-major calculations, transpose output back + + # to shape [batch, time, depth] + + if not time_major: + # (T,B,D) => (B,T,D) + + outputs = nest.map_structure(_transpose_batch_time, outputs) + + return (outputs, final_state) + + +def _dynamic_rnn_loop(cell, + + inputs, + + initial_state, + + parallel_iterations, + + swap_memory, + + att_scores=None, + + sequence_length=None, + + dtype=None): + """Internal implementation of Dynamic RNN. + Args: + cell: An instance of RNNCell. + inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested + tuple of such elements. + initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if + `cell.state_size` is a tuple, then this should be a tuple of + tensors having shapes `[batch_size, s] for s in cell.state_size`. + parallel_iterations: Positive Python int. + swap_memory: A Python boolean + sequence_length: (optional) An `int32` `Tensor` of shape [batch_size]. 
+    dtype: (optional) Expected dtype of output. If not specified, inferred from
+      initial_state.
+  Returns:
+    Tuple `(final_outputs, final_state)`.
+    final_outputs:
+      A `Tensor` of shape `[time, batch_size, cell.output_size]`. If
+      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
+      objects, then this returns a (possibly nested) tuple of Tensors matching
+      the corresponding shapes.
+    final_state:
+      A `Tensor`, or possibly nested tuple of Tensors, matching in length
+      and shapes to `initial_state`.
+  Raises:
+    ValueError: If the input depth cannot be inferred via shape inference
+      from the inputs.
+  """
+
+  state = initial_state
+
+  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"
+
+  state_size = cell.state_size
+
+  flat_input = nest.flatten(inputs)
+
+  flat_output_size = nest.flatten(cell.output_size)
+
+  # Construct an initial output
+
+  input_shape = array_ops.shape(flat_input[0])
+
+  time_steps = input_shape[0]
+
+  batch_size = _best_effort_input_batch_size(flat_input)
+
+  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
+
+                           for input_ in flat_input)
+
+  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
+
+  for shape in inputs_got_shape:
+
+    if not shape[2:].is_fully_defined():
+      raise ValueError(
+
+          "Input size (depth of inputs) must be accessible via shape inference,"
+
+          " but saw value None.")
+
+    got_time_steps = shape[0].value
+
+    got_batch_size = shape[1].value
+
+    if const_time_steps != got_time_steps:
+      raise ValueError(
+
+          "Time steps is not the same for all the elements in the input in a "
+
+          "batch.")
+
+    if const_batch_size != got_batch_size:
+      raise ValueError(
+
+          "Batch_size is not the same for all the elements in the input.")
+
+  # Prepare dynamic conditional copying of state & output
+
+  def _create_zero_arrays(size):
+
+    size = _concat(batch_size, size)
+
+    return array_ops.zeros(
+
+        array_ops.stack(size), _infer_state_dtype(dtype, state))
+
+  flat_zero_output = tuple(_create_zero_arrays(output)
+
+                           for output in flat_output_size)
+
+  zero_output = nest.pack_sequence_as(structure=cell.output_size,
+
+                                      flat_sequence=flat_zero_output)
+
+  if sequence_length is not None:
+    min_sequence_length = math_ops.reduce_min(sequence_length)
+
+    max_sequence_length = math_ops.reduce_max(sequence_length)
+
+  time = array_ops.constant(0, dtype=dtypes.int32, name="time")
+
+  with ops.name_scope("dynamic_rnn") as scope:
+
+    base_name = scope
+
+  def _create_ta(name, dtype):
+
+    return tensor_array_ops.TensorArray(dtype=dtype,
+
+                                        size=time_steps,
+
+                                        tensor_array_name=base_name + name)
+
+  output_ta = tuple(_create_ta("output_%d" % i,
+
+                               _infer_state_dtype(dtype, state))
+
+                    for i in range(len(flat_output_size)))
+
+  input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype)
+
+                   for i in range(len(flat_input)))
+
+  input_ta = tuple(ta.unstack(input_)
+
+                   for ta, input_ in zip(input_ta, flat_input))
+
+  def _time_step(time, output_ta_t, state, att_scores=None):
+
+    """Take a time step of the dynamic RNN.
+    Args:
+      time: int32 scalar Tensor.
+      output_ta_t: List of `TensorArray`s that represent the output.
+      state: nested tuple of vector tensors that represent the state.
+    Returns:
+      The tuple (time + 1, output_ta_t with updated flow, new_state).
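+    When `att_scores` is not None, the per-step slice `att_scores[:, time, :]`
+    is forwarded to the cell as its third argument (`att_score`); otherwise
+    the cell is called with `(input_t, state)` only.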
+ """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) \ No newline at end of file diff --git a/modelzoo/PNN/script/contrib/rnn_v2.py b/modelzoo/PNN/script/contrib/rnn_v2.py new file mode 100644 index 00000000000..a2bd625cd8b --- /dev/null +++ b/modelzoo/PNN/script/contrib/rnn_v2.py @@ -0,0 +1,1452 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +# + +# Licensed under the Apache License, Version 2.0 (the "License"); + +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at + +# + +# http://www.apache.org/licenses/LICENSE-2.0 + +# + +# Unless required by applicable law or agreed to in writing, software + +# distributed under the License is distributed on an "AS IS" BASIS, + +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# See the License for the specific language governing permissions and + +# limitations under the License. + +# ============================================================================== + + +"""RNN helpers for TensorFlow models. 
+
+
+
+@@bidirectional_dynamic_rnn
+
+@@dynamic_rnn
+
+@@raw_rnn
+
+@@static_rnn
+
+@@static_state_saving_rnn
+
+@@static_bidirectional_rnn
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.util import nest
+import tensorflow as tf
+
+
+def _like_rnncell_(cell):
+  """Checks that a given object is an RNNCell by using duck typing."""
+
+  conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"),
+
+                hasattr(cell, "zero_state"), callable(cell)]
+
+  return all(conditions)
+
+
+# pylint: disable=protected-access
+
+_concat = rnn_cell_impl._concat
+try:
+  _like_rnncell = rnn_cell_impl._like_rnncell
+except AttributeError:
+  _like_rnncell = _like_rnncell_
+
+
+# pylint: enable=protected-access
+
+
+def _transpose_batch_time(x):
+  """Transpose the batch and time dimensions of a Tensor.
+
+
+
+  Retains as much of the static shape information as possible.
+
+
+
+  Args:
+
+    x: A tensor of rank 2 or higher.
+
+
+
+  Returns:
+
+    x transposed along the first two dimensions.
+
+
+
+  Raises:
+
+    ValueError: if `x` is rank 1 or lower.
+
+  """
+
+  x_static_shape = x.get_shape()
+
+  if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
+    raise ValueError(
+
+        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
+
+        (x, x_static_shape))
+
+  x_rank = array_ops.rank(x)
+
+  x_t = array_ops.transpose(
+
+      x, array_ops.concat(
+
+          ([1, 0], math_ops.range(2, x_rank)), axis=0))
+
+  x_t.set_shape(
+
+      tensor_shape.TensorShape([
+
+          x_static_shape[1], x_static_shape[0]
+
+      ]).concatenate(x_static_shape[2:]))
+
+  return x_t
+
+
+def _best_effort_input_batch_size(flat_input):
+  """Get static input batch size if available, with fallback to the dynamic one.
+
+
+
+  Args:
+
+    flat_input: An iterable of time major input Tensors of shape [max_time,
+
+      batch_size, ...]. All inputs should have compatible batch sizes.
+
+
+
+  Returns:
+
+    The batch size in Python integer if available, or a scalar Tensor otherwise.
+
+
+
+  Raises:
+
+    ValueError: if there is any input with an invalid shape.
+
+  """
+
+  for input_ in flat_input:
+
+    shape = input_.shape
+
+    if shape.ndims is None:
+      continue
+
+    if shape.ndims < 2:
+      raise ValueError(
+
+          "Expected input tensor %s to have rank at least 2" % input_)
+
+    batch_size = shape[1]
+
+    if batch_size is not None:
+      return batch_size
+
+  # Fallback to the dynamic batch size of the first input.
+
+  return array_ops.shape(flat_input[0])[1]
+
+
+def _infer_state_dtype(explicit_dtype, state):
+  """Infer the dtype of an RNN state.
+
+
+
+  Args:
+
+    explicit_dtype: explicitly declared dtype or None.
+
+    state: RNN's hidden state. Must be a Tensor or a nested iterable containing
+
+      Tensors.
+
+
+
+  Returns:
+
+    dtype: inferred dtype of hidden state.
+
+
+
+  Raises:
+
+    ValueError: if `state` has heterogeneous dtypes or is empty.
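+
+
+
+  For example, `_infer_state_dtype(None, (h, c))` returns the dtype shared by
+
+  `h` and `c`, and raises a `ValueError` if their dtypes differ.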
+ + """ + + if explicit_dtype is not None: + + return explicit_dtype + + elif nest.is_sequence(state): + + inferred_dtypes = [element.dtype for element in nest.flatten(state)] + + if not inferred_dtypes: + raise ValueError("Unable to infer dtype from empty state.") + + all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes]) + + if not all_same: + raise ValueError( + + "State has tensors of different inferred_dtypes. Unable to infer a " + + "single representative dtype.") + + return inferred_dtypes[0] + + else: + + return state.dtype + + +# pylint: disable=unused-argument + +def _rnn_step( + + time, sequence_length, min_sequence_length, max_sequence_length, + + zero_output, state, call_cell, state_size, skip_conditionals=False): + """Calculate one step of a dynamic RNN minibatch. + + + + Returns an (output, state) pair conditioned on the sequence_lengths. + + When skip_conditionals=False, the pseudocode is something like: + + + + if t >= max_sequence_length: + + return (zero_output, state) + + if t < min_sequence_length: + + return call_cell() + + + + # Selectively output zeros or output, old state or new state depending + + # on if we've finished calculating each row. + + new_output, new_state = call_cell() + + final_output = np.vstack([ + + zero_output if time >= sequence_lengths[r] else new_output_r + + for r, new_output_r in enumerate(new_output) + + ]) + + final_state = np.vstack([ + + state[r] if time >= sequence_lengths[r] else new_state_r + + for r, new_state_r in enumerate(new_state) + + ]) + + return (final_output, final_state) + + + + Args: + + time: Python int, the current time step + + sequence_length: int32 `Tensor` vector of size [batch_size] + + min_sequence_length: int32 `Tensor` scalar, min of sequence_length + + max_sequence_length: int32 `Tensor` scalar, max of sequence_length + + zero_output: `Tensor` vector of shape [output_size] + + state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, + + or a list/tuple of such tensors. + + call_cell: lambda returning tuple of (new_output, new_state) where + + new_output is a `Tensor` matrix of shape `[batch_size, output_size]`. + + new_state is a `Tensor` matrix of shape `[batch_size, state_size]`. + + state_size: The `cell.state_size` associated with the state. + + skip_conditionals: Python bool, whether to skip using the conditional + + calculations. This is useful for `dynamic_rnn`, where the input tensor + + matches `max_sequence_length`, and using conditionals just slows + + everything down. + + + + Returns: + + A tuple of (`final_output`, `final_state`) as given by the pseudocode above: + + final_output is a `Tensor` matrix of shape [batch_size, output_size] + + final_state is either a single `Tensor` matrix, or a tuple of such + + matrices (matching length and shapes of input `state`). + + + + Raises: + + ValueError: If the cell returns a state tuple whose length does not match + + that returned by `state_size`. + + """ + + # Convert state to a list for ease of use + + flat_state = nest.flatten(state) + + flat_zero_output = nest.flatten(zero_output) + + def _copy_one_through(output, new_output): + + # If the state contains a scalar value we simply pass it through. 
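+
+    # Otherwise, select per batch row below: once `time >= sequence_length[r]`,
+    # row r keeps its previous value (state copy-through / zero output), while
+    # unfinished rows take the freshly computed value.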
+ + if output.shape.ndims == 0: + return new_output + + copy_cond = (time >= sequence_length) + + with ops.colocate_with(new_output): + return array_ops.where(copy_cond, output, new_output) + + def _copy_some_through(flat_new_output, flat_new_state): + + # Use broadcasting select to determine which values should get + + # the previous state & zero output, and which values should get + + # a calculated state & output. + + flat_new_output = [ + + _copy_one_through(zero_output, new_output) + + for zero_output, new_output in zip(flat_zero_output, flat_new_output)] + + flat_new_state = [ + + _copy_one_through(state, new_state) + + for state, new_state in zip(flat_state, flat_new_state)] + + return flat_new_output + flat_new_state + + def _maybe_copy_some_through(): + + """Run RNN step. Pass through either no or some past state.""" + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + flat_new_state = nest.flatten(new_state) + + flat_new_output = nest.flatten(new_output) + + return control_flow_ops.cond( + + # if t < min_seq_len: calculate and return everything + + time < min_sequence_length, lambda: flat_new_output + flat_new_state, + + # else copy some of it through + + lambda: _copy_some_through(flat_new_output, flat_new_state)) + + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, + + # but benefits from removing cond() and its gradient. We should + + # profile with and without this switch here. + + if skip_conditionals: + + # Instead of using conditionals, perform the selective copy at all time + + # steps. This is faster when max_seq_len is equal to the number of unrolls + + # (which is typical for dynamic_rnn). + + new_output, new_state = call_cell() + + nest.assert_same_structure(state, new_state) + + new_state = nest.flatten(new_state) + + new_output = nest.flatten(new_output) + + final_output_and_state = _copy_some_through(new_output, new_state) + + else: + + empty_update = lambda: flat_zero_output + flat_state + + final_output_and_state = control_flow_ops.cond( + + # if t >= max_seq_len: copy all state through, output zeros + + time >= max_sequence_length, empty_update, + + # otherwise calculation is required: copy some or all of it through + + _maybe_copy_some_through) + + if len(final_output_and_state) != len(flat_zero_output) + len(flat_state): + raise ValueError("Internal error: state and output were not concatenated " + + "correctly.") + + final_output = final_output_and_state[:len(flat_zero_output)] + + final_state = final_output_and_state[len(flat_zero_output):] + + for output, flat_output in zip(final_output, flat_zero_output): + output.set_shape(flat_output.get_shape()) + + for substate, flat_substate in zip(final_state, flat_state): + substate.set_shape(flat_substate.get_shape()) + + final_output = nest.pack_sequence_as( + + structure=zero_output, flat_sequence=final_output) + + final_state = nest.pack_sequence_as( + + structure=state, flat_sequence=final_state) + + return final_output, final_state + + +def _reverse_seq(input_seq, lengths): + """Reverse a list of Tensors up to specified lengths. + + + + Args: + + input_seq: Sequence of seq_len tensors of dimension (batch_size, n_features) + + or nested tuples of tensors. + + lengths: A `Tensor` of dimension batch_size, containing lengths for each + + sequence in the batch. If "None" is specified, simply reverses + + the list. 
+ + + + Returns: + + time-reversed sequence + + """ + + if lengths is None: + return list(reversed(input_seq)) + + flat_input_seq = tuple(nest.flatten(input_) for input_ in input_seq) + + flat_results = [[] for _ in range(len(input_seq))] + + for sequence in zip(*flat_input_seq): + + input_shape = tensor_shape.unknown_shape( + + ndims=sequence[0].get_shape().ndims) + + for input_ in sequence: + input_shape.merge_with(input_.get_shape()) + + input_.set_shape(input_shape) + + # Join into (time, batch_size, depth) + + s_joined = array_ops.stack(sequence) + + # Reverse along dimension 0 + + s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) + + # Split again into list + + result = array_ops.unstack(s_reversed) + + for r, flat_result in zip(result, flat_results): + r.set_shape(input_shape) + + flat_result.append(r) + + results = [nest.pack_sequence_as(structure=input_, flat_sequence=flat_result) + + for input_, flat_result in zip(input_seq, flat_results)] + + return results + + +# +# def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, +# +# initial_state_fw=None, initial_state_bw=None, +# +# dtype=None, parallel_iterations=None, +# +# swap_memory=False, time_major=False, scope=None): +# +# """Creates a dynamic version of bidirectional recurrent neural network. +# +# +# +# Takes input and builds independent forward and backward RNNs. The input_size +# +# of forward and backward cell must match. The initial state for both directions +# +# is zero by default (but can be set optionally) and no intermediate states are +# +# ever returned -- the network is fully unrolled for the given (passed in) +# +# length(s) of the sequence(s) or completely unrolled if length(s) is not +# +# given. +# +# +# +# Args: +# +# cell_fw: An instance of RNNCell, to be used for forward direction. +# +# cell_bw: An instance of RNNCell, to be used for backward direction. +# +# inputs: The RNN inputs. +# +# If time_major == False (default), this must be a tensor of shape: +# +# `[batch_size, max_time, ...]`, or a nested tuple of such elements. +# +# If time_major == True, this must be a tensor of shape: +# +# `[max_time, batch_size, ...]`, or a nested tuple of such elements. +# +# sequence_length: (optional) An int32/int64 vector, size `[batch_size]`, +# +# containing the actual lengths for each of the sequences in the batch. +# +# If not provided, all batch entries are assumed to be full sequences; and +# +# time reversal is applied from time `0` to `max_time` for each sequence. +# +# initial_state_fw: (optional) An initial state for the forward RNN. +# +# This must be a tensor of appropriate type and shape +# +# `[batch_size, cell_fw.state_size]`. +# +# If `cell_fw.state_size` is a tuple, this should be a tuple of +# +# tensors having shapes `[batch_size, s] for s in cell_fw.state_size`. +# +# initial_state_bw: (optional) Same as for `initial_state_fw`, but using +# +# the corresponding properties of `cell_bw`. +# +# dtype: (optional) The data type for the initial states and expected output. +# +# Required if initial_states are not provided or RNN states have a +# +# heterogeneous dtype. +# +# parallel_iterations: (Default: 32). The number of iterations to run in +# +# parallel. Those operations which do not have any temporal dependency +# +# and can be run in parallel, will be. This parameter trades off +# +# time for space. Values >> 1 use more memory but take less time, +# +# while smaller values use less memory but computations take longer. 
+# +# swap_memory: Transparently swap the tensors produced in forward inference +# +# but needed for back prop from GPU to CPU. This allows training RNNs +# +# which would typically not fit on a single GPU, with very minimal (or no) +# +# performance penalty. +# +# time_major: The shape format of the `inputs` and `outputs` Tensors. +# +# If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. +# +# If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. +# +# Using `time_major = True` is a bit more efficient because it avoids +# +# transposes at the beginning and end of the RNN calculation. However, +# +# most TensorFlow data is batch-major, so by default this function +# +# accepts input and emits output in batch-major form. +# +# scope: VariableScope for the created subgraph; defaults to +# +# "bidirectional_rnn" +# +# +# +# Returns: +# +# A tuple (outputs, output_states) where: +# +# outputs: A tuple (output_fw, output_bw) containing the forward and +# +# the backward rnn output `Tensor`. +# +# If time_major == False (default), +# +# output_fw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[batch_size, max_time, cell_bw.output_size]`. +# +# If time_major == True, +# +# output_fw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_fw.output_size]` +# +# and output_bw will be a `Tensor` shaped: +# +# `[max_time, batch_size, cell_bw.output_size]`. +# +# It returns a tuple instead of a single concatenated `Tensor`, unlike +# +# in the `bidirectional_rnn`. If the concatenated one is preferred, +# +# the forward and backward outputs can be concatenated as +# +# `tf.concat(outputs, 2)`. +# +# output_states: A tuple (output_state_fw, output_state_bw) containing +# +# the forward and the backward final states of bidirectional rnn. +# +# +# +# Raises: +# +# TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`. 
+# +# """ +# +# +# +# if not _like_rnncell(cell_fw): +# +# raise TypeError("cell_fw must be an instance of RNNCell") +# +# if not _like_rnncell(cell_bw): +# +# raise TypeError("cell_bw must be an instance of RNNCell") +# +# +# +# with vs.variable_scope(scope or "bidirectional_rnn"): +# +# # Forward direction +# +# with vs.variable_scope("fw") as fw_scope: +# +# output_fw, output_state_fw = dynamic_rnn( +# +# cell=cell_fw, inputs=inputs, sequence_length=sequence_length, +# +# initial_state=initial_state_fw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=fw_scope) +# +# +# +# # Backward direction +# +# if not time_major: +# +# time_dim = 1 +# +# batch_dim = 0 +# +# else: +# +# time_dim = 0 +# +# batch_dim = 1 +# +# +# +# def _reverse(input_, seq_lengths, seq_dim, batch_dim): +# +# if seq_lengths is not None: +# +# return array_ops.reverse_sequence( +# +# input=input_, seq_lengths=seq_lengths, +# +# seq_dim=seq_dim, batch_dim=batch_dim) +# +# else: +# +# return array_ops.reverse(input_, axis=[seq_dim]) +# +# +# +# with vs.variable_scope("bw") as bw_scope: +# +# inputs_reverse = _reverse( +# +# inputs, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# tmp, output_state_bw = dynamic_rnn( +# +# cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length, +# +# initial_state=initial_state_bw, dtype=dtype, +# +# parallel_iterations=parallel_iterations, swap_memory=swap_memory, +# +# time_major=time_major, scope=bw_scope) +# +# +# +# output_bw = _reverse( +# +# tmp, seq_lengths=sequence_length, +# +# seq_dim=time_dim, batch_dim=batch_dim) +# +# +# +# outputs = (output_fw, output_bw) +# +# output_states = (output_state_fw, output_state_bw) +# +# +# +# return (outputs, output_states) +# + + +def dynamic_rnn(cell, inputs, att_scores=None, sequence_length=None, initial_state=None, + + dtype=None, parallel_iterations=None, swap_memory=False, + + time_major=False, scope=None): + """Creates a recurrent neural network specified by RNNCell `cell`. + + + + Performs fully dynamic unrolling of `inputs`. + + + + Example: + + + + ```python + + # create a BasicRNNCell + + rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size] + + + + # defining initial state + + initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32) + + + + # 'state' is a tensor of shape [batch_size, cell_state_size] + + outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data, + + initial_state=initial_state, + + dtype=tf.float32) + + ``` + + + + ```python + + # create 2 LSTMCells + + rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]] + + + + # create a RNN cell composed sequentially of a number of RNNCells + + multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) + + + + # 'outputs' is a tensor of shape [batch_size, max_time, 256] + + # 'state' is a N-tuple where N is the number of LSTMCells containing a + + # tf.contrib.rnn.LSTMStateTuple for each cell + + outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell, + + inputs=data, + + dtype=tf.float32) + + ``` + + + + + + Args: + + cell: An instance of RNNCell. + + inputs: The RNN inputs. + + If `time_major == False` (default), this must be a `Tensor` of shape: + + `[batch_size, max_time, ...]`, or a nested tuple of such + + elements. + + If `time_major == True`, this must be a `Tensor` of shape: + + `[max_time, batch_size, ...]`, or a nested tuple of such + + elements. 
+ + This may also be a (possibly nested) tuple of Tensors satisfying + + this property. The first two dimensions must match across all the inputs, + + but otherwise the ranks and other shape components may differ. + + In this case, input to `cell` at each time-step will replicate the + + structure of these tuples, except for the time dimension (from which the + + time is taken). + + The input to `cell` at each time step will be a `Tensor` or (possibly + + nested) tuple of Tensors each with dimensions `[batch_size, ...]`. + + sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. + + Used to copy-through state and zero-out outputs when past a batch + + element's sequence length. So it's more for correctness than performance. + + initial_state: (optional) An initial state for the RNN. + + If `cell.state_size` is an integer, this must be + + a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`. + + If `cell.state_size` is a tuple, this should be a tuple of + + tensors having shapes `[batch_size, s] for s in cell.state_size`. + + dtype: (optional) The data type for the initial state and expected output. + + Required if initial_state is not provided or RNN state has a heterogeneous + + dtype. + + parallel_iterations: (Default: 32). The number of iterations to run in + + parallel. Those operations which do not have any temporal dependency + + and can be run in parallel, will be. This parameter trades off + + time for space. Values >> 1 use more memory but take less time, + + while smaller values use less memory but computations take longer. + + swap_memory: Transparently swap the tensors produced in forward inference + + but needed for back prop from GPU to CPU. This allows training RNNs + + which would typically not fit on a single GPU, with very minimal (or no) + + performance penalty. + + time_major: The shape format of the `inputs` and `outputs` Tensors. + + If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. + + If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. + + Using `time_major = True` is a bit more efficient because it avoids + + transposes at the beginning and end of the RNN calculation. However, + + most TensorFlow data is batch-major, so by default this function + + accepts input and emits output in batch-major form. + + scope: VariableScope for the created subgraph; defaults to "rnn". + + + + Returns: + + A pair (outputs, state) where: + + + + outputs: The RNN output `Tensor`. + + + + If time_major == False (default), this will be a `Tensor` shaped: + + `[batch_size, max_time, cell.output_size]`. + + + + If time_major == True, this will be a `Tensor` shaped: + + `[max_time, batch_size, cell.output_size]`. + + + + Note, if `cell.output_size` is a (possibly nested) tuple of integers + + or `TensorShape` objects, then `outputs` will be a tuple having the + + same structure as `cell.output_size`, containing Tensors having shapes + + corresponding to the shape data in `cell.output_size`. + + + + state: The final state. If `cell.state_size` is an int, this + + will be shaped `[batch_size, cell.state_size]`. If it is a + + `TensorShape`, this will be shaped `[batch_size] + cell.state_size`. + + If it is a (possibly nested) tuple of ints or `TensorShape`, this will + + be a tuple having the corresponding shapes. If cells are `LSTMCells` + + `state` will be a tuple containing a `LSTMStateTuple` for each cell. + + + + Raises: + + TypeError: If `cell` is not an instance of RNNCell. 
+
+    ValueError: If inputs is None or an empty list.
+
+  """
+
+  if not _like_rnncell(cell):
+    raise TypeError("cell must be an instance of RNNCell")
+
+  # By default, time_major==False and inputs are batch-major: shaped
+
+  #   [batch, time, depth]
+
+  # For internal calculations, we transpose to [time, batch, depth]
+
+  flat_input = nest.flatten(inputs)
+
+  if not time_major:
+    # (B,T,D) => (T,B,D)
+
+    flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+
+    flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+  parallel_iterations = parallel_iterations or 32
+
+  if sequence_length is not None:
+
+    sequence_length = math_ops.to_int32(sequence_length)
+
+    if sequence_length.get_shape().ndims not in (None, 1):
+      raise ValueError(
+
+          "sequence_length must be a vector of length batch_size, "
+
+          "but saw shape: %s" % sequence_length.get_shape())
+
+    sequence_length = array_ops.identity(  # Just to find it in the graph.
+
+        sequence_length, name="sequence_length")
+
+  # Create a new scope in which the caching device is either
+
+  # determined by the parent scope, or is set to place the cached
+
+  # Variable using the same placement as for the rest of the RNN.
+
+  try:
+    reuse = tf.AUTO_REUSE
+  except AttributeError:
+    reuse = tf.compat.v1.AUTO_REUSE
+
+  with vs.variable_scope(scope or "rnn", reuse=reuse) as varscope:  # TODO: user-defined reuse
+
+    if varscope.caching_device is None:
+      varscope.set_caching_device(lambda op: op.device)
+
+    batch_size = _best_effort_input_batch_size(flat_input)
+
+    if initial_state is not None:
+
+      state = initial_state
+
+    else:
+
+      if not dtype:
+        raise ValueError("If there is no initial_state, you must give a dtype.")
+
+      state = cell.zero_state(batch_size, dtype)
+
+    def _assert_has_shape(x, shape):
+
+      x_shape = array_ops.shape(x)
+
+      packed_shape = array_ops.stack(shape)
+
+      return control_flow_ops.Assert(
+
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)),
+
+          ["Expected shape for Tensor %s is " % x.name,
+
+           packed_shape, " but saw shape: ", x_shape])
+
+    if sequence_length is not None:
+      # Perform some shape validation
+
+      with ops.control_dependencies(
+
+          [_assert_has_shape(sequence_length, [batch_size])]):
+        sequence_length = array_ops.identity(
+
+            sequence_length, name="CheckSeqLen")
+
+    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
+
+    (outputs, final_state) = _dynamic_rnn_loop(
+
+        cell,
+
+        inputs,
+
+        state,
+
+        parallel_iterations=parallel_iterations,
+
+        swap_memory=swap_memory,
+
+        att_scores=att_scores,
+
+        sequence_length=sequence_length,
+
+        dtype=dtype)
+
+    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
+
+    # If we are performing batch-major calculations, transpose output back
+
+    # to shape [batch, time, depth]
+
+    if not time_major:
+      # (T,B,D) => (B,T,D)
+
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
+
+    return (outputs, final_state)
+
+
+def _dynamic_rnn_loop(cell,
+
+                      inputs,
+
+                      initial_state,
+
+                      parallel_iterations,
+
+                      swap_memory,
+
+                      att_scores=None,
+
+                      sequence_length=None,
+
+                      dtype=None):
+  """Internal implementation of Dynamic RNN.
+
+
+
+  Args:
+
+    cell: An instance of RNNCell.
+
+    inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
+
+      tuple of such elements.
+
+    initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
+
+      `cell.state_size` is a tuple, then this should be a tuple of
+
+      tensors having shapes `[batch_size, s] for s in cell.state_size`.
+
+    parallel_iterations: Positive Python int.
+
+    swap_memory: A Python boolean
+
+    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
+
+    dtype: (optional) Expected dtype of output. If not specified, inferred from
+
+      initial_state.
+
+
+
+  Returns:
+
+    Tuple `(final_outputs, final_state)`.
+
+    final_outputs:
+
+      A `Tensor` of shape `[time, batch_size, cell.output_size]`. If
+
+      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
+
+      objects, then this returns a (possibly nested) tuple of Tensors matching
+
+      the corresponding shapes.
+
+    final_state:
+
+      A `Tensor`, or possibly nested tuple of Tensors, matching in length
+
+      and shapes to `initial_state`.
+
+
+
+  Raises:
+
+    ValueError: If the input depth cannot be inferred via shape inference
+
+      from the inputs.
+
+  """
+
+  state = initial_state
+
+  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"
+
+  state_size = cell.state_size
+
+  flat_input = nest.flatten(inputs)
+
+  flat_output_size = nest.flatten(cell.output_size)
+
+  # Construct an initial output
+
+  input_shape = array_ops.shape(flat_input[0])
+
+  time_steps = input_shape[0]
+
+  batch_size = _best_effort_input_batch_size(flat_input)
+
+  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
+
+                           for input_ in flat_input)
+
+  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]
+
+  for shape in inputs_got_shape:
+
+    if not shape[2:].is_fully_defined():
+      raise ValueError(
+
+          "Input size (depth of inputs) must be accessible via shape inference,"
+
+          " but saw value None.")
+
+    got_time_steps = shape[0]
+
+    got_batch_size = shape[1]
+
+    if const_time_steps != got_time_steps:
+      raise ValueError(
+
+          "Time steps is not the same for all the elements in the input in a "
+
+          "batch.")
+
+    if const_batch_size != got_batch_size:
+      raise ValueError(
+
+          "Batch_size is not the same for all the elements in the input.")
+
+  # Prepare dynamic conditional copying of state & output
+
+  def _create_zero_arrays(size):
+
+    size = _concat(batch_size, size)
+
+    return array_ops.zeros(
+
+        array_ops.stack(size), _infer_state_dtype(dtype, state))
+
+  flat_zero_output = tuple(_create_zero_arrays(output)
+
+                           for output in flat_output_size)
+
+  zero_output = nest.pack_sequence_as(structure=cell.output_size,
+
+                                      flat_sequence=flat_zero_output)
+
+  if sequence_length is not None:
+    min_sequence_length = math_ops.reduce_min(sequence_length)
+
+    max_sequence_length = math_ops.reduce_max(sequence_length)
+
+  time = array_ops.constant(0, dtype=dtypes.int32, name="time")
+
+  with ops.name_scope("dynamic_rnn") as scope:
+
+    base_name = scope
+
+  def _create_ta(name, dtype):
+
+    return tensor_array_ops.TensorArray(dtype=dtype,
+
+                                        size=time_steps,
+
+                                        tensor_array_name=base_name + name)
+
+  output_ta = tuple(_create_ta("output_%d" % i,
+
+                               _infer_state_dtype(dtype, state))
+
+                    for i in range(len(flat_output_size)))
+
+  input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype)
+
+                   for i in range(len(flat_input)))
+
+  input_ta = tuple(ta.unstack(input_)
+
+                   for ta, input_ in zip(input_ta, flat_input))
+
+  def _time_step(time, output_ta_t, state, att_scores=None):
+
+    """Take a time step of the dynamic RNN.
+
+
+
+    Args:
+
+      time: int32 scalar Tensor.
+
+      output_ta_t: List of `TensorArray`s that represent the output.
+
+      state: nested tuple of vector tensors that represent the state.
+
+
+
+    Returns:
+
+      The tuple (time + 1, output_ta_t with updated flow, new_state).
+ + """ + + input_t = tuple(ta.read(time) for ta in input_ta) + + # Restore some shape information + + for input_, shape in zip(input_t, inputs_got_shape): + input_.set_shape(shape[1:]) + + input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t) + + if att_scores is not None: + + att_score = att_scores[:, time, :] + + call_cell = lambda: cell(input_t, state, att_score) + + else: + + call_cell = lambda: cell(input_t, state) + + if sequence_length is not None: + + (output, new_state) = _rnn_step( + + time=time, + + sequence_length=sequence_length, + + min_sequence_length=min_sequence_length, + + max_sequence_length=max_sequence_length, + + zero_output=zero_output, + + state=state, + + call_cell=call_cell, + + state_size=state_size, + + skip_conditionals=True) + + else: + + (output, new_state) = call_cell() + + # Pack state if using state tuples + + output = nest.flatten(output) + + output_ta_t = tuple( + + ta.write(time, out) for ta, out in zip(output_ta_t, output)) + + if att_scores is not None: + + return (time + 1, output_ta_t, new_state, att_scores) + + else: + + return (time + 1, output_ta_t, new_state) + + if att_scores is not None: + + _, output_final_ta, final_state, _ = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state, att_scores), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + else: + + _, output_final_ta, final_state = control_flow_ops.while_loop( + + cond=lambda time, *_: time < time_steps, + + body=_time_step, + + loop_vars=(time, output_ta, state), + + parallel_iterations=parallel_iterations, + + swap_memory=swap_memory) + + # Unpack final output if not using output tuples. + + final_outputs = tuple(ta.stack() for ta in output_final_ta) + + # Restore some shape information + + for output, output_size in zip(final_outputs, flat_output_size): + shape = _concat( + + [const_time_steps, const_batch_size], output_size, static=True) + + output.set_shape(shape) + + final_outputs = nest.pack_sequence_as( + + structure=cell.output_size, flat_sequence=final_outputs) + + return (final_outputs, final_state) diff --git a/modelzoo/PNN/script/contrib/utils.py b/modelzoo/PNN/script/contrib/utils.py new file mode 100644 index 00000000000..692f4ef6e89 --- /dev/null +++ b/modelzoo/PNN/script/contrib/utils.py @@ -0,0 +1,378 @@ +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops.rnn_cell import * +from tensorflow.python.util import nest + +_BIAS_VARIABLE_NAME = "bias" + +_WEIGHTS_VARIABLE_NAME = "kernel" + + +class _Linear_(object): + """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. + + + + Args: + + args: a 2D Tensor or a list of 2D, batch x n, Tensors. + + output_size: int, second dimension of weight variable. + + dtype: data type for variables. + + build_bias: boolean, whether to build a bias variable. + + bias_initializer: starting value to initialize the bias + + (default is all zeros). + + kernel_initializer: starting value to initialize the weight. + + + + Raises: + + ValueError: if inputs_shape is wrong. 
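+
+
+
+  Example (a sketch; `x` and `h` stand for any 2-D `[batch, n]` tensors):
+
+    linear = _Linear_([x, h], output_size=4, build_bias=True)
+
+    y = linear([x, h])  # y: [batch, 4]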
+
+  """
+
+  def __init__(self,
+
+               args,
+
+               output_size,
+
+               build_bias,
+
+               bias_initializer=None,
+
+               kernel_initializer=None):
+
+    self._build_bias = build_bias
+
+    if args is None or (nest.is_sequence(args) and not args):
+      raise ValueError("`args` must be specified")
+
+    if not nest.is_sequence(args):
+
+      args = [args]
+
+      self._is_sequence = False
+
+    else:
+
+      self._is_sequence = True
+
+    # Calculate the total size of arguments on dimension 1.
+
+    total_arg_size = 0
+
+    shapes = [a.get_shape() for a in args]
+
+    for shape in shapes:
+
+      if shape.ndims != 2:
+        raise ValueError(
+            "linear is expecting 2D arguments: %s" % shapes)
+
+      if shape[1] is None:
+
+        raise ValueError("linear expects shape[1] to be provided for shape %s, "
+
+                         "but saw %s" % (shape, shape[1]))
+
+      else:
+
+        total_arg_size += int(shape[1])
+
+    dtype = [a.dtype for a in args][0]
+
+    scope = vs.get_variable_scope()
+
+    with vs.variable_scope(scope) as outer_scope:
+
+      self._weights = vs.get_variable(
+
+          _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
+
+          dtype=dtype,
+
+          initializer=kernel_initializer)
+
+      if build_bias:
+
+        with vs.variable_scope(outer_scope) as inner_scope:
+
+          inner_scope.set_partitioner(None)
+
+          if bias_initializer is None:
+            bias_initializer = init_ops.constant_initializer(
+                0.0, dtype=dtype)
+
+          self._biases = vs.get_variable(
+
+              _BIAS_VARIABLE_NAME, [output_size],
+
+              dtype=dtype,
+
+              initializer=bias_initializer)
+
+  def __call__(self, args):
+
+    if not self._is_sequence:
+      args = [args]
+
+    if len(args) == 1:
+
+      res = math_ops.matmul(args[0], self._weights)
+
+    else:
+
+      res = math_ops.matmul(array_ops.concat(args, 1), self._weights)
+
+    if self._build_bias:
+      res = nn_ops.bias_add(res, self._biases)
+
+    return res
+
+
+try:
+  from tensorflow.python.ops.rnn_cell_impl import _Linear
+except ImportError:
+  _Linear = _Linear_
+
+
+class QAAttGRUCell(RNNCell):
+  """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
+
+  Args:
+
+    num_units: int, The number of units in the GRU cell.
+
+    activation: Nonlinearity to use. Default: `tanh`.
+
+    reuse: (optional) Python boolean describing whether to reuse variables
+
+      in an existing scope. If not `True`, and the existing scope already has
+
+      the given variables, an error is raised.
+
+    kernel_initializer: (optional) The initializer to use for the weight and
+
+      projection matrices.
+
+    bias_initializer: (optional) The initializer to use for the bias.
+
+  """
+
+  def __init__(self,
+
+               num_units,
+
+               activation=None,
+
+               reuse=None,
+
+               kernel_initializer=None,
+
+               bias_initializer=None):
+
+    super(QAAttGRUCell, self).__init__(_reuse=reuse)
+
+    self._num_units = num_units
+
+    self._activation = activation or math_ops.tanh
+
+    self._kernel_initializer = kernel_initializer
+
+    self._bias_initializer = bias_initializer
+
+    self._gate_linear = None
+
+    self._candidate_linear = None
+
+  @property
+  def state_size(self):
+
+    return self._num_units
+
+  @property
+  def output_size(self):
+
+    return self._num_units
+
+  def __call__(self, inputs, state, att_score):
+
+    return self.call(inputs, state, att_score)
+
+  def call(self, inputs, state, att_score=None):
+    """Gated recurrent unit (GRU) with nunits cells."""
+
+    if self._gate_linear is None:
+
+      bias_ones = self._bias_initializer
+
+      if self._bias_initializer is None:
+        bias_ones = init_ops.constant_initializer(
+            1.0, dtype=inputs.dtype)
+
+      with vs.variable_scope("gates"):  # Reset gate and update gate.
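+
+        # A single linear map over [inputs, state] produces both gates at
+        # once; its sigmoid output is split below into the reset gate r and
+        # the update gate u. In this cell the attention score later replaces
+        # the update gate entirely:
+        #   new_h = (1 - att_score) * state + att_score * c.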
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + new_h = (1. - att_score) * state + att_score * c + + return new_h, new_h + + +class VecAttGRUCell(RNNCell): + """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). + + Args: + + num_units: int, The number of units in the GRU cell. + + activation: Nonlinearity to use. Default: `tanh`. + + reuse: (optional) Python boolean describing whether to reuse variables + + in an existing scope. If not `True`, and the existing scope already has + + the given variables, an error is raised. + + kernel_initializer: (optional) The initializer to use for the weight and + + projection matrices. + + bias_initializer: (optional) The initializer to use for the bias. + + """ + + def __init__(self, + + num_units, + + activation=None, + + reuse=None, + + kernel_initializer=None, + + bias_initializer=None): + + super(VecAttGRUCell, self).__init__(_reuse=reuse) + + self._num_units = num_units + + self._activation = activation or math_ops.tanh + + self._kernel_initializer = kernel_initializer + + self._bias_initializer = bias_initializer + + self._gate_linear = None + + self._candidate_linear = None + + @property + def state_size(self): + + return self._num_units + + @property + def output_size(self): + + return self._num_units + + def __call__(self, inputs, state, att_score): + + return self.call(inputs, state, att_score) + + def call(self, inputs, state, att_score=None): + """Gated recurrent unit (GRU) with nunits cells.""" + + if self._gate_linear is None: + + bias_ones = self._bias_initializer + + if self._bias_initializer is None: + bias_ones = init_ops.constant_initializer( + 1.0, dtype=inputs.dtype) + + with vs.variable_scope("gates"): # Reset gate and update gate. 
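+
+        # Same joint gate computation as in QAAttGRUCell; this AUGRU-style
+        # cell keeps the update gate but rescales it with the attention score:
+        #   u = (1 - att_score) * u, then new_h = u * state + (1 - u) * c.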
+ + self._gate_linear = _Linear( + + [inputs, state], + + 2 * self._num_units, + + True, + + bias_initializer=bias_ones, + + kernel_initializer=self._kernel_initializer) + + value = math_ops.sigmoid(self._gate_linear([inputs, state])) + + r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) + + r_state = r * state + + if self._candidate_linear is None: + with vs.variable_scope("candidate"): + self._candidate_linear = _Linear( + + [inputs, r_state], + + self._num_units, + + True, + + bias_initializer=self._bias_initializer, + + kernel_initializer=self._kernel_initializer) + + c = self._activation(self._candidate_linear([inputs, r_state])) + + u = (1.0 - att_score) * u + + new_h = u * state + (1 - u) * c + + return new_h, new_h diff --git a/modelzoo/PNN/script/estimator/__init__.py b/modelzoo/PNN/script/estimator/__init__.py new file mode 100644 index 00000000000..cf4f59d6c09 --- /dev/null +++ b/modelzoo/PNN/script/estimator/__init__.py @@ -0,0 +1 @@ +from .models import * \ No newline at end of file diff --git a/modelzoo/PNN/script/estimator/feature_column.py b/modelzoo/PNN/script/estimator/feature_column.py new file mode 100644 index 00000000000..c8d7a6cd013 --- /dev/null +++ b/modelzoo/PNN/script/estimator/feature_column.py @@ -0,0 +1,52 @@ +import tensorflow as tf +from tensorflow.python.feature_column.feature_column import _EmbeddingColumn + +from .utils import LINEAR_SCOPE_NAME, variable_scope, get_collection, get_GraphKeys, input_layer, get_losses + + +def linear_model(features, linear_feature_columns): + if tf.__version__ >= '2.0.0': + linear_logits = tf.compat.v1.feature_column.linear_model(features, linear_feature_columns) + else: + linear_logits = tf.feature_column.linear_model(features, linear_feature_columns) + return linear_logits + + +def get_linear_logit(features, linear_feature_columns, l2_reg_linear=0): + with variable_scope(LINEAR_SCOPE_NAME): + if not linear_feature_columns: + linear_logits = tf.Variable([[0.0]], name='bias_weights') + else: + + linear_logits = linear_model(features, linear_feature_columns) + + if l2_reg_linear > 0: + for var in get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME)[:-1]: + get_losses().add_loss(l2_reg_linear * tf.nn.l2_loss(var, name=var.name.split(":")[0] + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + return linear_logits + + +def input_from_feature_columns(features, feature_columns, l2_reg_embedding=0.0): + dense_value_list = [] + sparse_emb_list = [] + for feat in feature_columns: + if is_embedding(feat): + sparse_emb = tf.expand_dims(input_layer(features, [feat]), axis=1) + sparse_emb_list.append(sparse_emb) + if l2_reg_embedding > 0: + get_losses().add_loss(l2_reg_embedding * tf.nn.l2_loss(sparse_emb, name=feat.name + "_l2loss"), + get_GraphKeys().REGULARIZATION_LOSSES) + + else: + dense_value_list.append(input_layer(features, [feat])) + + return sparse_emb_list, dense_value_list + + +def is_embedding(feature_column): + try: + from tensorflow.python.feature_column.feature_column_v2 import EmbeddingColumn + except ImportError: + EmbeddingColumn = _EmbeddingColumn + return isinstance(feature_column, (_EmbeddingColumn, EmbeddingColumn)) diff --git a/modelzoo/PNN/script/estimator/inputs.py b/modelzoo/PNN/script/estimator/inputs.py new file mode 100644 index 00000000000..2c175a9934e --- /dev/null +++ b/modelzoo/PNN/script/estimator/inputs.py @@ -0,0 +1,52 @@ +import tensorflow as tf + + +def input_fn_pandas(df, features, label=None, batch_size=256, num_epochs=1, shuffle=False, 
queue_capacity_factor=10, + num_threads=1): + if label is not None: + y = df[label] + else: + y = None + if tf.__version__ >= "2.0.0": + return tf.compat.v1.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, + num_epochs=num_epochs, + shuffle=shuffle, + queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + return tf.estimator.inputs.pandas_input_fn(df[features], y, batch_size=batch_size, num_epochs=num_epochs, + shuffle=shuffle, queue_capacity=batch_size * queue_capacity_factor, + num_threads=num_threads) + + +def input_fn_tfrecord(filenames, feature_description, label=None, batch_size=256, num_epochs=1, num_parallel_calls=8, + shuffle_factor=10, prefetch_factor=1, + ): + def _parse_examples(serial_exmp): + try: + features = tf.parse_single_example(serial_exmp, features=feature_description) + except AttributeError: + features = tf.io.parse_single_example(serial_exmp, features=feature_description) + if label is not None: + labels = features.pop(label) + return features, labels + return features + + def input_fn(): + dataset = tf.data.TFRecordDataset(filenames) + dataset = dataset.map(_parse_examples, num_parallel_calls=num_parallel_calls) + if shuffle_factor > 0: + dataset = dataset.shuffle(buffer_size=batch_size * shuffle_factor) + + dataset = dataset.repeat(num_epochs).batch(batch_size) + + if prefetch_factor > 0: + dataset = dataset.prefetch(buffer_size=batch_size * prefetch_factor) + try: + iterator = dataset.make_one_shot_iterator() + except AttributeError: + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + + return iterator.get_next() + + return input_fn diff --git a/modelzoo/PNN/script/estimator/models/__init__.py b/modelzoo/PNN/script/estimator/models/__init__.py new file mode 100644 index 00000000000..9bc1e120dbc --- /dev/null +++ b/modelzoo/PNN/script/estimator/models/__init__.py @@ -0,0 +1,13 @@ +from .afm import AFMEstimator +from .autoint import AutoIntEstimator +from .ccpm import CCPMEstimator +from .dcn import DCNEstimator +from .deepfm import DeepFMEstimator +from .fwfm import FwFMEstimator +from .fibinet import FiBiNETEstimator +from .fnn import FNNEstimator +from .nfm import NFMEstimator +from .pnn import PNNEstimator +from .wdl import WDLEstimator +from .xdeepfm import xDeepFMEstimator +from .deepfefm import DeepFEFMEstimator diff --git a/modelzoo/PNN/script/estimator/models/pnn.py b/modelzoo/PNN/script/estimator/models/pnn.py new file mode 100644 index 00000000000..7e4c159c276 --- /dev/null +++ b/modelzoo/PNN/script/estimator/models/pnn.py @@ -0,0 +1,93 @@ +# -*- coding:utf-8 -*- +""" +Author: + Weichen Shen, weichenswc@163.com + +Reference: + [1] Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. 
IEEE, 2016: 1149-1154.(https://arxiv.org/pdf/1611.00144.pdf)
+"""
+
+import tensorflow as tf
+
+from ..feature_column import get_linear_logit, input_from_feature_columns
+from ..utils import deepctr_model_fn, DNN_SCOPE_NAME, variable_scope
+from ...layers.core import DNN
+from ...layers.interaction import InnerProductLayer, OutterProductLayer
+from ...layers.utils import concat_func, combined_dnn_input
+
+
+def PNNEstimator(dnn_feature_columns, dnn_hidden_units=(256, 128, 64), l2_reg_embedding=1e-5, l2_reg_dnn=0,
+                 seed=1024, dnn_dropout=0, dnn_activation='relu', use_inner=True, use_outter=False, kernel_type='mat',
+                 task='binary', model_dir=None, config=None,
+                 linear_optimizer='Ftrl',
+                 dnn_optimizer='Adagrad', training_chief_hooks=None):
+    """Instantiates the Product-based Neural Network architecture.
+
+    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
+    :param dnn_hidden_units: list of positive integers or empty list, the layer number and units in each layer of the deep net.
+    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vectors.
+    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN.
+    :param seed: integer to use as random seed.
+    :param dnn_dropout: float in [0,1), the probability of dropping out a given DNN coordinate.
+    :param dnn_activation: Activation function to use in the DNN.
+    :param use_inner: bool, whether to use the inner product.
+    :param use_outter: bool, whether to use the outer product.
+    :param kernel_type: str, kernel type used in the outer product, can be ``'mat'``, ``'vec'`` or ``'num'``.
+    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss.
+    :param model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
+    :param config: tf.RunConfig object to configure the runtime settings.
+    :param linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the linear part of the model. Defaults to the FTRL optimizer.
+    :param dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the deep part of the model. Defaults to the Adagrad optimizer.
+    :param training_chief_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run on the chief worker during training.
+    :return: A TensorFlow Estimator instance.
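+
+    A minimal usage sketch (the feature columns below are illustrative, not part
+    of this module)::
+
+        import tensorflow as tf
+
+        fc = tf.feature_column
+        dnn_feature_columns = [
+            fc.embedding_column(
+                fc.categorical_column_with_identity('item_id', num_buckets=1000),
+                dimension=8),
+            fc.embedding_column(
+                fc.categorical_column_with_identity('user_id', num_buckets=5000),
+                dimension=8),
+        ]
+        estimator = PNNEstimator(dnn_feature_columns, task='binary')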
+ + """ + + if kernel_type not in ['mat', 'vec', 'num']: + raise ValueError("kernel_type must be mat,vec or num") + + def _model_fn(features, labels, mode, config): + train_flag = (mode == tf.estimator.ModeKeys.TRAIN) + + linear_logits = get_linear_logit(features, [], l2_reg_linear=0) + + with variable_scope(DNN_SCOPE_NAME): + sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, + l2_reg_embedding=l2_reg_embedding) + + inner_product = tf.keras.layers.Flatten()( + InnerProductLayer()(sparse_embedding_list)) + outter_product = OutterProductLayer(kernel_type)(sparse_embedding_list) + + # ipnn deep input + linear_signal = tf.keras.layers.Reshape( + [sum(map(lambda x: int(x.shape[-1]), sparse_embedding_list))])(concat_func(sparse_embedding_list)) + + if use_inner and use_outter: + deep_input = tf.keras.layers.Concatenate()( + [linear_signal, inner_product, outter_product]) + elif use_inner: + deep_input = tf.keras.layers.Concatenate()( + [linear_signal, inner_product]) + elif use_outter: + deep_input = tf.keras.layers.Concatenate()( + [linear_signal, outter_product]) + else: + deep_input = linear_signal + + dnn_input = combined_dnn_input([deep_input], dense_value_list) + dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, False, seed=seed)(dnn_input, training=train_flag) + dnn_logit = tf.keras.layers.Dense( + 1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed))(dnn_out) + + logits = linear_logits + dnn_logit + + return deepctr_model_fn(features, mode, logits, labels, task, linear_optimizer, dnn_optimizer, + training_chief_hooks=training_chief_hooks) + + return tf.estimator.Estimator(_model_fn, model_dir=model_dir, config=config) diff --git a/modelzoo/PNN/script/estimator/utils.py b/modelzoo/PNN/script/estimator/utils.py new file mode 100644 index 00000000000..5d722515f6b --- /dev/null +++ b/modelzoo/PNN/script/estimator/utils.py @@ -0,0 +1,217 @@ +import tensorflow as tf +from tensorflow.python.estimator.canned.head import _Head +from tensorflow.python.estimator.canned.optimizers import get_optimizer_instance + +LINEAR_SCOPE_NAME = 'linear' +DNN_SCOPE_NAME = 'dnn' + + +def _summary_key(head_name, val): + return '%s/%s' % (val, head_name) if head_name else val + + +class Head(_Head): + + def __init__(self, task, + name=None): + self._task = task + self._name = name + + @property + def name(self): + return self._name + + @property + def logits_dimension(self): + return 1 + + def _eval_metric_ops(self, + labels, + logits, + predictions, + unweighted_loss, + weights=None): + + labels = to_float(labels) + predictions = to_float(predictions) + + # with name_scope(None, 'metrics', (labels, logits, predictions, + # unweighted_loss, weights)): + metrics = get_metrics() + losses = get_losses() + + metric_ops = { + _summary_key(self._name, "prediction/mean"): metrics.mean(predictions, weights=weights), + _summary_key(self._name, "label/mean"): metrics.mean(labels, weights=weights), + } + + summary_scalar("prediction/mean", metric_ops[_summary_key(self._name, "prediction/mean")][1]) + summary_scalar("label/mean", metric_ops[_summary_key(self._name, "label/mean")][1]) + + + mean_loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.MEAN) + + if self._task == "binary": + metric_ops[_summary_key(self._name, "LogLoss")] = metrics.mean(mean_loss, weights=weights, ) + summary_scalar("LogLoss", mean_loss) + + metric_ops[_summary_key(self._name, "AUC")] = metrics.auc(labels, 
predictions, weights=weights) + summary_scalar("AUC", metric_ops[_summary_key(self._name, "AUC")][1]) + else: + + metric_ops[_summary_key(self._name, "MSE")] = metrics.mean_squared_error(labels, predictions, + weights=weights) + summary_scalar("MSE", mean_loss) + + metric_ops[_summary_key(self._name, "MAE")] = metrics.mean_absolute_error(labels, predictions, + weights=weights) + summary_scalar("MAE", metric_ops[_summary_key(self._name, "MAE")][1]) + + return metric_ops + + def create_loss(self, features, mode, logits, labels): + del mode, features # Unused for this head. + losses = get_losses() + if self._task == "binary": + loss = losses.sigmoid_cross_entropy(labels, logits, reduction=losses.Reduction.NONE) + else: + loss = losses.mean_squared_error(labels, logits, reduction=losses.Reduction.NONE) + return loss + + def create_estimator_spec( + self, features, mode, logits, labels=None, train_op_fn=None, training_chief_hooks=None): + # with name_scope('head'): + logits = tf.reshape(logits, [-1, 1]) + if self._task == 'binary': + pred = tf.sigmoid(logits) + else: + pred = logits + + predictions = {"pred": pred, "logits": logits} + export_outputs = {"predict": tf.estimator.export.PredictOutput(predictions)} + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + export_outputs=export_outputs) + + labels = tf.reshape(labels, [-1, 1]) + + unweighted_loss = self.create_loss(features, mode, logits, labels) + + losses = get_losses() + loss = losses.compute_weighted_loss( + unweighted_loss, weights=1.0, reduction=losses.Reduction.SUM) + reg_loss = losses.get_regularization_loss() + + training_loss = loss + reg_loss + + eval_metric_ops = self._eval_metric_ops(labels, logits, pred, unweighted_loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + loss=training_loss, + train_op=train_op_fn(training_loss), + eval_metric_ops=eval_metric_ops, + training_chief_hooks=training_chief_hooks) + + +def deepctr_model_fn(features, mode, logits, labels, task, linear_optimizer, dnn_optimizer, training_chief_hooks): + linear_optimizer = get_optimizer_instance(linear_optimizer, 0.005) + dnn_optimizer = get_optimizer_instance(dnn_optimizer, 0.01) + train_op_fn = get_train_op_fn(linear_optimizer, dnn_optimizer) + + head = Head(task) + return head.create_estimator_spec(features=features, + mode=mode, + labels=labels, + train_op_fn=train_op_fn, + logits=logits, training_chief_hooks=training_chief_hooks) + + +def get_train_op_fn(linear_optimizer, dnn_optimizer): + def _train_op_fn(loss): + train_ops = [] + try: + global_step = tf.train.get_global_step() + except AttributeError: + global_step = tf.compat.v1.train.get_global_step() + linear_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, LINEAR_SCOPE_NAME) + dnn_var_list = get_collection(get_GraphKeys().TRAINABLE_VARIABLES, DNN_SCOPE_NAME) + + if len(dnn_var_list) > 0: + train_ops.append( + dnn_optimizer.minimize( + loss, + var_list=dnn_var_list)) + if len(linear_var_list) > 0: + train_ops.append( + linear_optimizer.minimize( + loss, + var_list=linear_var_list)) + + train_op = tf.group(*train_ops) + with tf.control_dependencies([train_op]): + try: + return tf.assign_add(global_step, 1).op + except AttributeError: + return tf.compat.v1.assign_add(global_step, 1).op + + return _train_op_fn + + +def variable_scope(name_or_scope): + try: + return tf.variable_scope(name_or_scope) + except AttributeError: + return tf.compat.v1.variable_scope(name_or_scope) + +def 
get_collection(key, scope=None): + try: + return tf.get_collection(key, scope=scope) + except AttributeError: + return tf.compat.v1.get_collection(key, scope=scope) + + +def get_GraphKeys(): + try: + return tf.GraphKeys + except AttributeError: + return tf.compat.v1.GraphKeys + + +def get_losses(): + try: + return tf.compat.v1.losses + except AttributeError: + return tf.losses + + +def input_layer(features, feature_columns): + try: + return tf.feature_column.input_layer(features, feature_columns) + except AttributeError: + return tf.compat.v1.feature_column.input_layer(features, feature_columns) + + +def get_metrics(): + try: + return tf.compat.v1.metrics + except AttributeError: + return tf.metrics + + +def to_float(x, name="ToFloat"): + try: + return tf.to_float(x, name) + except AttributeError: + return tf.compat.v1.to_float(x, name) + + +def summary_scalar(name, data): + try: + tf.summary.scalar(name, data) + except AttributeError: # tf version 2.5.0+:AttributeError: module 'tensorflow._api.v2.summary' has no attribute 'scalar' + tf.compat.v1.summary.scalar(name, data) \ No newline at end of file diff --git a/modelzoo/PNN/script/feature_column.py b/modelzoo/PNN/script/feature_column.py new file mode 100644 index 00000000000..3b778360b33 --- /dev/null +++ b/modelzoo/PNN/script/feature_column.py @@ -0,0 +1,249 @@ +import tensorflow as tf +from collections import namedtuple, OrderedDict +from copy import copy +from itertools import chain + +from tensorflow.python.keras.initializers import RandomNormal, Zeros +from tensorflow.python.keras.layers import Input, Lambda + +from .inputs import create_embedding_matrix, embedding_lookup, get_dense_input, varlen_embedding_lookup, \ + get_varlen_pooling_list, mergeDict +from .layers import Linear +from .layers.utils import concat_func +#from keras import backend as K +import pandas as pd +import numpy as np + +fi = open('../../deep_ctr_master/data/fm.model.txt','r') + +first = True +feat_weights={} +k=0 +for line in fi: + s = line.strip().split() + if first: + first = False + w_0 = float(s[0]) + feat_num = int(s[1]) + k = int(s[2]) + 1 # w and v + + else: + feat = int(s[0]) + weights = [float(s[1 + i]) for i in range(k)] + feat_weights[feat] = weights + +list1 =[] +for col,val in feat_weights.items(): + list1.append(val) + +# def my_init(shape,dtype=None): +# weight = np.array(list1) +# +# return weight.reshape(shape) + + +DEFAULT_GROUP_NAME = "default_group" + + +class SparseFeat(namedtuple('SparseFeat', + ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer', + 'embedding_name', + 'group_name', 'trainable'])): + __slots__ = () + + def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None, + embedding_name=None, + group_name=DEFAULT_GROUP_NAME, trainable=True): + + if embedding_dim == "auto": + embedding_dim = 6 * int(pow(vocabulary_size, 0.25)) + if embeddings_initializer is None: + embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020) + # if embeddings_initializer=='fm': + # embeddings_initializer = my_init(shape=(vocabulary_size,embedding_dim)) + + + + if embedding_name is None: + embedding_name = name + + return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, + embeddings_initializer, + embedding_name, group_name, trainable) + + def __hash__(self): + return self.name.__hash__() + + +class 
VarLenSparseFeat(namedtuple('VarLenSparseFeat', + ['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])): + __slots__ = () + + def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True): + return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, combiner, length_name, weight_name, + weight_norm) + + @property + def name(self): + return self.sparsefeat.name + + @property + def vocabulary_size(self): + return self.sparsefeat.vocabulary_size + + @property + def embedding_dim(self): + return self.sparsefeat.embedding_dim + + @property + def use_hash(self): + return self.sparsefeat.use_hash + + @property + def vocabulary_path(self): + return self.sparsefeat.vocabulary_path + + @property + def dtype(self): + return self.sparsefeat.dtype + + @property + def embeddings_initializer(self): + return self.sparsefeat.embeddings_initializer + + @property + def embedding_name(self): + return self.sparsefeat.embedding_name + + @property + def group_name(self): + return self.sparsefeat.group_name + + @property + def trainable(self): + return self.sparsefeat.trainable + + def __hash__(self): + return self.name.__hash__() + + +class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype', 'transform_fn'])): + """ Dense feature + Args: + name: feature name, + dimension: dimension of the feature, default = 1. + dtype: dtype of the feature, default="float32". + transform_fn: If not `None` , a function that can be used to transform + values of the feature. the function takes the input Tensor as its + argument, and returns the output Tensor. + (e.g. lambda x: (x - 3.0) / 4.2). + """ + __slots__ = () + + def __new__(cls, name, dimension=1, dtype="float32", transform_fn=None): + return super(DenseFeat, cls).__new__(cls, name, dimension, dtype, transform_fn) + + def __hash__(self): + return self.name.__hash__() + + # def __eq__(self, other): + # if self.name == other.name: + # return True + # return False + + # def __repr__(self): + # return 'DenseFeat:'+self.name + + +def get_feature_names(feature_columns): + features = build_input_features(feature_columns) + return list(features.keys()) + + +def build_input_features(feature_columns, prefix=''): + input_features = OrderedDict() + for fc in feature_columns: + if isinstance(fc, SparseFeat): + input_features[fc.name] = Input( + shape=(1,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, DenseFeat): + input_features[fc.name] = Input( + shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype) + elif isinstance(fc, VarLenSparseFeat): + input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name, + dtype=fc.dtype) + if fc.weight_name is not None: + input_features[fc.weight_name] = Input(shape=(fc.maxlen, 1), name=prefix + fc.weight_name, + dtype="float32") + if fc.length_name is not None: + input_features[fc.length_name] = Input((1,), name=prefix + fc.length_name, dtype='int32') + + else: + raise TypeError("Invalid feature column type,got", type(fc)) + + return input_features + + +def get_linear_logit(features, feature_columns, units=1, use_bias=False, seed=1024, prefix='linear', + l2_reg=0, sparse_feat_refine_weight=None): + linear_feature_columns = copy(feature_columns) + for i in range(len(linear_feature_columns)): + if isinstance(linear_feature_columns[i], SparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace(embedding_dim=1, + embeddings_initializer=Zeros()) + if isinstance(linear_feature_columns[i], 
VarLenSparseFeat): + linear_feature_columns[i] = linear_feature_columns[i]._replace( + sparsefeat=linear_feature_columns[i].sparsefeat._replace(embedding_dim=1, + embeddings_initializer=Zeros())) + + linear_emb_list = [input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, + prefix=prefix + str(i))[0] for i in range(units)] + _, dense_input_list = input_from_feature_columns(features, linear_feature_columns, l2_reg, seed, prefix=prefix) + + linear_logit_list = [] + for i in range(units): + + if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0: + sparse_input = concat_func(linear_emb_list[i]) + dense_input = concat_func(dense_input_list) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias, seed=seed)([sparse_input, dense_input]) + elif len(linear_emb_list[i]) > 0: + sparse_input = concat_func(linear_emb_list[i]) + if sparse_feat_refine_weight is not None: + sparse_input = Lambda(lambda x: x[0] * tf.expand_dims(x[1], axis=1))( + [sparse_input, sparse_feat_refine_weight]) + linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias, seed=seed)(sparse_input) + elif len(dense_input_list) > 0: + dense_input = concat_func(dense_input_list) + linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias, seed=seed)(dense_input) + else: #empty feature_columns + return Lambda(lambda x: tf.constant([[0.0]]))(list(features.values())[0]) + linear_logit_list.append(linear_logit) + + return concat_func(linear_logit_list) + + +def input_from_feature_columns(features, feature_columns, l2_reg, seed, prefix='', seq_mask_zero=True, + support_dense=True, support_group=False): + sparse_feature_columns = list( + filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else [] + + embedding_matrix_dict = create_embedding_matrix(feature_columns, l2_reg, seed, prefix=prefix, + seq_mask_zero=seq_mask_zero) + group_sparse_embedding_dict = embedding_lookup(embedding_matrix_dict, features, sparse_feature_columns) + dense_value_list = get_dense_input(features, feature_columns) + if not support_dense and len(dense_value_list) > 0: + raise ValueError("DenseFeat is not supported in dnn_feature_columns") + + sequence_embed_dict = varlen_embedding_lookup(embedding_matrix_dict, features, varlen_sparse_feature_columns) + group_varlen_sparse_embedding_dict = get_varlen_pooling_list(sequence_embed_dict, features, + varlen_sparse_feature_columns) + group_embedding_dict = mergeDict(group_sparse_embedding_dict, group_varlen_sparse_embedding_dict) + if not support_group: + group_embedding_dict = list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict, dense_value_list diff --git a/modelzoo/PNN/script/inputs.py b/modelzoo/PNN/script/inputs.py new file mode 100644 index 00000000000..d567f846265 --- /dev/null +++ b/modelzoo/PNN/script/inputs.py @@ -0,0 +1,155 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +from collections import defaultdict +from itertools import chain + +from tensorflow.python.keras.layers import Embedding, Lambda +from tensorflow.python.keras.regularizers import l2 + +from .layers.sequence import SequencePoolingLayer, WeightedSequenceLayer +from .layers.utils import Hash + + +def get_inputs_list(inputs): + 
return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) + + +def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, + prefix='sparse_', seq_mask_zero=True): + sparse_embedding = {} + for feat in sparse_feature_columns: + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2(l2_reg), + name=prefix + '_emb_' + feat.embedding_name) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + + if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: + for feat in varlen_sparse_feature_columns: + # if feat.name not in sparse_embedding: + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2( + l2_reg), + name=prefix + '_seq_emb_' + feat.name, + mask_zero=seq_mask_zero) + emb.trainable = feat.trainable + sparse_embedding[feat.embedding_name] = emb + return sparse_embedding + + +def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()): + embedding_vec_list = [] + for fg in sparse_feature_columns: + feat_name = fg.name + if len(return_feat_list) == 0 or feat_name in return_feat_list: + if fg.use_hash: + lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name]) + else: + lookup_idx = input_dict[feat_name] + + embedding_vec_list.append(embedding_dict[feat_name](lookup_idx)) + + return embedding_vec_list + + +def create_embedding_matrix(feature_columns, l2_reg, seed, prefix="", seq_mask_zero=True): + from . import feature_column as fc_lib + + sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.SparseFeat), feature_columns)) if feature_columns else [] + varlen_sparse_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.VarLenSparseFeat), feature_columns)) if feature_columns else [] + sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, + l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero) + return sparse_emb_dict + + +def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(), + mask_feat_list=(), to_list=False): + group_embedding_dict = defaultdict(list) + for fc in sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if (len(return_feat_list) == 0 or feature_name in return_feat_list): + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)( + sparse_input_dict[feature_name]) + else: + lookup_idx = sparse_input_dict[feature_name] + + group_embedding_dict[fc.group_name].append(sparse_embedding_dict[embedding_name](lookup_idx)) + if to_list: + return list(chain.from_iterable(group_embedding_dict.values())) + return group_embedding_dict + + +def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns): + varlen_embedding_vec_dict = {} + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + embedding_name = fc.embedding_name + if fc.use_hash: + lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name]) + else: + lookup_idx = sequence_input_dict[feature_name] + varlen_embedding_vec_dict[feature_name] = 
embedding_dict[embedding_name](lookup_idx) + return varlen_embedding_vec_dict + + +def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False): + pooling_vec_list = defaultdict(list) + for fc in varlen_sparse_feature_columns: + feature_name = fc.name + combiner = fc.combiner + feature_length_name = fc.length_name + if feature_length_name is not None: + if fc.weight_name is not None: + seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)( + [embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=False)( + [seq_input, features[feature_length_name]]) + else: + if fc.weight_name is not None: + seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)( + [embedding_dict[feature_name], features[fc.weight_name]]) + else: + seq_input = embedding_dict[feature_name] + vec = SequencePoolingLayer(combiner, supports_masking=True)( + seq_input) + pooling_vec_list[fc.group_name].append(vec) + if to_list: + return chain.from_iterable(pooling_vec_list.values()) + return pooling_vec_list + + +def get_dense_input(features, feature_columns): + from . import feature_column as fc_lib + dense_feature_columns = list( + filter(lambda x: isinstance(x, fc_lib.DenseFeat), feature_columns)) if feature_columns else [] + dense_input_list = [] + for fc in dense_feature_columns: + if fc.transform_fn is None: + dense_input_list.append(features[fc.name]) + else: + transform_result = Lambda(fc.transform_fn)(features[fc.name]) + dense_input_list.append(transform_result) + return dense_input_list + + +def mergeDict(a, b): + c = defaultdict(list) + for k, v in a.items(): + c[k].extend(v) + for k, v in b.items(): + c[k].extend(v) + return c diff --git a/modelzoo/PNN/script/layers/__init__.py b/modelzoo/PNN/script/layers/__init__.py new file mode 100644 index 00000000000..1bfd40effe7 --- /dev/null +++ b/modelzoo/PNN/script/layers/__init__.py @@ -0,0 +1,52 @@ +import tensorflow as tf + +from .activation import Dice +from .core import DNN, LocalActivationUnit, PredictionLayer +from .interaction import (CIN, FM, AFMLayer, BiInteractionPooling, CrossNet, CrossNetMix, + InnerProductLayer, InteractingLayer, + OutterProductLayer, FGCNNLayer, SENETLayer, BilinearInteraction, + FieldWiseBiInteraction, FwFMLayer, FEFMLayer) +from .normalization import LayerNormalization +from .sequence import (AttentionSequencePoolingLayer, BiasEncoding, BiLSTM, + KMaxPooling, SequencePoolingLayer, WeightedSequenceLayer, + Transformer, DynamicGRU,PositionEncoding) + +from .utils import NoMask, Hash, Linear, _Add, combined_dnn_input, softmax, reduce_sum + +custom_objects = {'tf': tf, + 'InnerProductLayer': InnerProductLayer, + 'OutterProductLayer': OutterProductLayer, + 'DNN': DNN, + 'PredictionLayer': PredictionLayer, + 'FM': FM, + 'AFMLayer': AFMLayer, + 'CrossNet': CrossNet, + 'CrossNetMix': CrossNetMix, + 'BiInteractionPooling': BiInteractionPooling, + 'LocalActivationUnit': LocalActivationUnit, + 'Dice': Dice, + 'SequencePoolingLayer': SequencePoolingLayer, + 'AttentionSequencePoolingLayer': AttentionSequencePoolingLayer, + 'CIN': CIN, + 'InteractingLayer': InteractingLayer, + 'LayerNormalization': LayerNormalization, + 'BiLSTM': BiLSTM, + 'Transformer': Transformer, + 'NoMask': NoMask, + 'BiasEncoding': BiasEncoding, + 'KMaxPooling': KMaxPooling, + 'FGCNNLayer': FGCNNLayer, + 'Hash': Hash, + 'Linear': Linear, + 
'DynamicGRU': DynamicGRU, + 'SENETLayer': SENETLayer, + 'BilinearInteraction': BilinearInteraction, + 'WeightedSequenceLayer': WeightedSequenceLayer, + '_Add': _Add, + 'FieldWiseBiInteraction': FieldWiseBiInteraction, + 'FwFMLayer': FwFMLayer, + 'softmax': softmax, + 'FEFMLayer': FEFMLayer, + 'reduce_sum': reduce_sum, + 'PositionEncoding':PositionEncoding + } diff --git a/modelzoo/PNN/script/layers/activation.py b/modelzoo/PNN/script/layers/activation.py new file mode 100644 index 00000000000..1b953bff8bc --- /dev/null +++ b/modelzoo/PNN/script/layers/activation.py @@ -0,0 +1,85 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf + +try: + from tensorflow.python.ops.init_ops import Zeros +except ImportError: + from tensorflow.python.ops.init_ops_v2 import Zeros +from tensorflow.python.keras.layers import Layer, Activation + +try: + from tensorflow.python.keras.layers import BatchNormalization +except ImportError: + BatchNormalization = tf.keras.layers.BatchNormalization + +try: + unicode +except NameError: + unicode = str + + +class Dice(Layer): + """The Data Adaptive Activation Function in DIN,which can be viewed as a generalization of PReLu and can adaptively adjust the rectified point according to distribution of input data. + + Input shape + - Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. + + Output shape + - Same shape as the input. + + Arguments + - **axis** : Integer, the axis that should be used to compute data distribution (typically the features axis). + + - **epsilon** : Small float added to variance to avoid dividing by zero. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, axis=-1, epsilon=1e-9, **kwargs): + self.axis = axis + self.epsilon = epsilon + super(Dice, self).__init__(**kwargs) + + def build(self, input_shape): + self.bn = BatchNormalization( + axis=self.axis, epsilon=self.epsilon, center=False, scale=False) + self.alphas = self.add_weight(shape=(input_shape[-1],), initializer=Zeros( + ), dtype=tf.float32, name='dice_alpha') # name='alpha_'+self.name + super(Dice, self).build(input_shape) # Be sure to call this somewhere! + self.uses_learning_phase = True + + def call(self, inputs, training=None, **kwargs): + inputs_normed = self.bn(inputs, training=training) + # tf.layers.batch_normalization( + # inputs, axis=self.axis, epsilon=self.epsilon, center=False, scale=False) + x_p = tf.sigmoid(inputs_normed) + return self.alphas * (1.0 - x_p) * inputs + x_p * inputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'axis': self.axis, 'epsilon': self.epsilon} + base_config = super(Dice, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def activation_layer(activation): + if activation in ("dice", "Dice"): + act_layer = Dice() + elif isinstance(activation, (str, unicode)): + act_layer = Activation(activation) + elif issubclass(activation, Layer): + act_layer = activation() + else: + raise ValueError( + "Invalid activation,found %s.You should use a str or a Activation Layer Class." 
% (activation)) + return act_layer diff --git a/modelzoo/PNN/script/layers/core.py b/modelzoo/PNN/script/layers/core.py new file mode 100644 index 00000000000..668348d2eb7 --- /dev/null +++ b/modelzoo/PNN/script/layers/core.py @@ -0,0 +1,267 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import tensorflow as tf +from tensorflow.python.keras import backend as K + +try: + from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal +except ImportError: + from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal + +from tensorflow.python.keras.layers import Layer, Dropout + +try: + from tensorflow.python.keras.layers import BatchNormalization +except ImportError: + BatchNormalization = tf.keras.layers.BatchNormalization +from tensorflow.python.keras.regularizers import l2 + +from .activation import activation_layer + + +class LocalActivationUnit(Layer): + """The LocalActivationUnit used in DIN with which the representation of + user interests varies adaptively given different candidate items. + + Input shape + - A list of two 3D tensor with shape: ``(batch_size, 1, embedding_size)`` and ``(batch_size, T, embedding_size)`` + + Output shape + - 3D tensor with shape: ``(batch_size, T, 1)``. + + Arguments + - **hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **activation**: Activation function to use in attention net. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix of attention net. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout in attention net. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not in attention net. + + - **seed**: A Python integer to use as random seed. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, hidden_units=(64, 32), activation='sigmoid', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, + **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.seed = seed + super(LocalActivationUnit, self).__init__(**kwargs) + self.supports_masking = True + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) != 2: + raise ValueError('A `LocalActivationUnit` layer should be called ' + 'on a list of 2 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3: + raise ValueError("Unexpected inputs dimensions %d and %d, expect to be 3 dimensions" % ( + len(input_shape[0]), len(input_shape[1]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1: + raise ValueError('A `LocalActivationUnit` layer requires ' + 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)' + 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1])) + size = 4 * \ + int(input_shape[0][-1] + ) if len(self.hidden_units) == 0 else self.hidden_units[-1] + self.kernel = self.add_weight(shape=(size, 1), + initializer=glorot_normal( + seed=self.seed), + name="kernel") + self.bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="bias") + self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed) + + super(LocalActivationUnit, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + query, keys = inputs + + keys_len = keys.get_shape()[1] + queries = K.repeat_elements(query, keys_len, 1) + + att_input = tf.concat( + [queries, keys, queries - keys, queries * keys], axis=-1) + + att_out = self.dnn(att_input, training=training) + + attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias) + + return attention_score + + def compute_output_shape(self, input_shape): + return input_shape[1][:2] + (1,) + + def compute_mask(self, inputs, mask): + return mask + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'dropout_rate': self.dropout_rate, 'use_bn': self.use_bn, 'seed': self.seed} + base_config = super(LocalActivationUnit, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DNN(Layer): + """The Multi Layer Percetron + + Input shape + - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``. + + Output shape + - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``. + + Arguments + - **hidden_units**:list of positive integer, the layer number and units in each layer. + + - **activation**: Activation function to use. + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix. + + - **dropout_rate**: float in [0,1). Fraction of the units to dropout. + + - **use_bn**: bool. Whether use BatchNormalization before activation or not. + + - **output_activation**: Activation function to use in the last layer.If ``None``,it will be same as ``activation``. 
+ + - **seed**: A Python integer to use as random seed. + """ + + def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, output_activation=None, + seed=1024, **kwargs): + self.hidden_units = hidden_units + self.activation = activation + self.l2_reg = l2_reg + self.dropout_rate = dropout_rate + self.use_bn = use_bn + self.output_activation = output_activation + self.seed = seed + + super(DNN, self).__init__(**kwargs) + + def build(self, input_shape): + # if len(self.hidden_units) == 0: + # raise ValueError("hidden_units is empty") + input_size = input_shape[-1] + hidden_units = [int(input_size)] + list(self.hidden_units) + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=( + hidden_units[i], hidden_units[i + 1]), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(len(self.hidden_units))] + self.bias = [self.add_weight(name='bias' + str(i), + shape=(self.hidden_units[i],), + initializer=Zeros(), + trainable=True) for i in range(len(self.hidden_units))] + if self.use_bn: + self.bn_layers = [BatchNormalization() for _ in range(len(self.hidden_units))] + + self.dropout_layers = [Dropout(self.dropout_rate, seed=self.seed + i) for i in + range(len(self.hidden_units))] + + self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))] + + if self.output_activation: + self.activation_layers[-1] = activation_layer(self.output_activation) + + super(DNN, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, training=None, **kwargs): + + deep_input = inputs + + for i in range(len(self.hidden_units)): + fc = tf.nn.bias_add(tf.tensordot( + deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i]) + + if self.use_bn: + fc = self.bn_layers[i](fc, training=training) + try: + fc = self.activation_layers[i](fc, training=training) + except TypeError as e: # TypeError: call() got an unexpected keyword argument 'training' + print("make sure the activation function use training flag properly", e) + fc = self.activation_layers[i](fc) + + fc = self.dropout_layers[i](fc, training=training) + deep_input = fc + + return deep_input + + def compute_output_shape(self, input_shape): + if len(self.hidden_units) > 0: + shape = input_shape[:-1] + (self.hidden_units[-1],) + else: + shape = input_shape + + return tuple(shape) + + def get_config(self, ): + config = {'activation': self.activation, 'hidden_units': self.hidden_units, + 'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, + 'output_activation': self.output_activation, 'seed': self.seed} + base_config = super(DNN, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PredictionLayer(Layer): + """ + Arguments + - **task**: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss + + - **use_bias**: bool.Whether add bias term or not. + """ + + def __init__(self, task='binary', use_bias=True, **kwargs): + if task not in ["binary", "multiclass", "regression"]: + raise ValueError("task must be binary,multiclass or regression") + self.task = task + self.use_bias = use_bias + super(PredictionLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if self.use_bias: + self.global_bias = self.add_weight( + shape=(1,), initializer=Zeros(), name="global_bias") + + # Be sure to call this somewhere! 
+        super(PredictionLayer, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        x = inputs
+        if self.use_bias:
+            x = tf.nn.bias_add(x, self.global_bias, data_format='NHWC')
+        if self.task == "binary":
+            x = tf.sigmoid(x)
+
+        output = tf.reshape(x, (-1, 1))
+
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (None, 1)
+
+    def get_config(self, ):
+        config = {'task': self.task, 'use_bias': self.use_bias}
+        base_config = super(PredictionLayer, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/modelzoo/PNN/script/layers/interaction.py b/modelzoo/PNN/script/layers/interaction.py
new file mode 100644
index 00000000000..e18b159059a
--- /dev/null
+++ b/modelzoo/PNN/script/layers/interaction.py
@@ -0,0 +1,1492 @@
+# -*- coding:utf-8 -*-
+"""
+
+Authors:
+    Weichen Shen, weichenswc@163.com,
+    Harshit Pande
+
+"""
+
+import itertools
+
+import tensorflow as tf
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.backend import batch_dot
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, Ones, Constant, TruncatedNormal, \
+        glorot_normal_initializer as glorot_normal, \
+        glorot_uniform_initializer as glorot_uniform
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, Ones, Constant, TruncatedNormal, glorot_normal, glorot_uniform
+
+from tensorflow.python.keras.layers import Layer, MaxPooling2D, Conv2D, Dropout, Lambda, Dense, Flatten
+from tensorflow.python.keras.regularizers import l2
+from tensorflow.python.layers import utils
+
+from .activation import activation_layer
+from .utils import concat_func, reduce_sum, softmax, reduce_mean
+
+
+class AFMLayer(Layer):
+    """Attentional Factorization Machine models pairwise (order-2) feature
+    interactions without linear term and bias.
+
+    Input shape
+        - A list of 3D tensors with shape: ``(batch_size, 1, embedding_size)``.
+
+    Output shape
+        - 2D tensor with shape: ``(batch_size, 1)``.
+
+    Arguments
+        - **attention_factor** : Positive integer, dimensionality of the
+          attention network output space.
+
+        - **l2_reg_w** : float between 0 and 1. L2 regularizer strength
+          applied to the attention network.
+
+        - **dropout_rate** : float in [0,1). Fraction of the attention net output units to drop out.
+
+        - **seed** : A Python integer to use as random seed.
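+
+    A minimal call sketch (``emb_a``, ``emb_b`` and ``emb_c`` are assumed to be
+    embedding lookups, each of shape ``(batch_size, 1, embedding_size)``)::
+
+        afm_logit = AFMLayer(attention_factor=4)([emb_a, emb_b, emb_c])
+        # afm_logit: (batch_size, 1), the attention-weighted sum of all pairwise
+        # element-wise products, projected down to a single logit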
+ + References + - [Attentional Factorization Machines : Learning the Weight of Feature + Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf) + """ + + def __init__(self, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, **kwargs): + self.attention_factor = attention_factor + self.l2_reg_w = l2_reg_w + self.dropout_rate = dropout_rate + self.seed = seed + super(AFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + # input_shape = input_shape[0] + # if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + shape_set = set() + reduced_input_shape = [shape.as_list() for shape in input_shape] + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_input_shape[i])) + + if len(shape_set) > 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `AttentionalFM` layer requires ' + 'inputs of a list with same shape tensor like\ + (None, 1, embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + + embedding_size = int(input_shape[0][-1]) + + self.attention_W = self.add_weight(shape=(embedding_size, + self.attention_factor), initializer=glorot_normal(seed=self.seed), + regularizer=l2(self.l2_reg_w), name="attention_W") + self.attention_b = self.add_weight( + shape=(self.attention_factor,), initializer=Zeros(), name="attention_b") + self.projection_h = self.add_weight(shape=(self.attention_factor, 1), + initializer=glorot_normal(seed=self.seed), name="projection_h") + self.projection_p = self.add_weight(shape=( + embedding_size, 1), initializer=glorot_normal(seed=self.seed), name="projection_p") + self.dropout = Dropout( + self.dropout_rate, seed=self.seed) + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! 
+ super(AFMLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embeds_vec_list = inputs + row = [] + col = [] + + for r, c in itertools.combinations(embeds_vec_list, 2): + row.append(r) + col.append(c) + + p = tf.concat(row, axis=1) + q = tf.concat(col, axis=1) + inner_product = p * q + + bi_interaction = inner_product + attention_temp = tf.nn.relu(tf.nn.bias_add(tf.tensordot( + bi_interaction, self.attention_W, axes=(-1, 0)), self.attention_b)) + # Dense(self.attention_factor,'relu',kernel_regularizer=l2(self.l2_reg_w))(bi_interaction) + self.normalized_att_score = softmax(tf.tensordot( + attention_temp, self.projection_h, axes=(-1, 0)), dim=1) + attention_output = reduce_sum( + self.normalized_att_score * bi_interaction, axis=1) + + attention_output = self.dropout(attention_output, training=training) # training + + afm_out = self.tensordot([attention_output, self.projection_p]) + return afm_out + + def compute_output_shape(self, input_shape): + + if not isinstance(input_shape, list): + raise ValueError('A `AFMLayer` layer should be called ' + 'on a list of inputs.') + return (None, 1) + + def get_config(self, ): + config = {'attention_factor': self.attention_factor, + 'l2_reg_w': self.l2_reg_w, 'dropout_rate': self.dropout_rate, 'seed': self.seed} + base_config = super(AFMLayer, self).get_config() + base_config.update(config) + return base_config + + +class BiInteractionPooling(Layer): + """Bi-Interaction Layer used in Neural FM,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + References + - [He X, Chua T S. Neural factorization machines for sparse predictive analytics[C]//Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017: 355-364.](http://arxiv.org/abs/1708.05027) + """ + + def __init__(self, **kwargs): + + super(BiInteractionPooling, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + + super(BiInteractionPooling, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + concated_embeds_value = inputs + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = 0.5 * (square_of_sum - sum_of_square) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1, input_shape[-1]) + + +class CIN(Layer): + """Compressed Interaction Network used in xDeepFM.This implemention is + adapted from code that the author of the paper published on https://github.com/Leavingseason/xDeepFM. + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, featuremap_num)`` ``featuremap_num = sum(self.layer_size[:-1]) // 2 + self.layer_size[-1]`` if ``split_half=True``,else ``sum(layer_size)`` . 
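+
+    For example, with the default ``layer_size=(128, 128)`` and ``split_half=True``,
+    ``featuremap_num = 128 // 2 + 128 = 192``, so the output has shape ``(batch_size, 192)``.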
+
+    Arguments
+        - **layer_size** : list of int. Feature maps in each layer.
+
+        - **activation** : activation function used on the feature maps.
+
+        - **split_half** : bool. If set to True, half of the feature maps in each hidden layer
+          connect to the output unit and the other half feed the next layer.
+
+        - **l2_reg** : float. L2 regularizer strength applied to the filters.
+
+        - **seed** : A Python integer to use as random seed.
+
+    References
+        - [Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. arXiv preprint arXiv:1803.05170, 2018.](https://arxiv.org/pdf/1803.05170.pdf)
+    """
+
+    def __init__(self, layer_size=(128, 128), activation='relu', split_half=True, l2_reg=1e-5, seed=1024, **kwargs):
+        if len(layer_size) == 0:
+            raise ValueError(
+                "layer_size must be a non-empty list(tuple)")
+        self.layer_size = layer_size
+        self.split_half = split_half
+        self.activation = activation
+        self.l2_reg = l2_reg
+        self.seed = seed
+        super(CIN, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        if len(input_shape) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape)))
+
+        self.field_nums = [int(input_shape[1])]
+        self.filters = []
+        self.bias = []
+        for i, size in enumerate(self.layer_size):
+
+            self.filters.append(self.add_weight(name='filter' + str(i),
+                                                shape=[1, self.field_nums[-1] * self.field_nums[0], size],
+                                                dtype=tf.float32,
+                                                initializer=glorot_uniform(seed=self.seed + i),
+                                                regularizer=l2(self.l2_reg)))
+
+            self.bias.append(self.add_weight(name='bias' + str(i), shape=[size], dtype=tf.float32,
+                                             initializer=Zeros()))
+
+            if self.split_half:
+                if i != len(self.layer_size) - 1 and size % 2 > 0:
+                    raise ValueError(
+                        "layer_size must be even number except for the last layer when split_half=True")
+
+                self.field_nums.append(size // 2)
+            else:
+                self.field_nums.append(size)
+
+        self.activation_layers = [activation_layer(
+            self.activation) for _ in self.layer_size]
+
+        super(CIN, self).build(input_shape)  # Be sure to call this somewhere!
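+
+    # Shape sketch for call() below (b = batch size, m = num fields, d = embedding dim,
+    # H_k = self.field_nums[k], with H_0 = m):
+    #   hidden_nn_layers[k] : (b, H_k, d)
+    #   dot_result          : (b, d, H_0 * H_k)  outer products taken per embedding slot
+    #   curr_out            : (b, layer_size[k], d)  after the 1x1 conv1d and transpose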
+
+    def call(self, inputs, **kwargs):
+
+        if K.ndim(inputs) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))
+
+        dim = int(inputs.get_shape()[-1])
+        hidden_nn_layers = [inputs]
+        final_result = []
+
+        split_tensor0 = tf.split(hidden_nn_layers[0], dim * [1], 2)
+        for idx, layer_size in enumerate(self.layer_size):
+            split_tensor = tf.split(hidden_nn_layers[-1], dim * [1], 2)
+
+            dot_result_m = tf.matmul(
+                split_tensor0, split_tensor, transpose_b=True)
+
+            dot_result_o = tf.reshape(
+                dot_result_m, shape=[dim, -1, self.field_nums[0] * self.field_nums[idx]])
+
+            dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2])
+
+            curr_out = tf.nn.conv1d(
+                dot_result, filters=self.filters[idx], stride=1, padding='VALID')
+
+            curr_out = tf.nn.bias_add(curr_out, self.bias[idx])
+
+            curr_out = self.activation_layers[idx](curr_out)
+
+            curr_out = tf.transpose(curr_out, perm=[0, 2, 1])
+
+            if self.split_half:
+                if idx != len(self.layer_size) - 1:
+                    next_hidden, direct_connect = tf.split(
+                        curr_out, 2 * [layer_size // 2], 1)
+                else:
+                    direct_connect = curr_out
+                    next_hidden = 0
+            else:
+                direct_connect = curr_out
+                next_hidden = curr_out
+
+            final_result.append(direct_connect)
+            hidden_nn_layers.append(next_hidden)
+
+        result = tf.concat(final_result, axis=1)
+        result = reduce_sum(result, -1, keep_dims=False)
+
+        return result
+
+    def compute_output_shape(self, input_shape):
+        if self.split_half:
+            featuremap_num = sum(
+                self.layer_size[:-1]) // 2 + self.layer_size[-1]
+        else:
+            featuremap_num = sum(self.layer_size)
+        return (None, featuremap_num)
+
+    def get_config(self, ):
+
+        config = {'layer_size': self.layer_size, 'split_half': self.split_half, 'activation': self.activation,
+                  'seed': self.seed}
+        base_config = super(CIN, self).get_config()
+        base_config.update(config)
+        return base_config
+
+
+class CrossNet(Layer):
+    """The Cross Network part of the Deep&Cross Network model,
+    which learns both low- and high-degree cross features.
+
+    Input shape
+        - 2D tensor with shape: ``(batch_size, units)``.
+
+    Output shape
+        - 2D tensor with shape: ``(batch_size, units)``.
+
+    Arguments
+        - **layer_num**: Positive integer, the cross layer number
+
+        - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix
+
+        - **parameterization**: string, ``"vector"`` or ``"matrix"``, the way to parameterize the cross network.
+
+        - **seed**: A Python integer to use as random seed.
+
+    References
+        - [Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]//Proceedings of the ADKDD'17.
ACM, 2017: 12.](https://arxiv.org/abs/1708.05123) + """ + + def __init__(self, layer_num=2, parameterization='vector', l2_reg=0, seed=1024, **kwargs): + self.layer_num = layer_num + self.parameterization = parameterization + self.l2_reg = l2_reg + self.seed = seed + print('CrossNet parameterization:', self.parameterization) + super(CrossNet, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + if self.parameterization == 'vector': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, 1), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + elif self.parameterization == 'matrix': + self.kernels = [self.add_weight(name='kernel' + str(i), + shape=(dim, dim), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNet, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + if self.parameterization == 'vector': + xl_w = tf.tensordot(x_l, self.kernels[i], axes=(1, 0)) + dot_ = tf.matmul(x_0, xl_w) + x_l = dot_ + self.bias[i] + x_l + elif self.parameterization == 'matrix': + xl_w = tf.einsum('ij,bjk->bik', self.kernels[i], x_l) # W * xi (bs, dim, 1) + dot_ = xl_w + self.bias[i] # W * xi + b + x_l = x_0 * dot_ + x_l # x0 · (W * xi + b) +xl Hadamard-product + else: # error + raise ValueError("parameterization should be 'vector' or 'matrix'") + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'layer_num': self.layer_num, 'parameterization': self.parameterization, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNet, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class CrossNetMix(Layer): + """The Cross Network part of DCN-Mix model, which improves DCN-M by: + 1 add MOE to learn feature interactions in different subspaces + 2 add nonlinear transformations in low-dimensional space + + Input shape + - 2D tensor with shape: ``(batch_size, units)``. + + Output shape + - 2D tensor with shape: ``(batch_size, units)``. + + Arguments + - **low_rank** : Positive integer, dimensionality of low-rank sapce. + + - **num_experts** : Positive integer, number of experts. + + - **layer_num**: Positive integer, the cross layer number + + - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix + + - **seed**: A Python integer to use as random seed. + + References + - [Wang R, Shivanna R, Cheng D Z, et al. DCN-M: Improved Deep & Cross Network for Feature Cross Learning in Web-scale Learning to Rank Systems[J]. 
2020.](https://arxiv.org/abs/2008.13535) + """ + + def __init__(self, low_rank=32, num_experts=4, layer_num=2, l2_reg=0, seed=1024, **kwargs): + self.low_rank = low_rank + self.num_experts = num_experts + self.layer_num = layer_num + self.l2_reg = l2_reg + self.seed = seed + super(CrossNetMix, self).__init__(**kwargs) + + def build(self, input_shape): + + if len(input_shape) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (len(input_shape),)) + + dim = int(input_shape[-1]) + + # U: (dim, low_rank) + self.U_list = [self.add_weight(name='U_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # V: (dim, low_rank) + self.V_list = [self.add_weight(name='V_list' + str(i), + shape=(self.num_experts, dim, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + # C: (low_rank, low_rank) + self.C_list = [self.add_weight(name='C_list' + str(i), + shape=(self.num_experts, self.low_rank, self.low_rank), + initializer=glorot_normal( + seed=self.seed), + regularizer=l2(self.l2_reg), + trainable=True) for i in range(self.layer_num)] + + self.gating = [Dense(1, use_bias=False) for i in range(self.num_experts)] + + self.bias = [self.add_weight(name='bias' + str(i), + shape=(dim, 1), + initializer=Zeros(), + trainable=True) for i in range(self.layer_num)] + # Be sure to call this somewhere! + super(CrossNetMix, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 2: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 2 dimensions" % (K.ndim(inputs))) + + x_0 = tf.expand_dims(inputs, axis=2) + x_l = x_0 + for i in range(self.layer_num): + output_of_experts = [] + gating_score_of_experts = [] + for expert_id in range(self.num_experts): + # (1) G(x_l) + # compute the gating score by x_l + gating_score_of_experts.append(self.gating[expert_id](tf.squeeze(x_l, axis=2))) + + # (2) E(x_l) + # project the input x_l to $\mathbb{R}^{r}$ + v_x = tf.einsum('ij,bjk->bik', tf.transpose(self.V_list[i][expert_id]), x_l) # (bs, low_rank, 1) + + # nonlinear activation in low rank space + v_x = tf.nn.tanh(v_x) + v_x = tf.einsum('ij,bjk->bik', self.C_list[i][expert_id], v_x) # (bs, low_rank, 1) + v_x = tf.nn.tanh(v_x) + + # project back to $\mathbb{R}^{d}$ + uv_x = tf.einsum('ij,bjk->bik', self.U_list[i][expert_id], v_x) # (bs, dim, 1) + + dot_ = uv_x + self.bias[i] + dot_ = x_0 * dot_ # Hadamard-product + + output_of_experts.append(tf.squeeze(dot_, axis=2)) + + # (3) mixture of low-rank experts + output_of_experts = tf.stack(output_of_experts, 2) # (bs, dim, num_experts) + gating_score_of_experts = tf.stack(gating_score_of_experts, 1) # (bs, num_experts, 1) + moe_out = tf.matmul(output_of_experts, tf.nn.softmax(gating_score_of_experts, 1)) + x_l = moe_out + x_l # (bs, dim, 1) + x_l = tf.squeeze(x_l, axis=2) + return x_l + + def get_config(self, ): + + config = {'low_rank': self.low_rank, 'num_experts': self.num_experts, 'layer_num': self.layer_num, + 'l2_reg': self.l2_reg, 'seed': self.seed} + base_config = super(CrossNetMix, self).get_config() + base_config.update(config) + return base_config + + def compute_output_shape(self, input_shape): + return input_shape + + +class FM(Layer): + """Factorization Machine models pairwise (order-2) feature interactions + without linear term and bias. 
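+
+    Writing the stacked field embeddings as ``v_i``, it computes the classic
+    second-order term ``0.5 * sum_f((sum_i v_{i,f})^2 - sum_i v_{i,f}^2)``;
+    the square-of-sum minus sum-of-square trick in ``call`` below implements
+    exactly this in O(N*d) instead of enumerating all pairs.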
+ + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. + + References + - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) + """ + + def __init__(self, **kwargs): + + super(FM, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + super(FM, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + concated_embeds_value = inputs + + square_of_sum = tf.square(reduce_sum( + concated_embeds_value, axis=1, keep_dims=True)) + sum_of_square = reduce_sum( + concated_embeds_value * concated_embeds_value, axis=1, keep_dims=True) + cross_term = square_of_sum - sum_of_square + cross_term = 0.5 * reduce_sum(cross_term, axis=2, keep_dims=False) + + return cross_term + + def compute_output_shape(self, input_shape): + return (None, 1) + + + +class InnerProductLayer(Layer): + """InnerProduct Layer used in PNN that compute the element-wise + product or inner product between feature vectors. + + Input shape + - a list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 3D tensor with shape: ``(batch_size, N*(N-1)/2 ,1)`` if use reduce_sum. or 3D tensor with shape: ``(batch_size, N*(N-1)/2, embedding_size )`` if not use reduce_sum. + + Arguments + - **reduce_sum**: bool. Whether return inner product or element-wise product + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf) + """ + + def __init__(self, reduce_sum=True, **kwargs): + self.reduce_sum = reduce_sum + super(InnerProductLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `InnerProductLayer` layer should be called ' + 'on a list of at least 2 inputs') + + reduced_inputs_shapes = [shape.as_list() for shape in input_shape] + shape_set = set() + + for i in range(len(input_shape)): + shape_set.add(tuple(reduced_inputs_shapes[i])) + + if len(shape_set) > 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs with same shapes ' + 'Got different shapes: %s' % (shape_set)) + + if len(input_shape[0]) != 3 or input_shape[0][1] != 1: + raise ValueError('A `InnerProductLayer` layer requires ' + 'inputs of a list with same shape tensor like (None,1,embedding_size)' + 'Got different shapes: %s' % (input_shape[0])) + super(InnerProductLayer, self).build( + input_shape) # Be sure to call this somewhere! 
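+    # Usage sketch (illustrative, not part of the layer): given three
+    # (batch, 1, K) field embeddings, `call` enumerates the 3 * (3 - 1) / 2 = 3
+    # field pairs (0, 1), (0, 2), (1, 2) via the row/col index lists and returns
+    # their inner products, i.e. a (batch, 3, 1) tensor when reduce_sum=True.
+    # The tensor names below are placeholders, not part of this file.
+    #
+    #     e1, e2, e3 = [tf.random.normal((32, 1, 8)) for _ in range(3)]
+    #     pairwise = InnerProductLayer(reduce_sum=True)([e1, e2, e3])
+    #     # pairwise.shape == (32, 3, 1)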
+ + def call(self, inputs, **kwargs): + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embed_list = inputs + row = [] + col = [] + num_inputs = len(embed_list) + + for i in range(num_inputs - 1): + for j in range(i + 1, num_inputs): + row.append(i) + col.append(j) + p = tf.concat([embed_list[idx] + for idx in row], axis=1) # batch num_pairs k + q = tf.concat([embed_list[idx] + for idx in col], axis=1) + + inner_product = p * q + if self.reduce_sum: + inner_product = reduce_sum( + inner_product, axis=2, keep_dims=True) + return inner_product + + def compute_output_shape(self, input_shape): + num_inputs = len(input_shape) + num_pairs = int(num_inputs * (num_inputs - 1) / 2) + input_shape = input_shape[0] + embed_size = input_shape[-1] + if self.reduce_sum: + return (input_shape[0], num_pairs, 1) + else: + return (input_shape[0], num_pairs, embed_size) + + def get_config(self, ): + config = {'reduce_sum': self.reduce_sum, } + base_config = super(InnerProductLayer, self).get_config() + base_config.update(config) + return base_config + + +class InteractingLayer(Layer): + """A Layer used in AutoInt that model the correlations between different feature fields by multi-head self-attention mechanism. + + Input shape + - A 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 3D tensor with shape:``(batch_size,field_size,att_embedding_size * head_num)``. + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **use_res**: bool.Whether or not use standard residual connections before output. + - **seed**: A Python integer to use as random seed. + + References + - [Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.](https://arxiv.org/abs/1810.11921) + """ + + def __init__(self, att_embedding_size=8, head_num=2, use_res=True, scaling=False, seed=1024, **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.use_res = use_res + self.seed = seed + self.scaling = scaling + super(InteractingLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + embedding_size = int(input_shape[-1]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.use_res: + self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + + # Be sure to call this somewhere! 
+ super(InteractingLayer, self).build(input_shape) + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + querys = tf.tensordot(inputs, self.W_Query, + axes=(-1, 0)) # None F D*head_num + keys = tf.tensordot(inputs, self.W_key, axes=(-1, 0)) + values = tf.tensordot(inputs, self.W_Value, axes=(-1, 0)) + + # head_num None F D + querys = tf.stack(tf.split(querys, self.head_num, axis=2)) + keys = tf.stack(tf.split(keys, self.head_num, axis=2)) + values = tf.stack(tf.split(values, self.head_num, axis=2)) + + inner_product = tf.matmul( + querys, keys, transpose_b=True) # head_num None F F + if self.scaling: + inner_product /= self.att_embedding_size ** 0.5 + self.normalized_att_scores = softmax(inner_product) + + result = tf.matmul(self.normalized_att_scores, + values) # head_num None F D + result = tf.concat(tf.split(result, self.head_num, ), axis=-1) + result = tf.squeeze(result, axis=0) # None F D*head_num + + if self.use_res: + result += tf.tensordot(inputs, self.W_Res, axes=(-1, 0)) + result = tf.nn.relu(result) + + return result + + def compute_output_shape(self, input_shape): + + return (None, input_shape[1], self.att_embedding_size * self.head_num) + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, 'use_res': self.use_res, + 'seed': self.seed} + base_config = super(InteractingLayer, self).get_config() + base_config.update(config) + return base_config + + +class OutterProductLayer(Layer): + """OutterProduct Layer used in PNN.This implemention is + adapted from code that the author of the paper published on https://github.com/Atomu2014/product-nets. + + Input shape + - A list of N 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - 2D tensor with shape:``(batch_size,N*(N-1)/2 )``. + + Arguments + - **kernel_type**: str. The kernel weight matrix type to use,can be mat,vec or num + + - **seed**: A Python integer to use as random seed. + + References + - [Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. 
IEEE, 2016: 1149-1154.](https://arxiv.org/pdf/1611.00144.pdf)
+    """
+
+    def __init__(self, kernel_type='mat', seed=1024, **kwargs):
+        if kernel_type not in ['mat', 'vec', 'num']:
+            raise ValueError("kernel_type must be 'mat', 'vec' or 'num'")
+        self.kernel_type = kernel_type
+        self.seed = seed
+        super(OutterProductLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+
+        if not isinstance(input_shape, list) or len(input_shape) < 2:
+            raise ValueError('A `OutterProductLayer` layer should be called '
+                             'on a list of at least 2 inputs')
+
+        reduced_inputs_shapes = [shape.as_list() for shape in input_shape]
+        shape_set = set()
+
+        for i in range(len(input_shape)):
+            shape_set.add(tuple(reduced_inputs_shapes[i]))
+
+        if len(shape_set) > 1:
+            raise ValueError('A `OutterProductLayer` layer requires '
+                             'inputs with same shapes. '
+                             'Got different shapes: %s' % (shape_set))
+
+        if len(input_shape[0]) != 3 or input_shape[0][1] != 1:
+            raise ValueError('A `OutterProductLayer` layer requires '
+                             'inputs of a list with same shape tensor like (None, 1, embedding_size). '
+                             'Got different shapes: %s' % (input_shape[0]))
+        num_inputs = len(input_shape)
+        num_pairs = int(num_inputs * (num_inputs - 1) / 2)
+        input_shape = input_shape[0]
+        embed_size = int(input_shape[-1])
+        if self.kernel_type == 'mat':
+            self.kernel = self.add_weight(shape=(embed_size, num_pairs, embed_size),
+                                          initializer=glorot_uniform(seed=self.seed),
+                                          name='kernel')
+        elif self.kernel_type == 'vec':
+            self.kernel = self.add_weight(shape=(num_pairs, embed_size,),
+                                          initializer=glorot_uniform(self.seed),
+                                          name='kernel')
+        elif self.kernel_type == 'num':
+            self.kernel = self.add_weight(shape=(num_pairs, 1),
+                                          initializer=glorot_uniform(self.seed),
+                                          name='kernel')
+
+        super(OutterProductLayer, self).build(
+            input_shape)  # Be sure to call this somewhere!
+
+    def call(self, inputs, **kwargs):
+
+        if K.ndim(inputs[0]) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))
+
+        embed_list = inputs
+        row = []
+        col = []
+        num_inputs = len(embed_list)
+        for i in range(num_inputs - 1):
+            for j in range(i + 1, num_inputs):
+                row.append(i)
+                col.append(j)
+        p = tf.concat([embed_list[idx]
+                       for idx in row], axis=1)  # batch * num_pairs * k
+        q = tf.concat([embed_list[idx]
+                       for idx in col], axis=1)  # batch * num_pairs * k
+
+        if self.kernel_type == 'mat':
+            # kernel: k * num_pairs * k
+            p = tf.expand_dims(p, 1)  # batch * 1 * num_pairs * k
+            kp = reduce_sum(
+                tf.multiply(
+                    tf.transpose(
+                        reduce_sum(
+                            tf.multiply(p, self.kernel),  # batch * k * num_pairs * k
+                            -1),  # batch * k * num_pairs
+                        [0, 2, 1]),  # batch * num_pairs * k
+                    q),  # batch * num_pairs * k
+                -1)  # batch * num_pairs
+        else:
+            # kernel: 1 * num_pairs * (k or 1)
+            k = tf.expand_dims(self.kernel, 0)
+            kp = reduce_sum(p * q * k, -1)  # batch * num_pairs
+
+        return kp
+
+    def compute_output_shape(self, input_shape):
+        num_inputs = len(input_shape)
+        num_pairs = int(num_inputs * (num_inputs - 1) / 2)
+        return (None, num_pairs)
+
+    def get_config(self, ):
+        config = {'kernel_type': self.kernel_type, 'seed': self.seed}
+        base_config = super(OutterProductLayer, self).get_config()
+        base_config.update(config)
+        return base_config
+
+
+class FGCNNLayer(Layer):
+    """Feature Generation Layer used in FGCNN, including Convolution, MaxPooling and Recombination.
+
+    Input shape
+        - A 3D tensor with shape: ``(batch_size, field_size, embedding_size)``.
+
+    Output shape
+        - 3D tensor with shape: ``(batch_size, new_feature_num, embedding_size)``.
+
+    References
+        - [Liu B, Tang R, Chen Y, et al. Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction[J]. arXiv preprint arXiv:1904.04447, 2019.](https://arxiv.org/pdf/1904.04447)
+
+    """
+
+    def __init__(self, filters=(14, 16,), kernel_width=(7, 7,), new_maps=(3, 3,), pooling_width=(2, 2),
+                 **kwargs):
+        if not (len(filters) == len(kernel_width) == len(new_maps) == len(pooling_width)):
+            raise ValueError("filters, kernel_width, new_maps and pooling_width must have equal length")
+        self.filters = filters
+        self.kernel_width = kernel_width
+        self.new_maps = new_maps
+        self.pooling_width = pooling_width
+
+        super(FGCNNLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+
+        if len(input_shape) != 3:
+            raise ValueError(
+                "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape)))
+        self.conv_layers = []
+        self.pooling_layers = []
+        self.dense_layers = []
+        pooling_shape = input_shape.as_list() + [1, ]
+        embedding_size = int(input_shape[-1])
+        for i in range(1, len(self.filters) + 1):
+            filters = self.filters[i - 1]
+            width = self.kernel_width[i - 1]
+            new_filters = self.new_maps[i - 1]
+            pooling_width = self.pooling_width[i - 1]
+            conv_output_shape = self._conv_output_shape(
+                pooling_shape, (width, 1))
+            pooling_shape = self._pooling_output_shape(
+                conv_output_shape, (pooling_width, 1))
+            self.conv_layers.append(Conv2D(filters=filters, kernel_size=(width, 1), strides=(1, 1),
+                                           padding='same',
+                                           activation='tanh', use_bias=True, ))
+            self.pooling_layers.append(
+                MaxPooling2D(pool_size=(pooling_width, 1)))
+            self.dense_layers.append(Dense(pooling_shape[1] * embedding_size * new_filters,
+                                           activation='tanh', use_bias=True))
+
+        self.flatten = Flatten()
+
+        super(FGCNNLayer, self).build(
+            input_shape)  # Be sure to call this somewhere!
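+    # Worked example (illustrative): with the defaults filters=(14, 16),
+    # kernel_width=(7, 7), new_maps=(3, 3), pooling_width=(2, 2) and an input of
+    # shape (batch, 10, 8), the first conv/pool block reduces the 10 fields to
+    # 10 // 2 = 5 and recombines 3 * 5 = 15 new features; the second reduces 5 to
+    # 5 // 2 = 2 and adds 3 * 2 = 6 more, so the layer outputs (batch, 21, 8).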
+ + def call(self, inputs, **kwargs): + + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + embedding_size = int(inputs.shape[-1]) + pooling_result = tf.expand_dims(inputs, axis=3) + + new_feature_list = [] + + for i in range(1, len(self.filters) + 1): + new_filters = self.new_maps[i - 1] + + conv_result = self.conv_layers[i - 1](pooling_result) + + pooling_result = self.pooling_layers[i - 1](conv_result) + + flatten_result = self.flatten(pooling_result) + + new_result = self.dense_layers[i - 1](flatten_result) + + new_feature_list.append( + tf.reshape(new_result, (-1, int(pooling_result.shape[1]) * new_filters, embedding_size))) + + new_features = concat_func(new_feature_list, axis=1) + return new_features + + def compute_output_shape(self, input_shape): + + new_features_num = 0 + features_num = input_shape[1] + + for i in range(0, len(self.kernel_width)): + pooled_features_num = features_num // self.pooling_width[i] + new_features_num += self.new_maps[i] * pooled_features_num + features_num = pooled_features_num + + return (None, new_features_num, input_shape[-1]) + + def get_config(self, ): + config = {'kernel_width': self.kernel_width, 'filters': self.filters, 'new_maps': self.new_maps, + 'pooling_width': self.pooling_width} + base_config = super(FGCNNLayer, self).get_config() + base_config.update(config) + return base_config + + def _conv_output_shape(self, input_shape, kernel_size): + # channels_last + space = input_shape[1:-1] + new_space = [] + for i in range(len(space)): + new_dim = utils.conv_output_length( + space[i], + kernel_size[i], + padding='same', + stride=1, + dilation=1) + new_space.append(new_dim) + return ([input_shape[0]] + new_space + [self.filters]) + + def _pooling_output_shape(self, input_shape, pool_size): + # channels_last + + rows = input_shape[1] + cols = input_shape[2] + rows = utils.conv_output_length(rows, pool_size[0], 'valid', + pool_size[0]) + cols = utils.conv_output_length(cols, pool_size[1], 'valid', + pool_size[1]) + return [input_shape[0], rows, cols, input_shape[3]] + + +class SENETLayer(Layer): + """SENETLayer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Output shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. + + Arguments + - **reduction_ratio** : Positive integer, dimensionality of the + attention network output space. + + - **seed** : A Python integer to use as random seed. 
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + """ + + def __init__(self, reduction_ratio=3, seed=1024, **kwargs): + self.reduction_ratio = reduction_ratio + + self.seed = seed + super(SENETLayer, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + + self.filed_size = len(input_shape) + self.embedding_size = input_shape[0][-1] + reduction_size = max(1, self.filed_size // self.reduction_ratio) + + self.W_1 = self.add_weight(shape=( + self.filed_size, reduction_size), initializer=glorot_normal(seed=self.seed), name="W_1") + self.W_2 = self.add_weight(shape=( + reduction_size, self.filed_size), initializer=glorot_normal(seed=self.seed), name="W_2") + + self.tensordot = Lambda( + lambda x: tf.tensordot(x[0], x[1], axes=(-1, 0))) + + # Be sure to call this somewhere! + super(SENETLayer, self).build(input_shape) + + def call(self, inputs, training=None, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + inputs = concat_func(inputs, axis=1) + Z = reduce_mean(inputs, axis=-1, ) + + A_1 = tf.nn.relu(self.tensordot([Z, self.W_1])) + A_2 = tf.nn.relu(self.tensordot([A_1, self.W_2])) + V = tf.multiply(inputs, tf.expand_dims(A_2, axis=2)) + + return tf.split(V, self.filed_size, axis=1) + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return [None] * self.filed_size + + def get_config(self, ): + config = {'reduction_ratio': self.reduction_ratio, 'seed': self.seed} + base_config = super(SENETLayer, self).get_config() + base_config.update(config) + return base_config + + +class BilinearInteraction(Layer): + """BilinearInteraction Layer used in FiBiNET. + + Input shape + - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``. Its length is ``filed_size``. + + Output shape + - 3D tensor with shape: ``(batch_size,filed_size*(filed_size-1)/2,embedding_size)``. + + Arguments + - **bilinear_type** : String, types of bilinear functions used in this layer. + + - **seed** : A Python integer to use as random seed. 
+ + References + - [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) + + """ + + def __init__(self, bilinear_type="interaction", seed=1024, **kwargs): + self.bilinear_type = bilinear_type + self.seed = seed + + super(BilinearInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError('A `AttentionalFM` layer should be called ' + 'on a list of at least 2 inputs') + embedding_size = int(input_shape[0][-1]) + + if self.bilinear_type == "all": + self.W = self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight") + elif self.bilinear_type == "each": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i)) for i in range(len(input_shape) - 1)] + elif self.bilinear_type == "interaction": + self.W_list = [self.add_weight(shape=(embedding_size, embedding_size), initializer=glorot_normal( + seed=self.seed), name="bilinear_weight" + str(i) + '_' + str(j)) for i, j in + itertools.combinations(range(len(input_shape)), 2)] + else: + raise NotImplementedError + + super(BilinearInteraction, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs))) + + n = len(inputs) + if self.bilinear_type == "all": + vidots = [tf.tensordot(inputs[i], self.W, axes=(-1, 0)) for i in range(n)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "each": + vidots = [tf.tensordot(inputs[i], self.W_list[i], axes=(-1, 0)) for i in range(n - 1)] + p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)] + elif self.bilinear_type == "interaction": + p = [tf.multiply(tf.tensordot(v[0], w, axes=(-1, 0)), v[1]) + for v, w in zip(itertools.combinations(inputs, 2), self.W_list)] + else: + raise NotImplementedError + output = concat_func(p, axis=1) + return output + + def compute_output_shape(self, input_shape): + filed_size = len(input_shape) + embedding_size = input_shape[0][-1] + + return (None, filed_size * (filed_size - 1) // 2, embedding_size) + + def get_config(self, ): + config = {'bilinear_type': self.bilinear_type, 'seed': self.seed} + base_config = super(BilinearInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FieldWiseBiInteraction(Layer): + """Field-Wise Bi-Interaction Layer used in FLEN,compress the + pairwise element-wise product of features into one single vector. + + Input shape + - A list of 3D tensor with shape:``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size,embedding_size)``. + + Arguments + - **use_bias** : Boolean, if use bias. + - **seed** : A Python integer to use as random seed. 
+ + References + - [FLEN: Leveraging Field for Scalable CTR Prediction](https://arxiv.org/pdf/1911.04690) + + """ + + def __init__(self, use_bias=True, seed=1024, **kwargs): + self.use_bias = use_bias + self.seed = seed + + super(FieldWiseBiInteraction, self).__init__(**kwargs) + + def build(self, input_shape): + + if not isinstance(input_shape, list) or len(input_shape) < 2: + raise ValueError( + 'A `Field-Wise Bi-Interaction` layer should be called ' + 'on a list of at least 2 inputs') + + self.num_fields = len(input_shape) + embedding_size = input_shape[0][-1] + + self.kernel_mf = self.add_weight( + name='kernel_mf', + shape=(int(self.num_fields * (self.num_fields - 1) / 2), 1), + initializer=Ones(), + regularizer=None, + trainable=True) + + self.kernel_fm = self.add_weight( + name='kernel_fm', + shape=(self.num_fields, 1), + initializer=Constant(value=0.5), + regularizer=None, + trainable=True) + if self.use_bias: + self.bias_mf = self.add_weight(name='bias_mf', + shape=(embedding_size), + initializer=Zeros()) + self.bias_fm = self.add_weight(name='bias_fm', + shape=(embedding_size), + initializer=Zeros()) + + super(FieldWiseBiInteraction, + self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + + if K.ndim(inputs[0]) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % + (K.ndim(inputs))) + + field_wise_embeds_list = inputs + + # MF module + field_wise_vectors = tf.concat([ + reduce_sum(field_i_vectors, axis=1, keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ], 1) + + left = [] + right = [] + + for i, j in itertools.combinations(list(range(self.num_fields)), 2): + left.append(i) + right.append(j) + + embeddings_left = tf.gather(params=field_wise_vectors, + indices=left, + axis=1) + embeddings_right = tf.gather(params=field_wise_vectors, + indices=right, + axis=1) + + embeddings_prod = embeddings_left * embeddings_right + field_weighted_embedding = embeddings_prod * self.kernel_mf + h_mf = reduce_sum(field_weighted_embedding, axis=1) + if self.use_bias: + h_mf = tf.nn.bias_add(h_mf, self.bias_mf) + + # FM module + square_of_sum_list = [ + tf.square(reduce_sum(field_i_vectors, axis=1, keep_dims=True)) + for field_i_vectors in field_wise_embeds_list + ] + sum_of_square_list = [ + reduce_sum(field_i_vectors * field_i_vectors, + axis=1, + keep_dims=True) + for field_i_vectors in field_wise_embeds_list + ] + + field_fm = tf.concat([ + square_of_sum - sum_of_square for square_of_sum, sum_of_square in + zip(square_of_sum_list, sum_of_square_list) + ], 1) + + h_fm = reduce_sum(field_fm * self.kernel_fm, axis=1) + if self.use_bias: + h_fm = tf.nn.bias_add(h_fm, self.bias_fm) + + return h_mf + h_fm + + def compute_output_shape(self, input_shape): + return (None, input_shape[0][-1]) + + def get_config(self, ): + config = {'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(FieldWiseBiInteraction, self).get_config() + base_config.update(config) + return base_config + + +class FwFMLayer(Layer): + """Field-weighted Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. + + Output shape + - 2D tensor with shape: ``(batch_size, 1)``. 
+ + Arguments + - **num_fields** : integer for number of fields + - **regularizer** : L2 regularizer weight for the field strength parameters of FwFM + + References + - [Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising] + https://arxiv.org/pdf/1806.03514.pdf + """ + + def __init__(self, num_fields=4, regularizer=0.000001, **kwargs): + self.num_fields = num_fields + self.regularizer = regularizer + super(FwFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + if input_shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, input_shape[1])) + + self.field_strengths = self.add_weight(name='field_pair_strengths', + shape=(self.num_fields, self.num_fields), + initializer=TruncatedNormal(), + regularizer=l2(self.regularizer), + trainable=True) + + super(FwFMLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + if inputs.shape[1] != self.num_fields: + raise ValueError("Mismatch in number of fields {} and \ + concatenated embeddings dims {}".format(self.num_fields, inputs.shape[1])) + + pairwise_inner_prods = [] + for fi, fj in itertools.combinations(range(self.num_fields), 2): + # get field strength for pair fi and fj + r_ij = self.field_strengths[fi, fj] + + # get embeddings for the features of both the fields + feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1) + feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1) + + f = tf.scalar_mul(r_ij, batch_dot(feat_embed_i, feat_embed_j, axes=1)) + pairwise_inner_prods.append(f) + + sum_ = tf.add_n(pairwise_inner_prods) + return sum_ + + def compute_output_shape(self, input_shape): + return (None, 1) + + def get_config(self): + config = super(FwFMLayer, self).get_config().copy() + config.update({ + 'num_fields': self.num_fields, + 'regularizer': self.regularizer + }) + return config + + +class FEFMLayer(Layer): + """Field-Embedded Factorization Machines + + Input shape + - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``. 
+ + Output shape + - 2D tensor with shape: + ``(batch_size, (num_fields * (num_fields-1))/2)`` # concatenated FEFM interaction embeddings + + Arguments + - **regularizer** : L2 regularizer weight for the field pair matrix embeddings parameters of FEFM + + References + - [Field-Embedded Factorization Machines for Click-through Rate Prediction] + https://arxiv.org/pdf/2009.09931.pdf + """ + + def __init__(self, regularizer, **kwargs): + self.regularizer = regularizer + super(FEFMLayer, self).__init__(**kwargs) + + def build(self, input_shape): + if len(input_shape) != 3: + raise ValueError("Unexpected inputs dimensions % d,\ + expect to be 3 dimensions" % (len(input_shape))) + + self.num_fields = int(input_shape[1]) + embedding_size = int(input_shape[2]) + + self.field_embeddings = {} + for fi, fj in itertools.combinations(range(self.num_fields), 2): + field_pair_id = str(fi) + "-" + str(fj) + self.field_embeddings[field_pair_id] = self.add_weight(name='field_embeddings' + field_pair_id, + shape=(embedding_size, embedding_size), + initializer=TruncatedNormal(), + regularizer=l2(self.regularizer), + trainable=True) + + super(FEFMLayer, self).build(input_shape) # Be sure to call this somewhere! + + def call(self, inputs, **kwargs): + if K.ndim(inputs) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" + % (K.ndim(inputs))) + + pairwise_inner_prods = [] + for fi, fj in itertools.combinations(range(self.num_fields), 2): + field_pair_id = str(fi) + "-" + str(fj) + feat_embed_i = tf.squeeze(inputs[0:, fi:fi + 1, 0:], axis=1) + feat_embed_j = tf.squeeze(inputs[0:, fj:fj + 1, 0:], axis=1) + field_pair_embed_ij = self.field_embeddings[field_pair_id] + + feat_embed_i_tr = tf.matmul(feat_embed_i, field_pair_embed_ij + tf.transpose(field_pair_embed_ij)) + + f = batch_dot(feat_embed_i_tr, feat_embed_j, axes=1) + pairwise_inner_prods.append(f) + + concat_vec = tf.concat(pairwise_inner_prods, axis=1) + return concat_vec + + def compute_output_shape(self, input_shape): + num_fields = int(input_shape[1]) + return (None, (num_fields * (num_fields - 1)) / 2) + + def get_config(self): + config = super(FEFMLayer, self).get_config().copy() + config.update({ + 'regularizer': self.regularizer, + }) + return config diff --git a/modelzoo/PNN/script/layers/normalization.py b/modelzoo/PNN/script/layers/normalization.py new file mode 100644 index 00000000000..3fceb1257d8 --- /dev/null +++ b/modelzoo/PNN/script/layers/normalization.py @@ -0,0 +1,51 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +from tensorflow.python.keras import backend as K +from tensorflow.python.keras.layers import Layer + +try: + from tensorflow.python.ops.init_ops import Zeros, Ones +except ImportError: + from tensorflow.python.ops.init_ops_v2 import Zeros, Ones + + +class LayerNormalization(Layer): + def __init__(self, axis=-1, eps=1e-9, center=True, + scale=True, **kwargs): + self.axis = axis + self.eps = eps + self.center = center + self.scale = scale + super(LayerNormalization, self).__init__(**kwargs) + + def build(self, input_shape): + self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], + initializer=Ones(), trainable=True) + self.beta = self.add_weight(name='beta', shape=input_shape[-1:], + initializer=Zeros(), trainable=True) + super(LayerNormalization, self).build(input_shape) + + def call(self, inputs): + mean = K.mean(inputs, axis=self.axis, keepdims=True) + variance = K.mean(K.square(inputs - mean), axis=-1, keepdims=True) + std = 
K.sqrt(variance + self.eps) + outputs = (inputs - mean) / std + if self.scale: + outputs *= self.gamma + if self.center: + outputs += self.beta + return outputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'axis': self.axis, 'eps': self.eps, 'center': self.center, 'scale': self.scale} + base_config = super(LayerNormalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/modelzoo/PNN/script/layers/sequence.py b/modelzoo/PNN/script/layers/sequence.py new file mode 100644 index 00000000000..45a65915c22 --- /dev/null +++ b/modelzoo/PNN/script/layers/sequence.py @@ -0,0 +1,901 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import numpy as np +import tensorflow as tf +from tensorflow.python.keras import backend as K + +try: + from tensorflow.python.ops.init_ops import TruncatedNormal, glorot_uniform_initializer as glorot_uniform, \ + identity_initializer as identity +except ImportError: + from tensorflow.python.ops.init_ops_v2 import TruncatedNormal, glorot_uniform, identity + +from tensorflow.python.keras.layers import LSTM, Lambda, Layer, Dropout + +from .core import LocalActivationUnit +from .normalization import LayerNormalization + +if tf.__version__ >= '2.0.0': + from ..contrib.rnn_v2 import dynamic_rnn +else: + from ..contrib.rnn import dynamic_rnn +from ..contrib.utils import QAAttGRUCell, VecAttGRUCell +from .utils import reduce_sum, reduce_max, div, softmax, reduce_mean + + +class SequencePoolingLayer(Layer): + """The SequencePoolingLayer is used to apply pooling operation(sum,mean,max) on variable-length sequence feature/multi-value feature. + + Input shape + - A list of two tensor [seq_value,seq_len] + + - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence. + + Output shape + - 3D tensor with shape: ``(batch_size, 1, embedding_size)``. + + Arguments + - **mode**:str.Pooling operation to be used,can be sum,mean or max. + + - **supports_masking**:If True,the input need to support masking. + """ + + def __init__(self, mode='mean', supports_masking=False, **kwargs): + + if mode not in ['sum', 'mean', 'max']: + raise ValueError("mode must be sum or mean") + self.mode = mode + self.eps = tf.constant(1e-8, tf.float32) + super(SequencePoolingLayer, self).__init__(**kwargs) + + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + self.seq_len_max = int(input_shape[0][1]) + super(SequencePoolingLayer, self).build( + input_shape) # Be sure to call this somewhere! 
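+    # Usage sketch (illustrative, not part of the layer): mean-pool a padded
+    # behavior sequence of max length 5 in which only the first 3 steps of each
+    # row are valid. The tensor names are placeholders.
+    #
+    #     seq = tf.random.normal((32, 5, 8))                 # (batch, T, K)
+    #     seq_len = tf.constant([[3]] * 32, dtype=tf.int32)  # (batch, 1)
+    #     pooled = SequencePoolingLayer(mode='mean')([seq, seq_len])
+    #     # pooled.shape == (32, 1, 8)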
+ + def call(self, seq_value_len_list, mask=None, **kwargs): + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + uiseq_embed_list = seq_value_len_list + mask = tf.cast(mask, tf.float32) # tf.to_float(mask) + user_behavior_length = reduce_sum(mask, axis=-1, keep_dims=True) + mask = tf.expand_dims(mask, axis=2) + else: + uiseq_embed_list, user_behavior_length = seq_value_len_list + + mask = tf.sequence_mask(user_behavior_length, + self.seq_len_max, dtype=tf.float32) + mask = tf.transpose(mask, (0, 2, 1)) + + embedding_size = uiseq_embed_list.shape[-1] + + mask = tf.tile(mask, [1, 1, embedding_size]) + + if self.mode == "max": + hist = uiseq_embed_list - (1 - mask) * 1e9 + return reduce_max(hist, 1, keep_dims=True) + + hist = reduce_sum(uiseq_embed_list * mask, 1, keep_dims=False) + + if self.mode == "mean": + hist = div(hist, tf.cast(user_behavior_length, tf.float32) + self.eps) + + hist = tf.expand_dims(hist, axis=1) + return hist + + def compute_output_shape(self, input_shape): + if self.supports_masking: + return (None, 1, input_shape[-1]) + else: + return (None, 1, input_shape[0][-1]) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + config = {'mode': self.mode, 'supports_masking': self.supports_masking} + base_config = super(SequencePoolingLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class WeightedSequenceLayer(Layer): + """The WeightedSequenceLayer is used to apply weight score on variable-length sequence feature/multi-value feature. + + Input shape + - A list of two tensor [seq_value,seq_len,seq_weight] + + - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - seq_len is a 2D tensor with shape : ``(batch_size, 1)``,indicate valid length of each sequence. + + - seq_weight is a 3D tensor with shape: ``(batch_size, T, 1)`` + + Output shape + - 3D tensor with shape: ``(batch_size, T, embedding_size)``. + + Arguments + - **weight_normalization**: bool.Whether normalize the weight score before applying to sequence. + + - **supports_masking**:If True,the input need to support masking. + """ + + def __init__(self, weight_normalization=True, supports_masking=False, **kwargs): + super(WeightedSequenceLayer, self).__init__(**kwargs) + self.weight_normalization = weight_normalization + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + self.seq_len_max = int(input_shape[0][1]) + super(WeightedSequenceLayer, self).build( + input_shape) # Be sure to call this somewhere! 
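+    # Usage sketch (illustrative): re-weight a padded (batch, T, K) sequence by a
+    # per-step score; with weight_normalization=True the scores are softmax-
+    # normalized over the valid positions before the element-wise multiply.
+    #
+    #     keys = tf.random.normal((32, 5, 8))                # (batch, T, K)
+    #     key_len = tf.constant([[3]] * 32, dtype=tf.int32)  # (batch, 1)
+    #     scores = tf.random.normal((32, 5, 1))              # (batch, T, 1)
+    #     weighted = WeightedSequenceLayer()([keys, key_len, scores])
+    #     # weighted.shape == (32, 5, 8)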
+ + def call(self, input_list, mask=None, **kwargs): + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + key_input, value_input = input_list + mask = tf.expand_dims(mask[0], axis=2) + else: + key_input, key_length_input, value_input = input_list + mask = tf.sequence_mask(key_length_input, + self.seq_len_max, dtype=tf.bool) + mask = tf.transpose(mask, (0, 2, 1)) + + embedding_size = key_input.shape[-1] + + if self.weight_normalization: + paddings = tf.ones_like(value_input) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(value_input) + value_input = tf.where(mask, value_input, paddings) + + if self.weight_normalization: + value_input = softmax(value_input, dim=1) + + if len(value_input.shape) == 2: + value_input = tf.expand_dims(value_input, axis=2) + value_input = tf.tile(value_input, [1, 1, embedding_size]) + + return tf.multiply(key_input, value_input) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + def compute_mask(self, inputs, mask): + if self.supports_masking: + return mask[0] + else: + return None + + def get_config(self, ): + config = {'weight_normalization': self.weight_normalization, 'supports_masking': self.supports_masking} + base_config = super(WeightedSequenceLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class AttentionSequencePoolingLayer(Layer): + """The Attentional sequence pooling operation used in DIN. + + Input shape + - A list of three tensor: [query,keys,keys_length] + + - query is a 3D tensor with shape: ``(batch_size, 1, embedding_size)`` + + - keys is a 3D tensor with shape: ``(batch_size, T, embedding_size)`` + + - keys_length is a 2D tensor with shape: ``(batch_size, 1)`` + + Output shape + - 3D tensor with shape: ``(batch_size, 1, embedding_size)``. + + Arguments + - **att_hidden_units**:list of positive integer, the attention net layer number and units in each layer. + + - **att_activation**: Activation function to use in attention net. + + - **weight_normalization**: bool.Whether normalize the attention score of local activation unit. + + - **supports_masking**:If True,the input need to support masking. + + References + - [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf) + """ + + def __init__(self, att_hidden_units=(80, 40), att_activation='sigmoid', weight_normalization=False, + return_score=False, + supports_masking=False, **kwargs): + + self.att_hidden_units = att_hidden_units + self.att_activation = att_activation + self.weight_normalization = weight_normalization + self.return_score = return_score + super(AttentionSequencePoolingLayer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + if not self.supports_masking: + if not isinstance(input_shape, list) or len(input_shape) != 3: + raise ValueError('A `AttentionSequencePoolingLayer` layer should be called ' + 'on a list of 3 inputs') + + if len(input_shape[0]) != 3 or len(input_shape[1]) != 3 or len(input_shape[2]) != 2: + raise ValueError( + "Unexpected inputs dimensions,the 3 tensor dimensions are %d,%d and %d , expect to be 3,3 and 2" % ( + len(input_shape[0]), len(input_shape[1]), len(input_shape[2]))) + + if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1 or input_shape[2][1] != 1: + raise ValueError('A `AttentionSequencePoolingLayer` layer requires ' + 'inputs of a 3 tensor with shape (None,1,embedding_size),(None,T,embedding_size) and (None,1)' + 'Got different shapes: %s' % (input_shape)) + else: + pass + self.local_att = LocalActivationUnit( + self.att_hidden_units, self.att_activation, l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, ) + super(AttentionSequencePoolingLayer, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, mask=None, training=None, **kwargs): + + if self.supports_masking: + if mask is None: + raise ValueError( + "When supports_masking=True,input must support masking") + queries, keys = inputs + key_masks = tf.expand_dims(mask[-1], axis=1) + + else: + + queries, keys, keys_length = inputs + hist_len = keys.get_shape()[1] + key_masks = tf.sequence_mask(keys_length, hist_len) + + attention_score = self.local_att([queries, keys], training=training) + + outputs = tf.transpose(attention_score, (0, 2, 1)) + + if self.weight_normalization: + paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) + else: + paddings = tf.zeros_like(outputs) + + outputs = tf.where(key_masks, outputs, paddings) + + if self.weight_normalization: + outputs = softmax(outputs) + + if not self.return_score: + outputs = tf.matmul(outputs, keys) + + if tf.__version__ < '1.13.0': + outputs._uses_learning_phase = attention_score._uses_learning_phase + else: + outputs._uses_learning_phase = training is not None + + return outputs + + def compute_output_shape(self, input_shape): + if self.return_score: + return (None, 1, input_shape[1][1]) + else: + return (None, 1, input_shape[0][-1]) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + + config = {'att_hidden_units': self.att_hidden_units, 'att_activation': self.att_activation, + 'weight_normalization': self.weight_normalization, 'return_score': self.return_score, + 'supports_masking': self.supports_masking} + base_config = super(AttentionSequencePoolingLayer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiLSTM(Layer): + """A multiple layer Bidirectional Residual LSTM Layer. + + Input shape + - 3D tensor with shape ``(batch_size, timesteps, input_dim)``. + + Output shape + - 3D tensor with shape: ``(batch_size, timesteps, units)``. + + Arguments + - **units**: Positive integer, dimensionality of the output space. 
+ + - **layers**:Positive integer, number of LSTM layers to stacked. + + - **res_layers**: Positive integer, number of residual connection to used in last ``res_layers``. + + - **dropout_rate**: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs. + + - **merge_mode**: merge_mode: Mode by which outputs of the forward and backward RNNs will be combined. One of { ``'fw'`` , ``'bw'`` , ``'sum'`` , ``'mul'`` , ``'concat'`` , ``'ave'`` , ``None`` }. If None, the outputs will not be combined, they will be returned as a list. + + + """ + + def __init__(self, units, layers=2, res_layers=0, dropout_rate=0.2, merge_mode='ave', **kwargs): + + if merge_mode not in ['fw', 'bw', 'sum', 'mul', 'ave', 'concat', None]: + raise ValueError('Invalid merge mode. ' + 'Merge mode should be one of ' + '{"fw","bw","sum", "mul", "ave", "concat", None}') + + self.units = units + self.layers = layers + self.res_layers = res_layers + self.dropout_rate = dropout_rate + self.merge_mode = merge_mode + + super(BiLSTM, self).__init__(**kwargs) + self.supports_masking = True + + def build(self, input_shape): + + if len(input_shape) != 3: + raise ValueError( + "Unexpected inputs dimensions %d, expect to be 3 dimensions" % (len(input_shape))) + self.fw_lstm = [] + self.bw_lstm = [] + for _ in range(self.layers): + self.fw_lstm.append( + LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True, + unroll=True)) + self.bw_lstm.append( + LSTM(self.units, dropout=self.dropout_rate, bias_initializer='ones', return_sequences=True, + go_backwards=True, unroll=True)) + + super(BiLSTM, self).build( + input_shape) # Be sure to call this somewhere! + + def call(self, inputs, mask=None, **kwargs): + + input_fw = inputs + input_bw = inputs + for i in range(self.layers): + output_fw = self.fw_lstm[i](input_fw) + output_bw = self.bw_lstm[i](input_bw) + output_bw = Lambda(lambda x: K.reverse( + x, 1), mask=lambda inputs, mask: mask)(output_bw) + + if i >= self.layers - self.res_layers: + output_fw += input_fw + output_bw += input_bw + input_fw = output_fw + input_bw = output_bw + + output_fw = input_fw + output_bw = input_bw + + if self.merge_mode == "fw": + output = output_fw + elif self.merge_mode == "bw": + output = output_bw + elif self.merge_mode == 'concat': + output = K.concatenate([output_fw, output_bw]) + elif self.merge_mode == 'sum': + output = output_fw + output_bw + elif self.merge_mode == 'ave': + output = (output_fw + output_bw) / 2 + elif self.merge_mode == 'mul': + output = output_fw * output_bw + elif self.merge_mode is None: + output = [output_fw, output_bw] + + return output + + def compute_output_shape(self, input_shape): + print(self.merge_mode) + if self.merge_mode is None: + return [input_shape, input_shape] + elif self.merge_mode == 'concat': + return input_shape[:-1] + (input_shape[-1] * 2,) + else: + return input_shape + + def compute_mask(self, inputs, mask): + return mask + + def get_config(self, ): + + config = {'units': self.units, 'layers': self.layers, + 'res_layers': self.res_layers, 'dropout_rate': self.dropout_rate, 'merge_mode': self.merge_mode} + base_config = super(BiLSTM, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Transformer(Layer): + """ Simplified version of Transformer proposed in 《Attention is all you need》 + + Input shape + - a list of two 3D tensor with shape ``(batch_size, timesteps, input_dim)`` if ``supports_masking=True`` . 
+ - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if ``supports_masking=False`` . + + + Output shape + - 3D tensor with shape: ``(batch_size, 1, input_dim)`` if ``output_type='mean'`` or ``output_type='sum'`` , else ``(batch_size, timesteps, input_dim)`` . + + + Arguments + - **att_embedding_size**: int.The embedding size in multi-head self-attention network. + - **head_num**: int.The head number in multi-head self-attention network. + - **dropout_rate**: float between 0 and 1. Fraction of the units to drop. + - **use_positional_encoding**: bool. Whether or not use positional_encoding + - **use_res**: bool. Whether or not use standard residual connections before output. + - **use_feed_forward**: bool. Whether or not use pointwise feed foward network. + - **use_layer_norm**: bool. Whether or not use Layer Normalization. + - **blinding**: bool. Whether or not use blinding. + - **seed**: A Python integer to use as random seed. + - **supports_masking**:bool. Whether or not support masking. + - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }. + - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output. + + References + - [Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf) + """ + + def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_positional_encoding=True, use_res=True, + use_feed_forward=True, use_layer_norm=False, blinding=True, seed=1024, supports_masking=False, + attention_type="scaled_dot_product", output_type="mean", **kwargs): + if head_num <= 0: + raise ValueError('head_num must be a int > 0') + self.att_embedding_size = att_embedding_size + self.head_num = head_num + self.num_units = att_embedding_size * head_num + self.use_res = use_res + self.use_feed_forward = use_feed_forward + self.seed = seed + self.use_positional_encoding = use_positional_encoding + self.dropout_rate = dropout_rate + self.use_layer_norm = use_layer_norm + self.blinding = blinding + self.attention_type = attention_type + self.output_type = output_type + super(Transformer, self).__init__(**kwargs) + self.supports_masking = supports_masking + + def build(self, input_shape): + embedding_size = int(input_shape[0][-1]) + if self.num_units != embedding_size: + raise ValueError( + "att_embedding_size * head_num must equal the last dimension size of inputs,got %d * %d != %d" % ( + self.att_embedding_size, self.head_num, embedding_size)) + self.seq_len_max = int(input_shape[0][-2]) + self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed)) + self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 1)) + self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num], + dtype=tf.float32, + initializer=TruncatedNormal(seed=self.seed + 2)) + if self.attention_type == "additive": + self.b = self.add_weight('b', shape=[self.att_embedding_size], dtype=tf.float32, + initializer=glorot_uniform(seed=self.seed)) + self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32, + 
initializer=glorot_uniform(seed=self.seed)) + # if self.use_res: + # self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32, + # initializer=TruncatedNormal(seed=self.seed)) + if self.use_feed_forward: + self.fw1 = self.add_weight('fw1', shape=[self.num_units, 4 * self.num_units], dtype=tf.float32, + initializer=glorot_uniform(seed=self.seed)) + self.fw2 = self.add_weight('fw2', shape=[4 * self.num_units, self.num_units], dtype=tf.float32, + initializer=glorot_uniform(seed=self.seed)) + + self.dropout = Dropout( + self.dropout_rate, seed=self.seed) + self.ln = LayerNormalization() + if self.use_positional_encoding: + self.query_pe = PositionEncoding() + self.key_pe = PositionEncoding() + # Be sure to call this somewhere! + super(Transformer, self).build(input_shape) + + def call(self, inputs, mask=None, training=None, **kwargs): + + if self.supports_masking: + queries, keys = inputs + query_masks, key_masks = mask + query_masks = tf.cast(query_masks, tf.float32) + key_masks = tf.cast(key_masks, tf.float32) + else: + queries, keys, query_masks, key_masks = inputs + + query_masks = tf.sequence_mask( + query_masks, self.seq_len_max, dtype=tf.float32) + key_masks = tf.sequence_mask( + key_masks, self.seq_len_max, dtype=tf.float32) + query_masks = tf.squeeze(query_masks, axis=1) + key_masks = tf.squeeze(key_masks, axis=1) + + if self.use_positional_encoding: + queries = self.query_pe(queries) + keys = self.key_pe(queries) + + querys = tf.tensordot(queries, self.W_Query, + axes=(-1, 0)) # None T_q D*head_num + keys = tf.tensordot(keys, self.W_key, axes=(-1, 0)) + values = tf.tensordot(keys, self.W_Value, axes=(-1, 0)) + + # head_num*None T_q D + querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0) + keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0) + values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0) + + if self.attention_type == "scaled_dot_product": + # head_num*None T_q T_k + outputs = tf.matmul(querys, keys, transpose_b=True) + + outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5) + elif self.attention_type == "additive": + querys_reshaped = tf.expand_dims(querys, axis=-2) + keys_reshaped = tf.expand_dims(keys, axis=-3) + outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b)) + outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1) + else: + raise ValueError("attention_type must be scaled_dot_product or additive") + + key_masks = tf.tile(key_masks, [self.head_num, 1]) + + # (h*N, T_q, T_k) + key_masks = tf.tile(tf.expand_dims(key_masks, 1), + [1, tf.shape(queries)[1], 1]) + + paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) + + # (h*N, T_q, T_k) + + outputs = tf.where(tf.equal(key_masks, 1), outputs, paddings, ) + if self.blinding: + try: + outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[ + :, :, 0] * (-2 ** 32 + 1)) + except AttributeError: + outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[ + :, :, 0] * (-2 ** 32 + 1)) + + outputs -= reduce_max(outputs, axis=-1, keep_dims=True) + outputs = softmax(outputs) + query_masks = tf.tile(query_masks, [self.head_num, 1]) # (h*N, T_q) + # (h*N, T_q, T_k) + query_masks = tf.tile(tf.expand_dims( + query_masks, -1), [1, 1, tf.shape(keys)[1]]) + + outputs *= query_masks + + outputs = self.dropout(outputs, training=training) + # Weighted sum + # ( h*N, T_q, C/h) + result = tf.matmul(outputs, values) + result = tf.concat(tf.split(result, 
self.head_num, axis=0), axis=2) + + if self.use_res: + # tf.tensordot(queries, self.W_Res, axes=(-1, 0)) + result += queries + if self.use_layer_norm: + result = self.ln(result) + + if self.use_feed_forward: + fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0])) + fw1 = self.dropout(fw1, training=training) + fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0]) + if self.use_res: + result += fw2 + if self.use_layer_norm: + result = self.ln(result) + + if self.output_type == "mean": + return reduce_mean(result, axis=1, keep_dims=True) + elif self.output_type == "sum": + return reduce_sum(result, axis=1, keep_dims=True) + else: + return result + + def compute_output_shape(self, input_shape): + + return (None, 1, self.att_embedding_size * self.head_num) + + def compute_mask(self, inputs, mask=None): + return None + + def get_config(self, ): + config = {'att_embedding_size': self.att_embedding_size, 'head_num': self.head_num, + 'dropout_rate': self.dropout_rate, 'use_res': self.use_res, + 'use_positional_encoding': self.use_positional_encoding, 'use_feed_forward': self.use_feed_forward, + 'use_layer_norm': self.use_layer_norm, 'seed': self.seed, 'supports_masking': self.supports_masking, + 'blinding': self.blinding, 'attention_type': self.attention_type, 'output_type': self.output_type} + base_config = super(Transformer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class PositionEncoding(Layer): + def __init__(self, pos_embedding_trainable=True, + zero_pad=False, + scale=True, **kwargs): + self.pos_embedding_trainable = pos_embedding_trainable + self.zero_pad = zero_pad + self.scale = scale + super(PositionEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + _, T, num_units = input_shape.as_list() # inputs.get_shape().as_list() + # First part of the PE function: sin and cos argument + position_enc = np.array([ + [pos / np.power(10000, 2. * (i // 2) / num_units) for i in range(num_units)] + for pos in range(T)]) + + # Second part, apply the cosine to even columns and sin to odds. + position_enc[:, 0::2] = np.sin(position_enc[:, 0::2]) # dim 2i + position_enc[:, 1::2] = np.cos(position_enc[:, 1::2]) # dim 2i+1 + if self.zero_pad: + position_enc[0, :] = np.zeros(num_units) + self.lookup_table = self.add_weight("lookup_table", (T, num_units), + initializer=identity(position_enc), + trainable=self.pos_embedding_trainable) + + # Be sure to call this somewhere! + super(PositionEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + _, T, num_units = inputs.get_shape().as_list() + position_ind = tf.expand_dims(tf.range(T), 0) + outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind) + if self.scale: + outputs = outputs * num_units ** 0.5 + return outputs + inputs + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'pos_embedding_trainable': self.pos_embedding_trainable, 'zero_pad': self.zero_pad, + 'scale': self.scale} + base_config = super(PositionEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BiasEncoding(Layer): + def __init__(self, sess_max_count, seed=1024, **kwargs): + self.sess_max_count = sess_max_count + self.seed = seed + super(BiasEncoding, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. 
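+        # The layer learns three bias terms that are broadcast-added below:
+        # a per-session bias of shape (sess_max_count, 1, 1), a per-position
+        # bias of shape (1, seq_len_max, 1) and a per-dimension bias of shape
+        # (1, 1, embed_size), so every element of every session input receives
+        # the sum of its session, position and dimension offsets.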
+ + if self.sess_max_count == 1: + embed_size = input_shape[2].value + seq_len_max = input_shape[1].value + else: + try: + embed_size = input_shape[0][2].value + seq_len_max = input_shape[0][1].value + except AttributeError: + embed_size = input_shape[0][2] + seq_len_max = input_shape[0][1] + + self.sess_bias_embedding = self.add_weight('sess_bias_embedding', shape=(self.sess_max_count, 1, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.seq_bias_embedding = self.add_weight('seq_bias_embedding', shape=(1, seq_len_max, 1), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + self.item_bias_embedding = self.add_weight('item_bias_embedding', shape=(1, 1, embed_size), + initializer=TruncatedNormal( + mean=0.0, stddev=0.0001, seed=self.seed)) + + # Be sure to call this somewhere! + super(BiasEncoding, self).build(input_shape) + + def call(self, inputs, mask=None): + """ + :param concated_embeds_value: None * field_size * embedding_size + :return: None*1 + """ + transformer_out = [] + for i in range(self.sess_max_count): + transformer_out.append( + inputs[i] + self.item_bias_embedding + self.seq_bias_embedding + self.sess_bias_embedding[i]) + return transformer_out + + def compute_output_shape(self, input_shape): + + return input_shape + + def compute_mask(self, inputs, mask=None): + return mask + + def get_config(self, ): + + config = {'sess_max_count': self.sess_max_count, 'seed': self.seed, } + base_config = super(BiasEncoding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class DynamicGRU(Layer): + def __init__(self, num_units=None, gru_type='GRU', return_sequence=True, **kwargs): + + self.num_units = num_units + self.return_sequence = return_sequence + self.gru_type = gru_type + super(DynamicGRU, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. + input_seq_shape = input_shape[0] + if self.num_units is None: + self.num_units = input_seq_shape.as_list()[-1] + if self.gru_type == "AGRU": + self.gru_cell = QAAttGRUCell(self.num_units) + elif self.gru_type == "AUGRU": + self.gru_cell = VecAttGRUCell(self.num_units) + else: + try: + self.gru_cell = tf.nn.rnn_cell.GRUCell(self.num_units) # GRUCell + except AttributeError: + self.gru_cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.num_units) + + # Be sure to call this somewhere! 
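+        # Usage sketch (illustrative, assuming DIEN-style inputs): a plain GRU
+        # pass extracts interest states and an AUGRU pass evolves them with
+        # attention scores; `seq`, `seq_len` and `att_scores` are placeholders
+        # of shapes (batch, T, K), (batch, 1) and (batch, T, 1).
+        #
+        #     rnn_out = DynamicGRU(8, gru_type="GRU")([seq, seq_len])
+        #     final_h = DynamicGRU(8, gru_type="AUGRU", return_sequence=False)(
+        #         [rnn_out, seq_len, att_scores])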
+
+        super(DynamicGRU, self).build(input_shape)
+
+    def call(self, input_list):
+        """
+        :param input_list: [rnn_input, sequence_length] for GRU/AIGRU, or
+            [rnn_input, sequence_length, att_score] for AGRU/AUGRU
+        :return: the output sequence if `return_sequence` is True, otherwise the
+            final hidden state expanded to shape (batch_size, 1, num_units)
+        """
+        if self.gru_type == "GRU" or self.gru_type == "AIGRU":
+            rnn_input, sequence_length = input_list
+            att_score = None
+        else:
+            rnn_input, sequence_length, att_score = input_list
+
+        rnn_output, hidden_state = dynamic_rnn(self.gru_cell, inputs=rnn_input, att_scores=att_score,
+                                               sequence_length=tf.squeeze(sequence_length),
+                                               dtype=tf.float32, scope=self.name)
+        if self.return_sequence:
+            return rnn_output
+        else:
+            return tf.expand_dims(hidden_state, axis=1)
+
+    def compute_output_shape(self, input_shape):
+        rnn_input_shape = input_shape[0]
+        if self.return_sequence:
+            return rnn_input_shape
+        else:
+            return (None, 1, rnn_input_shape[2])
+
+    def get_config(self, ):
+        config = {'num_units': self.num_units, 'gru_type': self.gru_type, 'return_sequence': self.return_sequence}
+        base_config = super(DynamicGRU, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+
+class KMaxPooling(Layer):
+    """K Max pooling that selects the k largest values along the specified axis.
+
+      Input shape
+        - nD tensor with shape: ``(batch_size, ..., input_dim)``.
+
+      Output shape
+        - nD tensor with shape: ``(batch_size, ..., output_dim)``.
+
+      Arguments
+        - **k**: positive integer, number of top elements to look for along the ``axis`` dimension.
+
+        - **axis**: positive integer, the dimension to look for elements.
+
+     """
+
+    def __init__(self, k=1, axis=-1, **kwargs):
+
+        self.k = k
+        self.axis = axis
+        super(KMaxPooling, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+
+        if self.axis < 1 or self.axis > len(input_shape):
+            raise ValueError("axis must be in [1, %d], now is %d" %
+                             (len(input_shape), self.axis))
+
+        if self.k < 1 or self.k > input_shape[self.axis]:
+            raise ValueError("k must be in [1, %d], now k is %d" %
+                             (input_shape[self.axis], self.k))
+        self.dims = len(input_shape)
+        # Be sure to call this somewhere!
+        super(KMaxPooling, self).build(input_shape)
+
+    def call(self, inputs):
+
+        # swap the last and the axis dimensions since top_k will be applied along the last dimension
+        perm = list(range(self.dims))
+        perm[-1], perm[self.axis] = perm[self.axis], perm[-1]
+        shifted_input = tf.transpose(inputs, perm)
+
+        # extract top_k, returns two tensors [values, indices]
+        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]
+        output = tf.transpose(top_k, perm)
+
+        return output
+
+    def compute_output_shape(self, input_shape):
+        output_shape = list(input_shape)
+        output_shape[self.axis] = self.k
+        return tuple(output_shape)
+
+    def get_config(self, ):
+        config = {'k': self.k, 'axis': self.axis}
+        base_config = super(KMaxPooling, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+# def positional_encoding(inputs,
+#                         pos_embedding_trainable=True,
+#                         zero_pad=False,
+#                         scale=True,
+#                         ):
+#     '''Sinusoidal Positional_Encoding.
+#
+#     Args:
+#
+#       - inputs: A 2d Tensor with shape of (N, T).
+#       - num_units: Output dimensionality
+#       - zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
+#       - scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper)
+#       - scope: Optional scope for `variable_scope`.
+#       - reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
+#
+#     Returns:
+#
+#       - A 'Tensor' with one more rank than the input's, whose last dimension is 'num_units'
+#     '''
+#
+#     _, T, num_units = inputs.get_shape().as_list()
+#     # with tf.variable_scope(scope, reuse=reuse):
+#     position_ind = tf.expand_dims(tf.range(T), 0)
+#     # First part of the PE function: sin and cos argument
+#     position_enc = np.array([
+#         [pos / np.power(10000, 2. * i / num_units)
+#          for i in range(num_units)]
+#         for pos in range(T)])
+#
+#     # Second part, apply sin to the even columns and cos to the odd ones.
+#     position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
+#     position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
+#
+#     # Convert to a tensor
+#
+#     if pos_embedding_trainable:
+#         lookup_table = K.variable(position_enc, dtype=tf.float32)
+#
+#     if zero_pad:
+#         lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
+#                                   lookup_table[1:, :]), 0)
+#
+#     outputs = tf.nn.embedding_lookup(lookup_table, position_ind)
+#
+#     if scale:
+#         outputs = outputs * num_units ** 0.5
+#     return outputs + inputs
diff --git a/modelzoo/PNN/script/layers/utils.py b/modelzoo/PNN/script/layers/utils.py
new file mode 100644
index 00000000000..2be8f3fe5ef
--- /dev/null
+++ b/modelzoo/PNN/script/layers/utils.py
@@ -0,0 +1,302 @@
+# -*- coding:utf-8 -*-
+"""
+
+Author:
+    Weichen Shen,weichenswc@163.com
+
+"""
+import tensorflow as tf
+from tensorflow.python.keras.layers import Flatten, Concatenate, Layer, Add
+from tensorflow.python.ops.lookup_ops import TextFileInitializer
+
+try:
+    from tensorflow.python.ops.init_ops import Zeros, glorot_normal_initializer as glorot_normal
+except ImportError:
+    from tensorflow.python.ops.init_ops_v2 import Zeros, glorot_normal
+
+from tensorflow.python.keras.regularizers import l2
+
+try:
+    from tensorflow.python.ops.lookup_ops import StaticHashTable
+except ImportError:
+    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable
+
+
+class NoMask(Layer):
+    def __init__(self, **kwargs):
+        super(NoMask, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # Be sure to call this somewhere!
+        super(NoMask, self).build(input_shape)
+
+    def call(self, x, mask=None, **kwargs):
+        return x
+
+    def compute_mask(self, inputs, mask):
+        return None
+
+
+class Hash(Layer):
+    """Looks up keys in a table when `vocabulary_path` is set, outputting the corresponding values.
+    If `vocabulary_path` is not set, `Hash` hashes the input to [0,num_buckets). When `mask_zero` = True,
+    input value `0` or `0.0` will be set to `0`, and other values will be set in range [1,num_buckets).
+
+    The following snippet initializes a `Hash` with a `vocabulary_path` file that has the first column as
+    values and the second column as keys:
+
+    * `1,emerson`
+    * `2,lake`
+    * `3,palmer`
+
+    >>> hash = Hash(
+    ...   num_buckets=3+1,
+    ...   vocabulary_path=filename,
+    ...   default_value=0)
+    >>> hash(tf.constant('lake')).numpy()
+    2
+    >>> hash(tf.constant('lakeemerson')).numpy()
+    0
+
+    Args:
+        num_buckets: An `int` that is >= 1. The number of buckets, or the vocabulary size + 1
+            when `vocabulary_path` is set.
+        mask_zero: default is False. The `Hash` value will hash input `0` or `0.0` to value `0` when
+            `mask_zero` is `True`. `mask_zero` is not used when `vocabulary_path` is set.
+        vocabulary_path: default `None`. The `CSV` text file path of the vocabulary hash, which contains
+            two columns separated by the delimiter `comma`, where the first column is the value and the second is
+            the key. The key data type is `string`, the value data type is `int`.
The path must + be accessible from wherever `Hash` is initialized. + default_value: default '0'. The default value if a key is missing in the table. + **kwargs: Additional keyword arguments. + """ + + def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs): + self.num_buckets = num_buckets + self.mask_zero = mask_zero + self.vocabulary_path = vocabulary_path + self.default_value = default_value + if self.vocabulary_path: + initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',') + self.hash_table = StaticHashTable(initializer, default_value=self.default_value) + super(Hash, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! + super(Hash, self).build(input_shape) + + def call(self, x, mask=None, **kwargs): + + if x.dtype != tf.string: + zero = tf.as_string(tf.zeros([1], dtype=x.dtype)) + x = tf.as_string(x, ) + else: + zero = tf.as_string(tf.zeros([1], dtype='int32')) + + if self.vocabulary_path: + hash_x = self.hash_table.lookup(x) + return hash_x + + num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1 + try: + hash_x = tf.string_to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + except AttributeError: + hash_x = tf.strings.to_hash_bucket_fast(x, num_buckets, + name=None) # weak hash + if self.mask_zero: + mask = tf.cast(tf.not_equal(x, zero), dtype='int64') + hash_x = (hash_x + 1) * mask + + return hash_x + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self, ): + config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, + 'default_value': self.default_value} + base_config = super(Hash, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class Linear(Layer): + + def __init__(self, l2_reg=0.0, mode=0, use_bias=False, seed=1024, **kwargs): + + self.l2_reg = l2_reg + # self.l2_reg = tf.contrib.layers.l2_regularizer(float(l2_reg_linear)) + if mode not in [0, 1, 2]: + raise ValueError("mode must be 0,1 or 2") + self.mode = mode + self.use_bias = use_bias + self.seed = seed + super(Linear, self).__init__(**kwargs) + + def build(self, input_shape): + if self.use_bias: + self.bias = self.add_weight(name='linear_bias', + shape=(1,), + initializer=Zeros(), + trainable=True) + if self.mode == 1: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + elif self.mode == 2: + self.kernel = self.add_weight( + 'linear_kernel', + shape=[int(input_shape[1][-1]), 1], + initializer=glorot_normal(self.seed), + regularizer=l2(self.l2_reg), + trainable=True) + + super(Linear, self).build(input_shape) # Be sure to call this somewhere! 
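+        # Mode recap: mode 0 takes sparse embeddings only (logit = sum over the
+        # last axis); mode 1 takes dense values only (logit = dense . kernel);
+        # mode 2 takes a [sparse, dense] pair and adds the two partial logits.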
+ + def call(self, inputs, **kwargs): + if self.mode == 0: + sparse_input = inputs + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=True) + elif self.mode == 1: + dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = fc + else: + sparse_input, dense_input = inputs + fc = tf.tensordot(dense_input, self.kernel, axes=(-1, 0)) + linear_logit = reduce_sum(sparse_input, axis=-1, keep_dims=False) + fc + if self.use_bias: + linear_logit += self.bias + + return linear_logit + + def compute_output_shape(self, input_shape): + return (None, 1) + + def compute_mask(self, inputs, mask): + return None + + def get_config(self, ): + config = {'mode': self.mode, 'l2_reg': self.l2_reg, 'use_bias': self.use_bias, 'seed': self.seed} + base_config = super(Linear, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def concat_func(inputs, axis=-1, mask=False): + if not mask: + inputs = list(map(NoMask(), inputs)) + if len(inputs) == 1: + return inputs[0] + else: + return Concatenate(axis=axis)(inputs) + + +def reduce_mean(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_mean(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_mean(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_sum(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_sum(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_sum(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def reduce_max(input_tensor, + axis=None, + keep_dims=False, + name=None, + reduction_indices=None): + try: + return tf.reduce_max(input_tensor, + axis=axis, + keep_dims=keep_dims, + name=name, + reduction_indices=reduction_indices) + except TypeError: + return tf.reduce_max(input_tensor, + axis=axis, + keepdims=keep_dims, + name=name) + + +def div(x, y, name=None): + try: + return tf.div(x, y, name=name) + except AttributeError: + return tf.divide(x, y, name=name) + + +def softmax(logits, dim=-1, name=None): + try: + return tf.nn.softmax(logits, dim=dim, name=name) + except TypeError: + return tf.nn.softmax(logits, axis=dim, name=name) + + +class _Add(Layer): + def __init__(self, **kwargs): + super(_Add, self).__init__(**kwargs) + + def build(self, input_shape): + # Be sure to call this somewhere! 
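+        # _Add wraps the Keras Add layer so that add_func below can sum a list
+        # of logit tensors as a layer and still return a constant zero logit
+        # when the list is empty.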
+
+        super(_Add, self).build(input_shape)
+
+    def call(self, inputs, **kwargs):
+        # if not isinstance(inputs, list):
+        #     return inputs
+        # if len(inputs) == 1:
+        #     return inputs[0]
+        if len(inputs) == 0:
+            return tf.constant([[0.0]])
+
+        return Add()(inputs)
+
+
+def add_func(inputs):
+    if not isinstance(inputs, list):
+        return inputs
+    if len(inputs) == 1:
+        return inputs[0]
+    return _Add()(inputs)
+
+
+def combined_dnn_input(sparse_embedding_list, dense_value_list):
+    if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
+        sparse_dnn_input = Flatten()(concat_func(sparse_embedding_list))
+        dense_dnn_input = Flatten()(concat_func(dense_value_list))
+        return concat_func([sparse_dnn_input, dense_dnn_input])
+    elif len(sparse_embedding_list) > 0:
+        return Flatten()(concat_func(sparse_embedding_list))
+    elif len(dense_value_list) > 0:
+        return Flatten()(concat_func(dense_value_list))
+    else:
+        raise NotImplementedError("dnn_feature_columns cannot be an empty list")
diff --git a/modelzoo/PNN/script/models/__init__.py b/modelzoo/PNN/script/models/__init__.py
new file mode 100644
index 00000000000..6c2b9cd07f5
--- /dev/null
+++ b/modelzoo/PNN/script/models/__init__.py
@@ -0,0 +1,3 @@
+from .pnn import PNN
+
+__all__ = ["PNN"]
diff --git a/modelzoo/PNN/script/models/pnn.py b/modelzoo/PNN/script/models/pnn.py
new file mode 100644
index 00000000000..6a75271ca81
--- /dev/null
+++ b/modelzoo/PNN/script/models/pnn.py
@@ -0,0 +1,72 @@
+# -*- coding:utf-8 -*-
+"""
+Author:
+    Weichen Shen, weichenswc@163.com
+
+Reference:
+    [1] Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.(https://arxiv.org/pdf/1611.00144.pdf)
+"""
+
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.layers import Dense, Reshape, Flatten
+
+from ..feature_column import build_input_features, input_from_feature_columns
+from ..layers.core import PredictionLayer, DNN
+from ..layers.interaction import InnerProductLayer, OutterProductLayer
+from ..layers.utils import concat_func, combined_dnn_input
+
+
+def PNN(dnn_feature_columns, dnn_hidden_units=(256, 128, 64), l2_reg_embedding=0.00001, l2_reg_dnn=0,
+        seed=1024, dnn_dropout=0, dnn_activation='relu', use_inner=True, use_outter=False, kernel_type='mat',
+        task='binary'):
+    """Instantiates the Product-based Neural Network architecture.
+
+    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
+    :param dnn_hidden_units: list, list of positive integers or empty list, the layer number and units in each layer of the deep net
+    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vectors
+    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
+    :param seed: integer, to use as random seed.
+    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+    :param dnn_activation: Activation function to use in DNN
+    :param use_inner: bool, whether to use the inner product or not.
+    :param use_outter: bool, whether to use the outer product or not.
+    :param kernel_type: str, kernel type used in the outer product, can be ``'mat'`` , ``'vec'`` or ``'num'``
+    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+    :return: A Keras model instance.
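+
+    A minimal usage sketch (the feature names and sizes here are purely illustrative):
+
+    >>> columns = [SparseFeat('uid', vocabulary_size=100, embedding_dim=8),
+    ...            SparseFeat('item', vocabulary_size=200, embedding_dim=8)]
+    >>> model = PNN(columns, dnn_hidden_units=(64, 32))
+    >>> model.compile('adam', 'binary_crossentropy')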
+ """ + + if kernel_type not in ['mat', 'vec', 'num']: + raise ValueError("kernel_type must be mat,vec or num") + + features = build_input_features(dnn_feature_columns) + + inputs_list = list(features.values()) + + sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns, + l2_reg_embedding, seed) + inner_product = Flatten()( + InnerProductLayer()(sparse_embedding_list)) + outter_product = OutterProductLayer(kernel_type)(sparse_embedding_list) + + # ipnn deep input + linear_signal = Reshape( + [sum(map(lambda x: int(x.shape[-1]), sparse_embedding_list))])(concat_func(sparse_embedding_list)) + + if use_inner and use_outter: + deep_input = concat_func([linear_signal, inner_product, outter_product]) + elif use_inner: + deep_input = concat_func([linear_signal, inner_product]) + elif use_outter: + deep_input = concat_func([linear_signal, outter_product]) + else: + deep_input = linear_signal + + dnn_input = combined_dnn_input([deep_input], dense_value_list) + dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, False, seed=seed)(dnn_input) + dnn_logit = Dense(1, use_bias=False)(dnn_out) + + output = PredictionLayer(task)(dnn_logit) + + model = Model(inputs=inputs_list, + outputs=output) + return model diff --git a/modelzoo/PNN/script/utils.py b/modelzoo/PNN/script/utils.py new file mode 100644 index 00000000000..7fe3b25a518 --- /dev/null +++ b/modelzoo/PNN/script/utils.py @@ -0,0 +1,46 @@ +# -*- coding:utf-8 -*- +""" + +Author: + Weichen Shen,weichenswc@163.com + +""" + +import json +import logging +from threading import Thread + +import requests + +try: + from packaging.version import parse +except ImportError: + from pip._vendor.packaging.version import parse + + +def check_version(version): + """Return version of package on pypi.python.org using json.""" + + def check(version): + try: + url_pattern = 'https://pypi.python.org/pypi/deepctr/json' + req = requests.get(url_pattern) + latest_version = parse('0') + version = parse(version) + if req.status_code == requests.codes.ok: + j = json.loads(req.text.encode('utf-8')) + releases = j.get('releases', []) + for release in releases: + ver = parse(release) + if ver.is_prerelease or ver.is_postrelease: + continue + latest_version = max(latest_version, ver) + if latest_version > version: + logging.warning( + '\nDeepCTR version {0} detected. 
Your version is {1}.\nUse `pip install -U deepctr` to upgrade. Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format(
+                            latest_version, version))
+            except Exception:
+                print("Please check the latest version manually on https://pypi.org/project/deepctr/#history")
+                return
+
+    Thread(target=check, args=(version,)).start()
diff --git a/modelzoo/PNN/train.py b/modelzoo/PNN/train.py
new file mode 100644
index 00000000000..55eef980f30
--- /dev/null
+++ b/modelzoo/PNN/train.py
@@ -0,0 +1,259 @@
+import os
+import sys
+import argparse
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+from sklearn.metrics import log_loss, roc_auc_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler, MultiLabelBinarizer
+from tensorflow.keras.optimizers import Adam, SGD
+from tensorflow.keras.losses import binary_crossentropy
+from scipy.sparse import coo_matrix
+from script.models.pnn import PNN
+from script.feature_column import SparseFeat, DenseFeat, get_feature_names, VarLenSparseFeat
+
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+UNSEQ_COLUMNS = ['UID', 'ITEM', 'CATEGORY']
+LABEL_COLUMN = ['CLICKED']
+TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS
+
+EMBEDDING_DIM = 8
+
+def split(x):
+    # key2index is a module-level vocabulary dict populated on the fly
+    key_ans = x.split(',')
+    for key in key_ans:
+        if key not in key2index:
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda k: key2index[k], key_ans))
+
+
+# Bin continuous features into equal-frequency (quantile) buckets
+def BinMap(data, acc):
+    if acc >= 1 or acc <= 0:
+        raise ValueError('acc must be greater than 0 and less than 1')
+    max = data.max()
+    min = data.min()
+    rangelist = [i + 1 for i in range(int(1 / acc))]
+    length = len(data) - 1
+    data1 = data.sort_index()
+    bin_res = np.array([0] * data.shape[-1], dtype=int)
+    for r in rangelist:
+        if r == 1:
+            lower = min
+        else:
+            lower = data1[int(length * ((r - 1) * acc))]
+        rank = r * acc
+        i = int(length * rank)
+        if r == rangelist[-1]:
+            # the last bucket is closed on the right at the maximum value
+            mask = data.loc[(data >= lower) & (data <= max)].index
+        else:
+            mask = data.loc[(data >= lower) & (data