diff --git a/recstudio/data/__init__.py b/recstudio/data/__init__.py index f98378c..28eefdc 100644 --- a/recstudio/data/__init__.py +++ b/recstudio/data/__init__.py @@ -1,4 +1,4 @@ -from recstudio.data.dataset import TripletDataset, SeqDataset, UserDataset, FullSeqDataset +from recstudio.data.dataset import TripletDataset, SeqDataset, UserDataset, FullSeqDataset, DICEDataset, UBPRDataset from recstudio.data.advance_dataset import ALSDataset import os diff --git a/recstudio/data/config/all.yaml b/recstudio/data/config/all.yaml index 92a1a66..32cd255 100644 --- a/recstudio/data/config/all.yaml +++ b/recstudio/data/config/all.yaml @@ -49,4 +49,9 @@ mapped_feat_field: [*u, *i] network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] network_feat_header: [0, 0] +# interactions that are missing completely at random +mcar_feat_name: ~ +inter_feat_field: [*u, *i, *r, *t] +inter_feat_header: ~ + save_cache: False # whether to save processed dataset to cache. 
diff --git a/recstudio/data/config/amazon-beauty.yaml b/recstudio/data/config/amazon-beauty.yaml index 68a9346..5b8932c 100644 --- a/recstudio/data/config/amazon-beauty.yaml +++ b/recstudio/data/config/amazon-beauty.yaml @@ -22,7 +22,7 @@ field_separator: "," min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: 3.0 +low_rating_thres: 3.0 max_seq_len: 50 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/amazon-books.yaml b/recstudio/data/config/amazon-books.yaml index f6ce204..15b4bee 100644 --- a/recstudio/data/config/amazon-books.yaml +++ b/recstudio/data/config/amazon-books.yaml @@ -23,7 +23,7 @@ field_separator: "," min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: ~ +low_rating_thres: ~ max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/amazon-electronics.yaml b/recstudio/data/config/amazon-electronics.yaml index 57ba128..6453c4a 100644 --- a/recstudio/data/config/amazon-electronics.yaml +++ b/recstudio/data/config/amazon-electronics.yaml @@ -23,7 +23,7 @@ field_separator: "," min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: 3 +low_rating_thres: 3 max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/coat.yaml b/recstudio/data/config/coat.yaml new file mode 100644 index 0000000..420c87f --- /dev/null +++ b/recstudio/data/config/coat.yaml @@ -0,0 +1,47 @@ +url: https://rec.ustc.edu.cn/share/b097d230-c7e1-11ed-997a-bba57c4b600a +user_id_field: &u user_id:token # TODO: comments for &u and *u +item_id_field: &i item_id:token +rating_field: &r rating:float +time_field: ~ +time_format: ~ + + +inter_feat_name: coat.mnar +inter_feat_field: 
[*u, *i, *r] +inter_feat_header: 0 + +user_feat_name: [coat.user] +user_feat_field: [[*u, ugender(men):token, ugender(women):token, age(20-30):token, age(30-40):token, age(40-50):token, age(50-60):token, age(over 60):token, age(under 20):token, location(rural):token, location(suburban):token, location(urban):token, fashioninterest(moderately):token, fashioninterest(not at all):token, fashioninterest(very):token]] +user_feat_header: 0 + + +item_feat_name: [coat.item] +item_feat_field: [[*i, igender(men):token, igender(women):token, jackettype(bomber):token, jackettype(cropped):token, jackettype(field):token, jackettype(fleece):token, jackettype(insulated):token, jackettype(motorcycle):token, jackettype(other):token, jackettype(packable):token, jackettype(parkas):token, jackettype(pea):token, jackettype(rain):token, jackettype(shells):token, jackettype(track):token, jackettype(trench):token, jackettype(vests):token, jackettype(waterproof):token, color(beige):token, color(black):token, color(blue):token, color(brown):token, color(gray):token, color(green):token, color(multi):token, color(navy):token, color(olive):token, color(other):token, color(pink):token, color(purple):token, color(red):token, onfrontpage(yes):token, onfrontpage(no):token]] +item_feat_header: 0 + + +field_separator: "\t" +min_user_inter: 5 +min_item_inter: 5 +field_max_len: ~ +low_rating_thres: 3.0 +max_seq_len: 20 + +# network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features +network_feat_name: ~ +mapped_feat_field: ~ +network_feat_field: ~ +network_feat_header: ~ + +# interactions that are missing completely at random +mcar_feat_name: coat.mcar +mcar_feat_field: [*u, *i, *r] +mcar_feat_header: 0 + +# propensities of each (u, i) pair +# propensity_feat_name: coat.propensities +# propensity_feat_field: [*u, *i, propensity:float] +# propensity_feat_header: 0 + + +save_cache: True # whether to save processed dataset to cache. 
diff --git a/recstudio/data/config/gowalla.yaml b/recstudio/data/config/gowalla.yaml index b44d741..a2e01f9 100644 --- a/recstudio/data/config/gowalla.yaml +++ b/recstudio/data/config/gowalla.yaml @@ -24,7 +24,7 @@ seq_separator: " " min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: ~ +low_rating_thres: ~ max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/ml-100k.yaml b/recstudio/data/config/ml-100k.yaml index c705ef3..6ca1038 100644 --- a/recstudio/data/config/ml-100k.yaml +++ b/recstudio/data/config/ml-100k.yaml @@ -21,8 +21,8 @@ item_feat_header: 0 field_separator: "\t" -min_user_inter: 0 -min_item_inter: 0 +min_user_inter: 5 +min_item_inter: 5 field_max_len: ~ low_rating_thres: 3.0 max_seq_len: 20 diff --git a/recstudio/data/config/ml-10m.yaml b/recstudio/data/config/ml-10m.yaml index 0a03701..aa5c184 100644 --- a/recstudio/data/config/ml-10m.yaml +++ b/recstudio/data/config/ml-10m.yaml @@ -26,7 +26,7 @@ field_separator: "::" min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: 3.0 +low_rating_thres: 3.0 max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/ml-20m.yaml b/recstudio/data/config/ml-20m.yaml index 880a480..cac833c 100644 --- a/recstudio/data/config/ml-20m.yaml +++ b/recstudio/data/config/ml-20m.yaml @@ -26,7 +26,7 @@ field_separator: "," min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: ~ +low_rating_thres: ~ max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/tmall.yaml b/recstudio/data/config/tmall.yaml index d03dca9..5930dfd 100644 --- a/recstudio/data/config/tmall.yaml +++ 
b/recstudio/data/config/tmall.yaml @@ -26,7 +26,7 @@ field_separator: "::" min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: ~ +low_rating_thres: ~ max_seq_len: 50 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/config/yahoor3.yaml b/recstudio/data/config/yahoor3.yaml new file mode 100644 index 0000000..4457fa2 --- /dev/null +++ b/recstudio/data/config/yahoor3.yaml @@ -0,0 +1,42 @@ +url: https://rec.ustc.edu.cn/share/e8e98ae0-c700-11ed-b3cf-cb390cf64bc2 +user_id_field: &u user_id:token # TODO: comments for &u and *u +item_id_field: &i item_id:token +rating_field: &r rating:float +time_field: ~ +time_format: ~ + + +inter_feat_name: yahooR3.inter +inter_feat_field: [*u, *i, *r] +inter_feat_header: 0 + +user_feat_name: ~ +user_feat_field: ~ +user_feat_header: ~ + + +item_feat_name: ~ +item_feat_field: ~ +item_feat_header: ~ + + +field_separator: "," +min_user_inter: 5 +min_item_inter: 5 +field_max_len: ~ +low_rating_thres: 3.0 +max_seq_len: 20 + +# network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features +network_feat_name: ~ +mapped_feat_field: ~ +network_feat_field: ~ +network_feat_header: ~ + +# interactions that are missing completely at random +mcar_feat_name: yahooR3.mcar +mcar_feat_field: [*u, *i, *r] +mcar_feat_header: 0 + + +save_cache: True # whether to save processed dataset to cache. 
diff --git a/recstudio/data/config/yelp.yaml b/recstudio/data/config/yelp.yaml index 3a672ef..e17c970 100644 --- a/recstudio/data/config/yelp.yaml +++ b/recstudio/data/config/yelp.yaml @@ -27,7 +27,7 @@ field_separator: "," min_user_inter: 5 min_item_inter: 5 field_max_len: ~ -low_rating_threshold: 3 +low_rating_thres: 3 max_seq_len: 20 # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features diff --git a/recstudio/data/dataset.py b/recstudio/data/dataset.py index 397d810..195a210 100644 --- a/recstudio/data/dataset.py +++ b/recstudio/data/dataset.py @@ -102,7 +102,10 @@ def _init_common_field(self): self.field2maxlen = self.config['field_max_len'] or {} self.fuid = self.config['user_id_field'].split(':')[0] self.fiid = self.config['item_id_field'].split(':')[0] - self.ftime = self.config['time_field'].split(':')[0] + if self.config['time_field'] is not None: + self.ftime = self.config['time_field'].split(':')[0] + else: + self.ftime = None if self.config['rating_field'] is not None: self.frating = self.config['rating_field'].split(':')[0] else: @@ -160,8 +163,7 @@ def _filter_ratings(self, thres: float=None): def _load_all_data(self, data_dir, field_sep): r"""Load features for user, item, interaction and network.""" # load interaction features - inter_feat_path = os.path.join( - data_dir, self.config['inter_feat_name']) + inter_feat_path = os.path.join(data_dir, self.config['inter_feat_name']) self.inter_feat = self._load_feat( inter_feat_path, self.config['inter_feat_header'], field_sep, self.config['inter_feat_field']) self.inter_feat = self.inter_feat.dropna(how="any") @@ -186,9 +188,9 @@ def _load_all_data(self, data_dir, field_sep): self.user_feat.reset_index(inplace=True) self._fill_nan(self.user_feat) + # load item features self.item_feat = None if self.config['item_feat_name'] is not None: - # load item features item_feat = [] for _, item_feat_col in zip(self.config['item_feat_name'], 
self.config['item_feat_field']): item_feat_path = os.path.join(data_dir, _) @@ -225,6 +227,14 @@ def _load_all_data(self, data_dir, field_sep): net_name, net_field = name[0], fields[0] self.network_feat[i] = self._load_feat( os.path.join(data_dir, net_name), self.config['network_feat_header'][i][0], field_sep, net_field) + + # load missing-completely-at-random interaction features + if self.config['mcar_feat_name'] is not None: + mcar_feat_path = os.path.join(data_dir, self.config['mcar_feat_name']) + self.mcar_feat = self._load_feat( + mcar_feat_path, self.config['mcar_feat_header'], field_sep, self.config['mcar_feat_field']) + self.mcar_feat = self.mcar_feat.dropna(how="any") + def _fill_nan(self, feat, mapped=False): r"""Fill the missing data in the original data. @@ -309,6 +319,11 @@ def _get_feat_list(self): if self.config['network_feat_name'] is not None: feat_list.extend(self.network_feat) # return list(feat for feat in feat_list if feat is not None) + + if hasattr(self, 'mcar_feat'): + feat_list.append(self.mcar_feat) + if hasattr(self, 'mcar_feat_for_train'): + feat_list.append(self.mcar_feat_for_train) return feat_list def _map_all_ids(self): @@ -617,11 +632,14 @@ def _split_by_ratio(self, ratio, data_count, user_mode): m = len(data_count) if not user_mode: splits = np.outer(data_count, ratio).astype(np.int32) - splits[:, 0] = data_count - splits[:, 1:].sum(axis=1) - for i in range(1, len(ratio)): - idx = (splits[:, -i] == 0) & (splits[:, 0] > 1) - splits[idx, -i] += 1 - splits[idx, 0] -= 1 + major = np.argmax(ratio) + minors = [i for i, _ in enumerate(ratio) if i != major] + splits[:, major] = data_count - splits[:, minors].sum(axis=1) + for i in minors: + if ratio[i] != 0: + idx = (splits[:, i] == 0) & (splits[:, major] > 1) + splits[idx, i] += 1 + splits[idx, major] -= 1 else: idx = np.random.permutation(m) sp_ = (m * np.array(ratio)).astype(np.int32) @@ -687,6 +705,37 @@ def get_splits(bool_index): splits = cumsum.reshape(-1, 1) + splits return 
    def _inverse_popularity_mcar_sampling(self, mcar_ratio_or_num):
        """
        Sample missing-completely-at-random data by inverse popularity.

        Carves a pseudo-MCAR subset out of ``self.inter_feat``: for every user,
        a per-user quota of interactions is drawn with probability inversely
        proportional to item popularity, moved into ``self.mcar_feat`` and
        removed from ``self.inter_feat`` (both re-indexed afterwards).

        Args:
            mcar_ratio_or_num: ``float`` — fraction of each user's interactions
                to sample (split by ratio); ``int`` — fixed number per user
                (leave-one-out style split).
        """
        # interactions per user, in original (unsorted) user order
        user_count = self.inter_feat[self.fuid].groupby(
            self.inter_feat[self.fuid], sort=False).count()
        if isinstance(mcar_ratio_or_num, int):
            splits, _ = self._split_by_leave_one_out(mcar_ratio_or_num, user_count, True)
        elif isinstance(mcar_ratio_or_num, float):
            splits, _ = self._split_by_ratio(
                [1 - mcar_ratio_or_num, mcar_ratio_or_num], user_count, False)
        # `splits` holds cumulative offsets, so the size of the last (MCAR)
        # part is last-column minus the preceding boundary.
        mcar_count = splits[:, -1] - splits[:, 1]

        # per-item interaction frequency; index 0 is the padding item, so its
        # sampling weight is forced to 0 via the prepended zero.
        item_freq = torch.from_numpy(self.inter_feat[self.fiid].groupby(
            self.inter_feat[self.fiid]).count().to_numpy())
        weight = torch.hstack([torch.tensor(0.), 1 / item_freq])
        all_mcar_idx = []
        for u, sub_inter_feat in list(self.inter_feat.groupby(self.inter_feat[self.fuid])):
            iid_idx = sub_inter_feat[self.fiid].index
            iid = sub_inter_feat[self.fiid].values
            # draw this user's quota without replacement, weighted by 1/freq
            # NOTE(review): `mcar_count[u - 1]` assumes user ids are contiguous
            # and 1-based after remapping — confirm against _map_all_ids.
            mcar_idx = iid_idx[torch.multinomial(weight[iid], mcar_count[u - 1])]
            if isinstance(mcar_idx, np.int64):
                # a single draw can collapse to a scalar; normalize to array
                mcar_idx = np.array([mcar_idx])
            all_mcar_idx.append(mcar_idx)
        all_mcar_idx = np.concatenate(all_mcar_idx)
        # move the sampled rows out of inter_feat into mcar_feat
        self.mcar_feat = self.inter_feat.loc[all_mcar_idx].copy()
        self.inter_feat.drop(index=all_mcar_idx, inplace=True)
        self.mcar_feat.reset_index(drop=True, inplace=True)
        self.inter_feat.reset_index(drop=True, inplace=True)
""" @@ -753,7 +802,7 @@ def _get_neg_data(self, data: Dict): user_hist = self.user_hist[data[self.fuid]][:, 0:user_count] else: user_hist = data['user_hist'] - neg_id = uniform_sampling(data[self.frating].size(0), self.num_items, + neg_id = uniform_neg_sampling(data[self.frating].size(0), self.num_items, self.neg_count, user_hist).long() # [B, neg] neg_id = neg_id.transpose(0,1).contiguous().view(-1) # [neg*B] neg_item_feat = self.item_feat[neg_id] @@ -778,7 +827,7 @@ def __getitem__(self, index): dict: A dict contains different feature. """ data = self._get_pos_data(index) - if self.eval_mode and 'user_hist' not in data: + if (self.eval_mode and 'user_hist' not in data): user_count = self.user_count[data[self.fuid]].max() data['user_hist'] = self.user_hist[data[self.fuid]][:, 0:user_count] else: @@ -825,6 +874,7 @@ def build( shuffle: bool = True, split_mode: str = 'user_entry', split_ratio: List = [0.8, 0.1, 0.1], + mcar_sampling_ratio: float = 0, **kwargs ): """Build dataset. @@ -844,10 +894,11 @@ def build( """ self.fmeval = fmeval self.split_mode = split_mode + self.mcar_sampling_ratio = mcar_sampling_ratio self._init_sampler(sampler, neg_count) - return self._build(split_ratio, shuffle, split_mode, False, binarized_rating_thres) + return self._build(split_ratio, shuffle, split_mode, False, binarized_rating_thres, mcar_sampling_ratio) - def _build(self, ratio_or_num, shuffle, split_mode, rep, binarized_rating_thres=None): + def _build(self, ratio_or_num, shuffle, split_mode, rep, binarized_rating_thres=None, mcar_sampling_ratio=0): # for general recommendation, only support non-repetive recommendation # keep first data, sorted by time or not, split by user or not if binarized_rating_thres is not None: @@ -858,6 +909,10 @@ def _build(self, ratio_or_num, shuffle, split_mode, rep, binarized_rating_thres= if self.drop_dup and (not rep): # drop duplicated interactions self.inter_feat = self.inter_feat[self.first_item_idx] + # in need of mcar_feat but not given in 
data + if not hasattr(self, 'mcar_feat') and mcar_sampling_ratio: + self._inverse_popularity_mcar_sampling(mcar_sampling_ratio) + if (split_mode == 'user_entry') or (split_mode == 'user'): if self.ftime in self.inter_feat: self.inter_feat.sort_values(by=[self.fuid, self.ftime], inplace=True) @@ -915,23 +970,49 @@ def _build(self, ratio_or_num, shuffle, split_mode, rep, binarized_rating_thres= splits = (splits, uids.view(-1, 1)) else: splits = (splits.numpy(), uids) - - + + mcar_split_ratio = self.config['mcar_split_ratio'] + if hasattr(self, 'mcar_feat') and mcar_split_ratio: + mcar_datasets = self._build_for_mcar(mcar_split_ratio, shuffle, rep) + self.dataframe2tensors() datasets = [self._copy(_) for _ in self._get_data_idx(splits)] - user_hist, user_count = datasets[0].get_hist(True) + trn_uh, trn_uc = datasets[0].get_hist(True) + + if hasattr(self, 'mcar_feat') and mcar_split_ratio: + if self.config['mcar_split_ratio'][0] != 0: + datasets[0].mcar_feat_for_train = mcar_datasets[0].inter_feat + self.mcar_for_train_index = mcar_datasets[0].data_index + mcar_trn_uh, mcar_trn_uc = mcar_datasets[0].get_hist(True) + trn_uh = torch.cat((trn_uh, mcar_trn_uh), dim=-1).sort(dim=-1, descending=True).values + trn_uc = trn_uc + mcar_trn_uc + for i, r in enumerate(ratio_or_num[1:]): + if r == 0: + assert mcar_split_ratio[i+1] > 0 + datasets[i+1] = mcar_datasets[i+1] + else: + assert mcar_split_ratio[i+1] == 0 + for d in datasets[:2]: - d.user_hist = user_hist - d.user_count = user_count + d.user_hist = trn_uh + d.user_count = trn_uc if len(datasets) > 2: assert len(datasets) == 3 - uh, uc = datasets[1].get_hist(True) - uh = torch.cat((user_hist, uh), dim=-1).sort(dim=-1, descending=True).values - uc = uc + user_count - datasets[-1].user_hist = uh - datasets[-1].user_count = uc + val_uh, val_uc = datasets[1].get_hist(True) + datasets[-1].user_hist = torch.cat((trn_uh, val_uh), dim=-1).sort(dim=-1, descending=True).values + datasets[-1].user_count = trn_uc + val_uc + return 
    def _build_for_mcar(self, mcar_split_ratio, shuffle, rep):
        """Split the MCAR interactions into train/valid/test datasets.

        Builds the splits by re-running ``_build`` on a shallow copy of this
        dataset whose ``inter_feat`` is replaced by ``self.mcar_feat``.

        Args:
            mcar_split_ratio: list of 3 floats — train/valid/test ratio for
                the MCAR interactions.
            shuffle: whether to shuffle before splitting.
            rep: whether repetitive interactions are allowed.

        Returns:
            list: the train/valid/test datasets built from ``mcar_feat``.
        """
        # shallow copy: shares feature tables with self except inter_feat
        mcar_d = copy.copy(self)
        mcar_d.inter_feat = self.mcar_feat
        # drop attrs that belong to the MNAR data so _build recomputes/skips them
        del mcar_d.mcar_feat
        del mcar_d.first_item_idx
        datasets = mcar_d._build(mcar_split_ratio, shuffle, 'user_entry', rep)
        return datasets
class UBPRDataset(TripletDataset):
    """Triplet dataset for UBPR that attaches uniformly sampled items
    (with labels) to each batch instead of plain negatives."""

    def _get_neg_data(self, data: Dict):
        """sample mixed pos and neg items

        Depending on ``config['sample_type']``:
          * ``'mixed'``   — sample items uniformly from the whole catalog and
            label each 1/0 according to whether it appears in the user history;
          * ``'neg only'`` — sample only items outside the history, all
            labeled 0.
        Adds ``sampled_items`` and ``sampled_labels`` to ``data``.
        """
        if 'user_hist' not in data:
            # truncate the stored history matrix to this batch's longest history
            user_count = self.user_count[data[self.fuid]].max()
            user_hist = self.user_hist[data[self.fuid]][:, 0:user_count]
        else:
            user_hist = data['user_hist']
        if self.config['sample_type'] == 'mixed':
            sampled_id, sampled_label = uniform_sampling(
                data[self.frating].size(0),
                self.num_items,
                self.neg_count,
                user_hist
            )
        elif self.config['sample_type'] == 'neg only':
            sampled_id = uniform_neg_sampling(
                data[self.frating].size(0),
                self.num_items,
                self.neg_count,
                user_hist
            )
            # negatives only: every sampled item gets label 0
            sampled_label = torch.zeros_like(sampled_id)
        else:
            raise ValueError(f'sample_type should be `mixed` or `neg only`.')
        data['sampled_items'] = sampled_id
        data['sampled_labels'] = sampled_label
        return data
+ the keys of loaders are natural numbers starting at 0 + """ + def __init__(self, loaders): + self.loaders = loaders + + def __iter__(self): + self.iters = [] + for l in self.loaders: + self.iters.append(iter(l)) + return self + + def __next__(self): + batch = next(self.iters[0]) + batch['Loader'] = torch.zeros_like(list(batch.values())[0]) + for i, l in enumerate(self.iters[1:]): + for k, v in next(l).items(): + batch[k] = torch.cat((batch[k], v)) + batch['Loader'] = torch.cat((batch['Loader'], (i+1)*torch.ones_like(v))) + return batch + class DatasetFromSampler(Dataset): """Dataset to create indexes from `Sampler`. @@ -1760,7 +1921,7 @@ def __iter__(self) -> Iterator[int]: return iter(itemgetter(*indexes_of_indexes)(subsampler_indexes)) -def uniform_sampling( +def uniform_neg_sampling( num_queries: int, num_items: int, num_neg: int, @@ -1794,4 +1955,80 @@ def uniform_sampling( neg_idx[isin_id] = torch.randint(1, num_items, size=(len(isin_id)), device=device) isin_id = torch.tensor( [id for id in isin_id if neg_idx[id] in user_hist[id // num_neg]], device=device) - return neg_idx \ No newline at end of file + return neg_idx + + +def popular_sampling_with_margin( + num_queries: int, + num_neg: int, + pos_items: torch.Tensor, + user_hist: torch.Tensor, + pop: torch.Tensor, + margin_up: float, + margin_down: float, + pool: float + ): + device = user_hist.device + neg_items = torch.full((num_queries, num_neg), -1, device=device) + neg_items = neg_items.reshape(num_queries, -1) # B x Neg + + pop_items = (pop > (pop[pos_items] + margin_up).unsqueeze(-1).expand(-1, len(pop))).float() + unpop_items = (pop < (pop[pos_items] - margin_down).unsqueeze(-1).expand(-1, len(pop))).float() + + _idx = torch.arange(user_hist.size(0), device=device).view(-1, 1).expand_as(user_hist) + pop_items[_idx, user_hist] = 0.0 + unpop_items[_idx, user_hist] = 0.0 + + # To avoid probabiliy = 0 + pop_items[:, 0] = torch.where(pop_items.sum(-1) > 0, pop_items[:, 0], torch.ones(num_queries, 
def uniform_sampling(
    num_queries: int,
    num_items: int,
    num_neg: int,
    user_hist: torch.Tensor = None
    ):
    """Uniformly sample ``num_neg`` items per query and label each sample.

    Items are drawn uniformly from the real item ids ``[1, num_items - 1]``
    (id 0 is the padding item). A sampled item is labeled 1 when it appears
    in the corresponding row of ``user_hist`` (an observed positive),
    otherwise 0 — this is the "mixed" sampling used by UBPR.

    Args:
        num_queries: number of queries (batch size).
        num_items: total number of items including the padding item 0.
        num_neg: number of samples per query.
        user_hist: [num_queries, L] tensor of historical item ids,
            padded with 0.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            sampled item ids of shape [num_queries, num_neg] and their
            float 0/1 labels of the same shape.

    Note:
        The previous implementation sampled ``id`` in ``[0, num_items - 2]``,
        returned ``id + 1`` but looked the label up at column ``id`` — i.e.
        the label described item ``returned_id - 1``, and the always-set
        padding column 0 mislabeled every draw of item 1. Sampling the real
        ids directly removes the off-by-one.
    """
    device = user_hist.device
    # draw real item ids directly; torch.randint's upper bound is exclusive
    id = torch.randint(1, num_items, (num_queries, num_neg), device=device)

    # one-hot style membership table of each user's history
    _idx = torch.arange(user_hist.size(0), device=device).view(-1, 1)
    hist_weight = torch.zeros(num_queries, num_items, device=device)
    hist_weight[_idx, user_hist] = 1
    hist_weight[:, 0] = 0  # column 0 only marks padding, never a real item

    # label = 1 iff the sampled item itself is in the user's history
    label = hist_weight[_idx, id].float()

    return id, label
a/recstudio/model/basemodel/basemodel.yaml +++ b/recstudio/model/basemodel/basemodel.yaml @@ -19,9 +19,10 @@ data: # params related to dataset neg_count: 0 sampler: ~ # [uniform] shuffle: True - split_mode: user_entry # [user, entry, user_entry] - split_ratio: [0.8,0.1,0.1] # list or int type, list type for split by ratio, int type for leave one out - + split_mode: user_entry # [user, entry, user_entry] + mcar_sampling_ratio: ~ + split_ratio: [0.8,0.1,0.1] # list or int type, list type for split by ratio, int type for leave one out; to split MNAR + mcar_split_ratio: ~ # a list of 3 floats for split by ratio; corresponds to mcar for trn/val/tst model: embed_dim: 64 # embedding dimension for embedding layers, usually for item and user embeddings @@ -29,7 +30,7 @@ model: train: - accelerator: gpu # [cpu, gpu, dp] + accelerator: cpu # [cpu, gpu, dp] # ann: {index: 'IVFx,Flat', parameter: ~} ## 1 HNSWx,Flat; 2 Flat; 3 IVFx,Flat ## {nprobe: 1} {efSearch: 1} ann: ~ @@ -38,7 +39,7 @@ train: early_stop_mode: max early_stop_patience: 10 - epochs: 1000 + epochs: 50 gpu: 1 grad_clip_norm: ~ init_method: xavier_normal # [xavier_normal, normal] diff --git a/recstudio/model/basemodel/baseretriever.py b/recstudio/model/basemodel/baseretriever.py index 14d4fa8..f537ac9 100644 --- a/recstudio/model/basemodel/baseretriever.py +++ b/recstudio/model/basemodel/baseretriever.py @@ -2,6 +2,7 @@ import inspect import recstudio.eval as eval import torch +from torch import Tensor import torch.nn.functional as F from ..scorer import * from . 
import Recommender @@ -146,20 +147,24 @@ def forward( return_query: bool = False, return_item: bool = False, return_neg_item: bool = False, - return_neg_id: bool = False + return_neg_id: bool = False, + query: Tensor = None, + neg_item_idx: Tensor = None, + log_pos_prob: Tensor = None, + log_neg_prob: Tensor = None ): # query_vec, pos_item_vec, neg_item_vec, output = {} pos_items = self._get_item_feat(batch) pos_item_vec = self.item_encoder(pos_items) if self.sampler is not None: - if self.neg_count is None: + if not self.neg_count: raise ValueError("`negative_count` value is required when " "`sampler` is not none.") - - (log_pos_prob, neg_item_idx, log_neg_prob), query = self.sampling(batch=batch, num_neg=self.neg_count, - excluding_hist=self.config['train'].get('excluding_hist', False), - method=self.config['train'].get('sampling_method', 'none'), return_query=True) + if neg_item_idx is None: + (log_pos_prob, neg_item_idx, log_neg_prob), query = self.sampling(batch=batch, num_neg=self.neg_count, + excluding_hist=self.config['train'].get('excluding_hist', False), + method=self.config['train'].get('sampling_method', 'none'), return_query=True) pos_score = self.score_func(query, pos_item_vec) if batch[self.fiid].dim() > 1: pos_score[batch[self.fiid] == 0] = -float('inf') # padding diff --git a/recstudio/model/basemodel/recommender.py b/recstudio/model/basemodel/recommender.py index 0ccad9c..b90a2ff 100644 --- a/recstudio/model/basemodel/recommender.py +++ b/recstudio/model/basemodel/recommender.py @@ -239,6 +239,7 @@ def current_epoch_trainloaders(self, nepoch) -> Tuple: Returns: list or dict or Dataloader : the train loaders used in the current epoch bool : whether to combine the train loaders or use them alternately in one epoch. + bool : whether to concat the train loaders in one epoch. 
""" # use nepoch to config current trainloaders combine = False diff --git a/recstudio/model/debias/cause.py b/recstudio/model/debias/cause.py new file mode 100644 index 0000000..f8afbb5 --- /dev/null +++ b/recstudio/model/debias/cause.py @@ -0,0 +1,66 @@ +import copy +import torch +from .debiasedretriever import DebiasedRetriever +from ..loss_func import BCEWithLogitLoss +from recstudio.data.dataset import ConcatedLoaders +from recstudio.model import basemodel +from recstudio.model.debias.debiasedretriever import DebiasedQueryEncoder, DebiasedItemEncoder + +r""" +CausE +######### + +Paper Reference: + Causal Embeddings for Recommendation (RecSys'18) + https://dl.acm.org/doi/10.1145/3240323.3240360 +""" + +class CausE(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('CausE') + parent_parser.add_argument("--method", type=float, default='control', help='eval method') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + self.backbone['control'].loss_fn = BCEWithLogitLoss() + self.backbone['treatment'].loss_fn = BCEWithLogitLoss() + self.query_encoder.query_encoders['treatment'] = \ + self.backbone['treatment'].query_encoder = self.backbone['control'].query_encoder + + + def _get_query_encoder(self, train_data): + return DebiasedQueryEncoder(self.backbone, lambda d: d[self.config['eval']['method']]) + + def _get_item_encoder(self, train_data): + return DebiasedItemEncoder(self.backbone, lambda d: d[self.config['eval']['method']]) + + def _get_item_vector(self): + return self.backbone[self.config['eval']['method']]._get_item_vector() + + def _get_final_loss(self, loss : dict, output : dict, batch : dict): + item_c, item_t = self.item_encoder(self._get_item_feat(batch)).chunk(2, 1) + loss_dis = self.discrepancy(item_c, item_t) + return loss['control'] + loss['treatment'] + 
self.config['train']['dis_penalty'] * loss_dis + + def _get_masked_batch(self, backbone_name, batch): + masked_batch = copy.deepcopy(batch) + control = (masked_batch['Loader'] == 0) + if backbone_name == 'control': + for k, v in masked_batch.items(): + masked_batch[k] = v[control] + elif backbone_name == 'treatment': + for k, v in masked_batch.items(): + masked_batch[k] = v[~control] + return masked_batch + + def _get_train_loaders(self, train_data): + c_loader = train_data.train_loader( + batch_size=self.config['backbone']['control']['train']['batch_size'], + shuffle=True, drop_last=False) + t_loader = train_data.mcar_feat_for_train.loader( + batch_size = self.config['backbone']['treatment']['train']['batch_size'], + shuffle=True, drop_last=False) + return ConcatedLoaders([c_loader, t_loader]) \ No newline at end of file diff --git a/recstudio/model/debias/config/all.yaml b/recstudio/model/debias/config/all.yaml new file mode 100644 index 0000000..48a184e --- /dev/null +++ b/recstudio/model/debias/config/all.yaml @@ -0,0 +1,13 @@ +# data: +# mcar_sampling_ratio: ~ +# split_ratio: [0.8,0.1,0.1] # to split MNAR +# mcar_split_ratio: ~ # a list of 3 floats for split by ratio; corresponds to mcar for trn/val/tst + +train: + co_sampling: True + + eta: 1.0 + truncation: 0.2 + eps: 1e-6 + + \ No newline at end of file diff --git a/recstudio/model/debias/config/cause.yaml b/recstudio/model/debias/config/cause.yaml new file mode 100644 index 0000000..83cb811 --- /dev/null +++ b/recstudio/model/debias/config/cause.yaml @@ -0,0 +1,26 @@ +backbone: + control: + class: PMF + loss_reduction: mean + train: + batch_size: 256 + + treatment: + class: PMF + loss_reduction: mean + train: + batch_size: 256 + +eval: + method: control + +data: + mcar_sampling_ratio: 0.1 + split_ratio: [0.8,0.1,0.1] + mcar_split_ratio: [0.1,0,0] + binarized_rating_thres: 3 + neg_count: 1 + +train: + discrepancy: l2 # l1/l2 + dis_penalty: 1.0 \ No newline at end of file diff --git 
a/recstudio/model/debias/config/dice.yaml b/recstudio/model/debias/config/dice.yaml new file mode 100644 index 0000000..69f3045 --- /dev/null +++ b/recstudio/model/debias/config/dice.yaml @@ -0,0 +1,25 @@ +backbone: + interest: + class: BPR + loss_reduction: none + + conformity: + class: BPR + loss_reduction: none + +train: + discrepancy: l1 + dis_penalty: 0.01 + + int_weight: 0.1 + con_weight: 0.1 + loss_decay: 0.9 + learning_rate: 1e-3 + +data: + neg_count: 2 + margin_up: 40 + margin_down: 40 + pool: 40 + adaptive: True + margin_decay: 0.9 \ No newline at end of file diff --git a/recstudio/model/debias/config/expomf.yaml b/recstudio/model/debias/config/expomf.yaml new file mode 100644 index 0000000..6d30746 --- /dev/null +++ b/recstudio/model/debias/config/expomf.yaml @@ -0,0 +1,8 @@ +train: + epochs: 10 + lambda_y: 1.0 + lambda_theta: 1e-5 + lambda_beta: 1e-5 + init_mu: 0.01 + alpha1: 1.0 + alpha2: 1.0 \ No newline at end of file diff --git a/recstudio/model/debias/config/ips.yaml b/recstudio/model/debias/config/ips.yaml new file mode 100644 index 0000000..716b0c1 --- /dev/null +++ b/recstudio/model/debias/config/ips.yaml @@ -0,0 +1,14 @@ +backbone: + IPS: + class: PMF + loss_reduction: none + train: + learning_rate: 0.01 + +train: + eta: 1.0 + +data: + neg_count: 1 + + \ No newline at end of file diff --git a/recstudio/model/debias/config/ipw.yaml b/recstudio/model/debias/config/ipw.yaml new file mode 100644 index 0000000..a9243df --- /dev/null +++ b/recstudio/model/debias/config/ipw.yaml @@ -0,0 +1,14 @@ +data: + neg_count: 1 + low_rating_thres: ~ + binarized_rating_thres: 3.0 + min_user_inter: 10 + min_item_inter: 10 + +train: + lambda_theta: 1e-7 + lambda_beta: 1e-7 + truncation: 1 + +eval: + method: conditional # conditional / marginal \ No newline at end of file diff --git a/recstudio/model/debias/config/macr.yaml b/recstudio/model/debias/config/macr.yaml new file mode 100644 index 0000000..642d298 --- /dev/null +++ 
b/recstudio/model/debias/config/macr.yaml @@ -0,0 +1,28 @@ +train: + c: 0 + alpha: 1e-3 + beta: 1e-3 + +data: + binarized_rating_thres: 3 + low_rating_thres: 3 + neg_count: 1 + +backbone: + matching: + class: PMF + loss_reduction: mean + +user_module: + mlp_layers: [64, 32, 1] + activation_func: relu + bias: True + batch_norm: False + dropout: 0.0 + +item_module: + mlp_layers: [64, 32, 1] + activation_func: relu + bias: True + batch_norm: False + dropout: 0.0 \ No newline at end of file diff --git a/recstudio/model/debias/config/pda.yaml b/recstudio/model/debias/config/pda.yaml new file mode 100644 index 0000000..7db7ada --- /dev/null +++ b/recstudio/model/debias/config/pda.yaml @@ -0,0 +1,18 @@ +backbone: + PDA: + class: BPR + loss_reduction: mean + + +data: + negative_count: 1 + # split_mode: time + # time_slot: 9 + + +train: + eta: 0.02 + popularity: global # global/local + +eval: + method: PDA # PD/PDA \ No newline at end of file diff --git a/recstudio/model/debias/config/relmf.yaml b/recstudio/model/debias/config/relmf.yaml new file mode 100644 index 0000000..72aa0f9 --- /dev/null +++ b/recstudio/model/debias/config/relmf.yaml @@ -0,0 +1,9 @@ +backbone: + RelMF: + class: PMF + +train: + eta: 0.5 + +data: + neg_count: 5 \ No newline at end of file diff --git a/recstudio/model/debias/config/ubpr.yaml b/recstudio/model/debias/config/ubpr.yaml new file mode 100644 index 0000000..3779302 --- /dev/null +++ b/recstudio/model/debias/config/ubpr.yaml @@ -0,0 +1,11 @@ +backbone: + UBPR: + class: BPR + loss_reduction: none + +train: + eta: 1 + +data: + neg_count: 2 + sample_type: neg only # mixed \ No newline at end of file diff --git a/recstudio/model/debias/debiasedretriever.py b/recstudio/model/debias/debiasedretriever.py new file mode 100644 index 0000000..c724c38 --- /dev/null +++ b/recstudio/model/debias/debiasedretriever.py @@ -0,0 +1,186 @@ +from .. 
import loss_func, scorer +from ..basemodel import BaseRetriever +from ..loss_func import FullScoreLoss +from recstudio.data import dataset +from recstudio.utils import get_model +import torch +from typing import Dict, List, Optional, Tuple, Union + +class DebiasedRetriever(BaseRetriever): + def __init__(self, config: Dict = None, **kwargs): + super().__init__(config, **kwargs) + self.backbone = torch.nn.ModuleDict() + + if 'propensity' in kwargs: + assert isinstance(kwargs['propensity'], torch.nn.Module), \ + "propensity must be torch.nn.Module" + self.propensity = kwargs['propensity'] + else: + self.propensity = None + + if 'discrepancy' in kwargs: + assert isinstance(kwargs['discrepancy'], torch.nn.Module), \ + "discrepancy must be torch.nn.Module" + self.discrepancy = kwargs['discrepancy'] + else: + self.discrepancy = self._get_discrepancy() + + def _get_dataset_class(): + return dataset.TripletDataset + + def _init_model(self, train_data): + self._add_backbone(train_data) + super()._init_model(train_data) + self.propensity = self._get_propensity(train_data) if not self.propensity else self.propensity + + def _get_sampler(self, train_data): + return None + + def _get_propensity(self, train_data): + return None + + def _get_discrepancy(self): + if 'discrepancy' not in self.config['train'].keys(): + return None + elif self.config['train']['discrepancy'].lower() == 'l1': + return loss_func.L1Loss() + elif self.config['train']['discrepancy'].lower() == 'l2': + return loss_func.MSELoss() + elif self.config['train']['discrepancy'].lower() == 'dcor': + return loss_func.dCorLoss() + elif self.config['train']['discrepancy'].lower() == 'cos': + return scorer.CosineScorer(reduction='mean') + else: + raise ValueError(f"{self.config['train']['discrepancy']} is unsupportable.") + + def _add_backbone(self, train_data): + for name in self.config['backbone'].keys(): + if name in self.backbone.keys(): + raise ValueError(f'Backbone name {name} appears more than one time.') + 
model_class, model_conf = get_model(self.config['backbone'][name]['class']) + backbone = model_class(model_conf) + backbone._init_model(train_data) + self.backbone[name] = backbone + + def _get_masked_batch(self, backbone_name, batch): + return batch + + def forward(self, batch, + return_query=False, return_item=False, + return_neg_item=False, return_neg_id=False): + query, neg_item_idx, log_pos_prob, log_neg_prob = None, None, None, None + if self.config['train']['co_sampling']: + if self.sampler is not None: + if not self.neg_count: + raise ValueError("`negative_count` value is required when " + "`sampler` is not none.") + (log_pos_prob, neg_item_idx, log_neg_prob), query = self.sampling(batch=batch, num_neg=self.neg_count, + excluding_hist=self.config['train'].get('excluding_hist', False), + method=self.config['train'].get('sampling_method', 'none'), return_query=True) + query = self.query_encoder.split(query) + output = {} + for name, backbone in self.backbone.items(): + masked_batch = self._get_masked_batch(name, batch) + output[name] = backbone.forward( + masked_batch, + isinstance(backbone.loss_fn, FullScoreLoss), + return_query=True, + return_item=True, + return_neg_item=True, + return_neg_id=True, + query=query[name], + neg_item_idx=neg_item_idx, + log_pos_prob=log_pos_prob, + log_neg_prob=log_neg_prob) + return output + + def training_step(self, batch, nepoch=None, loader_idx=None, batch_idx=None): + output = self.forward(batch) + loss = {} + for name, backbone in self.backbone.items(): + score = self._get_score(name, output, self._get_masked_batch(name, batch)) + if backbone.loss_fn is not None: + loss[name] = backbone.loss_fn( + reduction=self.config['backbone'][name]['loss_reduction'], + **score) + loss_value = self._get_final_loss(loss, output, batch) + return loss_value + + def _get_score(self, name, output, batch): + score = output[name]['score'] + score['label'] = batch[self.frating] + return score + + def _get_final_loss(self, loss : Dict, output 
: Dict, batch : Dict): + return sum(loss.values()) + + + """Below is all for evaluation.""" + def _get_query_encoder(self, train_data): + return DebiasedQueryEncoder(self.backbone) + + def _get_item_encoder(self, train_data): + return DebiasedItemEncoder(self.backbone) + + def _get_query_feat(self, data): + query_feat = {} + for k, v in self.backbone.items(): + query_feat[k] = v._get_query_feat(data) + return query_feat + + def _get_item_feat(self, data): + item_feat = {} + for k, v in self.backbone.items(): + item_feat[k] = v._get_item_feat(data) + return item_feat + + def _get_item_vector(self): + item_vector = {} + for name, backbone in self.backbone.items(): + item_vector[name] = backbone._get_item_vector() + item_vector = torch.hstack([v for _, v in item_vector.items()]) + return item_vector + +class DebiasedQueryEncoder(torch.nn.Module): + def __init__(self, backbone, + concat_func=lambda d: torch.hstack([v for _, v in d.items()]), + split_func=lambda x, num: x.chunk(num, dim=-1)): + """func(function): decide how to get the query vector""" + super().__init__() + self.backbone_names = list(backbone.keys()) + self.concat_func = concat_func + self.split_func = split_func + self.query_encoders = {} + for k, v in backbone.items(): + self.query_encoders[k] = v.query_encoder + def forward(self, input): + """input (dict): {backbone name: corresponding query feat}""" + query = {} + for k, v in self.query_encoders.items(): + query[k] = v(input[k]) + query = self.concat_func(query) + return query + def split(self, query): + if query is not None: + queries = self.split_func(query, len(self.backbone_names)) + query = {k: v for k, v in zip(self.backbone_names, queries)} + else: + query = {k: None for k in self.backbone_names} + return query + +class DebiasedItemEncoder(torch.nn.Module): + def __init__(self, backbone, + func=lambda d: torch.hstack([v for _, v in d.items()])): + """choice(str): one of backbone names or `all`""" + super().__init__() + self.func = func + 
self.item_encoders = {} + for k, v in backbone.items(): + self.item_encoders[k] = v.item_encoder + def forward(self, input): + """input (dict): {backbone name: corresponding item feat}""" + item = {} + for k, v in self.item_encoders.items(): + item[k] = v(input[k]) + item = self.func(item) + return item \ No newline at end of file diff --git a/recstudio/model/debias/dice.py b/recstudio/model/debias/dice.py new file mode 100644 index 0000000..13899ca --- /dev/null +++ b/recstudio/model/debias/dice.py @@ -0,0 +1,91 @@ +import torch +from .debiasedretriever import DebiasedRetriever +from recstudio.data import DICEDataset +from recstudio.model import basemodel + +r""" +DICE +######### + +Paper Reference: + Disentangling User Interest and Conformity for Recommendation with Causal Embedding (WWW'21) + https://doi.org/10.1145/3442381.3449788 +""" + +class DICE(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('DICE') + parent_parser.add_argument("--discrepancy", type=str, default='l1', help='discrepency loss function') + parent_parser.add_argument("--dis_penalty", type=float, default=0.01, help='discrepency penalty') + parent_parser.add_argument("--int_weight", type=float, default=0.1, help='weight for interest term in the loss function') + parent_parser.add_argument("--con_weight", type=float, default=0.1, help='weight for popularity term in the loss function') + parent_parser.add_argument("--margin_up", type=float, default=40.0, help='margin for negative but more popular sampling') + parent_parser.add_argument("--margin_down", type=float, default=40.0, help='margin for negative and less popular sampling') + parent_parser.add_argument("--pool", type=int, default=40, help='pool for negative sampling') + parent_parser.add_argument("--adaptive", type=bool, default=True, help='adapt hyper-parameters or not') + 
parent_parser.add_argument("--margin_decay", type=float, default=0.9, help='decay of margin') + parent_parser.add_argument("--loss_decay", type=float, default=0.9, help='decay of loss') + return parent_parser + + def _get_dataset_class(): + return DICEDataset + + def _get_final_loss(self, loss : dict, output : dict, batch : dict): + query_int = output['interest']['query'] + query_con = output['conformity']['query'] + pos_item_int = output['interest']['item'] + pos_item_con = output['conformity']['item'] + neg_item_int = output['interest']['neg_item'] + neg_item_con = output['conformity']['neg_item'] + item_int = torch.vstack((pos_item_int, neg_item_int.view(-1, pos_item_int.shape[1]))) + item_con = torch.vstack((pos_item_con, neg_item_con.view(-1, pos_item_con.shape[1]))) + loss_dis = self.discrepancy(query_int, query_con) + self.discrepancy(item_int, item_con) + loss_click = self.backbone['interest'].loss_fn( + pos_score=output['interest']['score']['pos_score'] + output['conformity']['score']['pos_score'], + neg_score=output['interest']['score']['neg_score'] + output['conformity']['score']['neg_score'], + label=None, log_pos_prob=None, log_neg_prob=None) + + mask = batch['mask'] + loss_int = torch.mean(mask * loss['interest']) + loss_con = torch.mean(~mask * loss['conformity']) + \ + torch.mean(mask * self.backbone['conformity'].loss_fn( + pos_score=output['conformity']['score']['neg_score'], + neg_score=output['conformity']['score']['pos_score'], + label=None, log_pos_prob=None, log_neg_prob=None + )) + + return self.int_weight * loss_int + self.con_weight * loss_con + \ + loss_click - self.config['train']['dis_penalty'] * loss_dis + + def _adapt(self, current_epoch): + if not hasattr(self, 'last_epoch'): + self.last_epoch = 0 + self.int_weight = self.config['train']['int_weight'] + self.con_weight = self.config['train']['con_weight'] + if current_epoch > self.last_epoch: + self.last_epoch = current_epoch + self.int_weight = self.int_weight * 
self.config['train']['loss_decay'] + self.con_weight = self.con_weight * self.config['train']['loss_decay'] + + def training_step(self, batch, nepoch): + self._adapt(nepoch) + return super().training_step(batch, nepoch) + + def forward(self, batch): + query = self.query_encoder(self._get_query_feat(batch)) + query = self.query_encoder.split(query) + neg_item_idx = batch['neg_items'] + output = {} + for name, backbone in self.backbone.items(): + output[name] = backbone.forward( + batch, + False, + return_query=True, + return_item=True, + return_neg_item=True, + return_neg_id=True, + query=query[name], + neg_item_idx=neg_item_idx) + return output \ No newline at end of file diff --git a/recstudio/model/debias/expomf.py b/recstudio/model/debias/expomf.py new file mode 100644 index 0000000..0f168cf --- /dev/null +++ b/recstudio/model/debias/expomf.py @@ -0,0 +1,123 @@ +import torch +import numpy as np +from recstudio.model import basemodel, scorer +from recstudio.data.advance_dataset import ALSDataset + +r""" +ExpoMF +######### + +Paper Reference: + Modeling User Exposure in Recommendation (WWW'16) + https://dl.acm.org/doi/10.1145/2872427.2883090 +""" + +class ExpoMF(basemodel.BaseRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('ExpoMF') + parent_parser.add_argument("--lambda_y", type=float, default=1.0, help='lambda_y for ExpoMF') + parent_parser.add_argument("--lambda_theta", type=float, default=1e-5, help='lambda_theta for ExpoMF') + parent_parser.add_argument("--lambda_beta", type=float, default=1e-5, help='lambda_beta for ExpoMF') + parent_parser.add_argument("--init_mu", type=float, default=0.01, help='init mu for ExpoMF') + parent_parser.add_argument("--alpha1", type=float, default=1.0, help='alpha1 for ExpoMF') + parent_parser.add_argument("--alpha2", type=float, default=1.0, help='alpha2 for ExpoMF') + return parent_parser + + def 
_init_model(self, train_data): + super()._init_model(train_data) + self.register_buffer('a', torch.ones(train_data.num_users, train_data.num_items)) + self.register_buffer('mu', self.config['train']['init_mu'] * torch.ones(train_data.num_items)) + + def _init_parameter(self): + super()._init_parameter() + self.query_encoder.weight.requires_grad = False + self.item_encoder.weight.requires_grad = False + + def _get_dataset_class(): + return ALSDataset + + def _get_query_encoder(self, train_data): + return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) + + def _get_item_encoder(self, train_data): + return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) + + def _get_score_func(self): + return scorer.InnerProductScorer() + + def _get_train_loaders(self, train_data): + loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], shuffle = True, drop_last = False) + loader_T = train_data.transpose().train_loader( + batch_size = self.config['train']['batch_size'], shuffle = True, drop_last = False) + return [loader, loader_T] + + def current_epoch_trainloaders(self, nepoch): + return self.trainloaders[nepoch % len(self.trainloaders)], False + + def training_epoch(self, nepoch): + super().training_epoch(nepoch) + self.mu = (self.config['train']['alpha1'] + torch.sum(self.a, dim=0) - 1) / \ + (self.config['train']['alpha1'] + self.config['train']['alpha2'] + self.a.shape[0] - 2) + return torch.tensor(0.) 
+ + def training_step(self, batch): + a = self._expectation(batch) + self._maximization(batch, a) + + def _expectation(self, batch): + """ + Compute the posterior of exposure latent variables a_{ui} + """ + if batch[self.fuid].dim() == 1: + mu = self.mu + P_y0_given_a1 = np.sqrt(self.config['train']['lambda_y'] / (2 * torch.pi)) * \ + torch.exp(-self.config['train']['lambda_y'] * + (self.query_encoder(self._get_query_feat(batch)) @ # B x D + self.item_encoder.weight.transpose(0, 1)) # D x num_items + **2 / 2) # -> B x num_items + else: + mu = self.mu[batch[self.fiid]].unsqueeze(-1) + P_y0_given_a1 = np.sqrt(self.config['train']['lambda_y'] / (2 * torch.pi)) * \ + torch.exp(-self.config['train']['lambda_y'] * + (self.item_encoder(self._get_item_feat(batch)) @ # B x D + self.query_encoder.weight.transpose(0, 1)) # D x num_users + **2 / 2) # -> B x num_users + + a = (P_y0_given_a1 + 1e-8) / (P_y0_given_a1 + 1e-8 + (1 - mu) / mu) + for i, j in batch[self.frating].nonzero(): + a[i, j] = torch.tensor(1.) 
+ + # update self.a + if batch[self.fuid].dim() == 1: + for i, uid in enumerate(batch[self.fuid]): + for j, iid in enumerate(batch[self.fiid][i]): + self.a[uid, iid] = a[i, j] + return a + + def _maximization(self, batch, a): + """ + Update latent factors theta and beta + """ + if batch[self.fuid].dim() == 1: + for i, uid in enumerate(batch[self.fuid]): + A = self.config['train']['lambda_y'] * \ + (a[i] * self.item_encoder.weight.transpose(0, 1)) @ \ + self.item_encoder.weight + \ + self.config['train']['lambda_theta'] * torch.eye(self.embed_dim, device=self.device) # D x D + B = self.config['train']['lambda_y'] * ((a[i][batch[self.fiid][i]] * batch[self.frating][i]) @ \ + self.item_encoder.weight[batch[self.fiid][i]]).unsqueeze(-1) # D x 1 + self.query_encoder.weight[uid] = torch.linalg.solve(A, B).squeeze(-1) + else: + for i, iid in enumerate(batch[self.fiid]): + A = self.config['train']['lambda_y'] * \ + (a[i] * self.query_encoder.weight.transpose(0, 1)) @ \ + self.query_encoder.weight + \ + self.config['train']['lambda_beta'] * torch.eye(self.embed_dim, device=self.device) # D x D + B = self.config['train']['lambda_y'] * ((a[i][batch[self.fuid][i]] * batch[self.frating][i]) @ \ + self.query_encoder.weight[batch[self.fuid][i]]).unsqueeze(-1) # D x 1 + self.item_encoder.weight[iid] = torch.linalg.solve(A, B).squeeze(-1) + + def _get_sampler(self, train_data): + return None \ No newline at end of file diff --git a/recstudio/model/debias/ips.py b/recstudio/model/debias/ips.py new file mode 100644 index 0000000..8fa887b --- /dev/null +++ b/recstudio/model/debias/ips.py @@ -0,0 +1,41 @@ +import torch +from .debiasedretriever import DebiasedRetriever +from ..loss_func import PointwiseLoss +from recstudio.model import basemodel +from recstudio.model.module.propensity import Popularity + +r""" +IPS +###### + +Paper Reference: + Recommendations as treatments: debiasing learning and evaluation (ICML'16) + https://dl.acm.org/doi/10.5555/3045390.3045567 +""" + +class 
IPS(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('IPS') + parent_parser.add_argument("--eta", type=float, default=1, help='adjust propensities') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + for name, backbone in self.backbone.items(): + if not isinstance(backbone.loss_fn, PointwiseLoss): + raise ValueError('IPS asks for PointwiseLoss ' + f'rather than {backbone.loss_fn}.') + + + def _get_propensity(self, train_data): + propensity = Popularity(self.config['train']['eta']) + propensity.fit(train_data) + return propensity + + def _get_final_loss(self, loss : dict, output : dict, batch : dict): + unreweighted_loss = loss['IPS'] + weight = 1 / self.propensity(batch[self.fiid]) + reweighted_loss = torch.mean(weight * unreweighted_loss) + return reweighted_loss \ No newline at end of file diff --git a/recstudio/model/debias/ipw.py b/recstudio/model/debias/ipw.py new file mode 100644 index 0000000..aee511e --- /dev/null +++ b/recstudio/model/debias/ipw.py @@ -0,0 +1,119 @@ +import torch +from ..loss_func import l2_reg_loss_fn +from recstudio.model import basemodel, scorer +from recstudio.data.advance_dataset import ALSDataset +from recstudio.model.module.propensity import Popularity + +r""" +IPW +######### + +Paper Reference: + Causal Inference for Recommendation + http://www.its.caltech.edu/~fehardt/UAI2016WS/papers/Liang.pdf +""" +class IPWScorer(scorer.InnerProductScorer): + def __init__(self, eval_method, pop): + super().__init__() + self.is_eval = False + self.eval_method = eval_method + self.register_buffer('pop', pop) + def forward(self, query, items): + score = super().forward(query, items) + if self.eval_method == 'marginal' and self.is_eval: + score = self.pop * score + return score + +class IPW(basemodel.BaseRetriever): + + def add_model_specific_args(parent_parser): + 
parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('IPW') + parent_parser.add_argument("--lambda_theta", type=float, default=1e-5, help='lambda_theta for IPW') + parent_parser.add_argument("--lambda_beta", type=float, default=1e-5, help='lambda_beta for IPW') + parent_parser.add_argument("--method", type=str, default='conditional', help='prediction method') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + self.propensity = self._get_propensity(train_data) + self.score_func = self._get_score_func() + + def _init_parameter(self): + super()._init_parameter() + self.query_encoder.weight.requires_grad = False + self.item_encoder.weight.requires_grad = False + + def _get_dataset_class(): + return ALSDataset + + def _get_propensity(self, train_data): + propensity = Popularity(self.config['train']['eta'], + self.config['train']['truncation'], + self.config['train']['eps']) + propensity.fit(train_data) + return propensity + + def _get_score_func(self): + if not hasattr(self, 'propensity'): + return None + else: + return IPWScorer(self.config['eval']['method'], self.propensity.pop[1:]) + + def _get_query_encoder(self, train_data): + return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) + + def _get_item_encoder(self, train_data): + return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) + + def _get_train_loaders(self, train_data): + loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], shuffle = True, drop_last = False) + loader_T = train_data.transpose().train_loader( + batch_size = self.config['train']['batch_size'], shuffle = True, drop_last = False) + return [loader, loader_T] + + def current_epoch_trainloaders(self, nepoch): + return self.trainloaders[nepoch % len(self.trainloaders)], False + + def training_step(self, batch): + """ + Update latent user/item factors + """ + label = 
(batch[self.frating] > 0).float() + self.score_func.is_eval = False + query_emb = self.query_encoder(self._get_query_feat(batch)) + item_emb = self.item_encoder(self._get_item_feat(batch)) + weight = 1 / self.propensity(batch[self.fiid]) + + if batch[self.fuid].dim() == 1: + for i, uid in enumerate(batch[self.fuid]): + weight_o = weight[i] + A = (weight_o * self.item_encoder.weight[batch[self.fiid][i]].transpose(0, 1)) @ \ + self.item_encoder.weight[batch[self.fiid][i]] + \ + self.config['train']['lambda_theta'] * torch.eye(self.embed_dim, device=self.device) # D x D + B = ((weight_o * batch[self.frating][i]) @ self.item_encoder.weight[batch[self.fiid][i]]).unsqueeze(-1) # D x 1 + self.query_encoder.weight[uid] = torch.linalg.solve(A, B).squeeze(-1) + pos_score = self.score_func(query_emb, item_emb) + mse_loss = (weight * (label - pos_score)**2).sum(-1) + else: + for i, iid in enumerate(batch[self.fiid]): + weight_o = weight[i] + A = (weight_o * self.query_encoder.weight[batch[self.fuid][i]].transpose(0, 1)) @ \ + self.query_encoder.weight[batch[self.fuid][i]] + \ + self.config['train']['lambda_beta'] * torch.eye(self.embed_dim, device=self.device) # D x D + B = ((weight_o * batch[self.frating][i]) @ self.query_encoder.weight[batch[self.fuid][i]]).unsqueeze(-1) # D x 1 + self.item_encoder.weight[iid] = torch.linalg.solve(A, B).squeeze(-1) + pos_score = self.score_func(item_emb, query_emb) + mse_loss = (weight.unsqueeze(-1) * (label - pos_score)**2).sum(-1) + + reg_loss = self.config['train']['lambda_theta'] * l2_reg_loss_fn(self.query_encoder.weight) + \ + self.config['train']['lambda_beta'] * l2_reg_loss_fn(self.item_encoder.weight) + loss = mse_loss + reg_loss + return {'loss': loss} + + def _test_step(self, batch, metric, cutoffs): + self.score_func.is_eval = True + return super()._test_step(batch, metric, cutoffs) + + def _get_sampler(self, train_data): + return None \ No newline at end of file diff --git a/recstudio/model/debias/macr.py 
b/recstudio/model/debias/macr.py new file mode 100644 index 0000000..fcb1d68 --- /dev/null +++ b/recstudio/model/debias/macr.py @@ -0,0 +1,69 @@ +import torch +from .debiasedretriever import DebiasedRetriever +from ..loss_func import BCELoss +from recstudio.model import basemodel, scorer +from recstudio.model.module import MLPModule + +r""" +MACR +###### + +Paper Reference: + Model-Agnostic Counterfactual Reasoning for Eliminating Popularity Bias in Recommender System (KDD'21) + https://doi.org/10.1145/3447548.3467289 +""" + +class MACR(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('MACR') + parent_parser.add_argument("--c", type=float, default=0.0, help='reference status') + parent_parser.add_argument("--alpha", type=float, default=1e-3, help='weight of user loss') + parent_parser.add_argument("--beta", type=float, default=1e-3, help='weight of item loss') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + self.backbone['matching'].loss_fn = None + self.backbone['matching'].sampler = None + + def _get_score_func(self): + # For only topk() function + class MACRScorer(scorer.InnerProductScorer): + def __init__(self, c, user_module, item_module): + super().__init__() + self.c = c + self.user_module = user_module # shared + self.item_module = item_module # shared + def forward(self, query, items): + yk = super().forward(query, items) + yu = self.user_module(query).squeeze() + yi = self.item_module(items).squeeze() + yui = (yk - self.c) * torch.outer(yu, yi) + return yui + + assert self.config['user_module']['mlp_layers'][0] == self.config['model']['embed_dim'] + assert self.config['item_module']['mlp_layers'][0] == self.config['model']['embed_dim'] + assert self.config['user_module']['mlp_layers'][-1] == 1 + assert self.config['item_module']['mlp_layers'][-1] == 1 + self.user_module = 
MLPModule(**self.config['user_module']) + self.item_module = MLPModule(**self.config['item_module']) + if self.config['user_module']['activation_func'].lower() != 'sigmoid': + self.user_module.add_modules(torch.nn.Sigmoid()) + if self.config['item_module']['activation_func'].lower() != 'sigmoid': + self.item_module.add_modules(torch.nn.Sigmoid()) + return MACRScorer(self.config['train']['c'], self.user_module, self.item_module) + + def _get_loss_func(self): + return BCELoss() + + def _get_final_loss(self, loss: dict, output: dict, batch : dict): + label = batch[self.frating] + score_u = self.user_module(output['matching']['query']).squeeze() + score_i = self.item_module(output['matching']['item']).squeeze() + score_click = torch.sigmoid(output['matching']['score']['pos_score'] * score_u * score_i) + loss_click = self.loss_fn(label=label, pos_score=score_click) + loss_u = self.loss_fn(label=label, pos_score=score_u) + loss_i = self.loss_fn(label=label, pos_score=score_i) + return loss_click + self.config['train']['alpha'] * loss_u + self.config['train']['beta'] * loss_i \ No newline at end of file diff --git a/recstudio/model/debias/pda.py b/recstudio/model/debias/pda.py new file mode 100644 index 0000000..7e45c73 --- /dev/null +++ b/recstudio/model/debias/pda.py @@ -0,0 +1,66 @@ +import torch.nn.functional as F +from .debiasedretriever import DebiasedRetriever +from recstudio.model import basemodel, scorer +from recstudio.model.module.propensity import Popularity + +r""" +PDA +######### + +Paper Reference: + Causal Intervention for Leveraging Popularity Bias in Recommendation (SIGIR'21) + https://doi.org/10.1145/3404835.3462875 +""" +class PDAEvalScorer(scorer.InnerProductScorer): + """ + For full score evaluation. 
+ """ + def __init__(self, eval_method, pop): + super().__init__() + self.eval_method = eval_method + self.register_buffer('pop', pop) + def forward(self, query, items): + f = super().forward(query, items) + elu_ = F.elu(f) + 1 + if self.eval_method == 'PD': + return elu_ + elif self.eval_method == 'PDA': + return self.pop * elu_ + +class PDA(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('PDA') + parent_parser.add_argument("--eta", type=float, default=0.02, help='gamma for PDA') + parent_parser.add_argument("--method", type=str, default='PD', help='evaluation way of PDA') + parent_parser.add_argument("--popularity", type=str, default='global', help='global or local') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + self.score_func = self._get_score_func() + + def _get_propensity(self, train_data): + if self.config['train']['popularity'].lower() == 'global': + propensity = Popularity(self.config['train']['eta'], + self.config['train']['truncation'], + self.config['train']['eps']) + propensity.fit(train_data) + return propensity + elif self.config['train']['popularity'].lower() == 'local': + raise NotImplementedError(f"Local popularity is not implemented.") + + def _get_score_func(self): + if not hasattr(self, 'propensity'): + return None + else: + return PDAEvalScorer(self.config['eval']['method'], self.propensity.pop[1:]) + + def _get_score(self, name, output, batch): + score = super()._get_score(name, output, batch) + pos_weight = self.propensity(batch[self.fiid]) + neg_weight = self.propensity(output['PDA']['neg_id']) + score['pos_score'] = pos_weight * score['pos_score'] + score['neg_score'] = neg_weight * score['neg_score'] + return score \ No newline at end of file diff --git a/recstudio/model/debias/relmf.py b/recstudio/model/debias/relmf.py new file mode 100644 index 
0000000..cd126bd --- /dev/null +++ b/recstudio/model/debias/relmf.py @@ -0,0 +1,43 @@ +from .debiasedretriever import DebiasedRetriever +from ..loss_func import BCEWithLogitLoss +from recstudio.model import basemodel +from recstudio.model.module.propensity import Popularity + +r""" +RelMF +###### + +Paper Reference: + Unbiased Recommender Learning from Missing-Not-At-Random Implicit Feedback (WSDM'20) + https://doi.org/10.1145/3336191.3371783 +""" + +class RelMF(DebiasedRetriever): + + def add_model_specific_args(parent_parser): + parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) + parent_parser.add_argument_group('RelMF') + parent_parser.add_argument("--eta", type=float, default=0.5, help='adjust propensities') + return parent_parser + + def _init_model(self, train_data): + super()._init_model(train_data) + self.backbone['RelMF'].loss_fn = None + + def _get_propensity(self, train_data): + propensity = Popularity(self.config['train']['eta'], + self.config['train']['truncation'], + self.config['train']['eps']) + propensity.fit(train_data) + return propensity + + def _get_loss_func(self): + return BCEWithLogitLoss() + + def _get_final_loss(self, loss : dict, output : dict, batch : dict): + pop = self.propensity(batch[self.fiid]) + score = output['RelMF']['score'] + label = batch[self.frating] + score['label'] = label / pop + (1 - label) * (1 - label / pop) + loss = self.loss_fn(**score) + return loss \ No newline at end of file diff --git a/recstudio/model/debias/ubpr.py b/recstudio/model/debias/ubpr.py new file mode 100644 index 0000000..c39babb --- /dev/null +++ b/recstudio/model/debias/ubpr.py @@ -0,0 +1,55 @@ +import torch +from .debiasedretriever import DebiasedRetriever +from recstudio.data import UBPRDataset +from recstudio.model import basemodel +from recstudio.model.module.propensity import Popularity + +r""" +UBPR +###### + +Paper Reference: + Unbiased Pairwise Learning from Biased Implicit Feedback (ICTIR'20) + 
class UBPR(DebiasedRetriever):
    r"""UBPR: unbiased pairwise (BPR-style) learning from biased implicit
    feedback, via inverse-propensity weights on both items of each pair.

    Paper Reference:
        Unbiased Pairwise Learning from Biased Implicit Feedback (ICTIR'20)
        https://doi.org/10.1145/3409256.3409812
    """

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Register UBPR-specific CLI arguments on top of the base recommender's."""
        parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser)
        parent_parser.add_argument_group('UBPR')
        parent_parser.add_argument("--eta", type=float, default=0.5, help='adjust propensities')
        return parent_parser

    @staticmethod
    def _get_dataset_class():
        # UBPRDataset attaches 'sampled_items' / 'sampled_labels' to each batch.
        return UBPRDataset

    def _get_propensity(self, train_data):
        """Popularity-based propensity: theta_i ~ (normalized item popularity)^eta."""
        propensity = Popularity(self.config['train']['eta'],
                                self.config['train']['truncation'],
                                self.config['train']['eps'])
        propensity.fit(train_data)
        return propensity

    def forward(self, batch):
        """Run every backbone on the batch, pairing each positive with the
        dataset-provided sampled items instead of a sampler."""
        query = self.query_encoder(self._get_query_feat(batch))
        query = self.query_encoder.split(query)
        neg_item_idx = batch['sampled_items']
        output = {}
        for name, backbone in self.backbone.items():
            output[name] = backbone.forward(
                batch,
                False,
                return_query=True,
                return_item=True,
                return_neg_item=True,
                return_neg_id=True,
                query=query[name],
                neg_item_idx=neg_item_idx)
        return output

    def _get_final_loss(self, loss: dict, output: dict, batch: dict):
        """IPS-weighted pairwise loss.

        w_ui = 1/theta_i for the observed positive (y=1 by construction);
        w_uj = 1 - y_uj/theta_j down-weights sampled items that are themselves
        positive. Division by zero is avoided by the propensity's eps floor.
        """
        weight_i = 1 / self.propensity(batch[self.fiid]).unsqueeze(-1)
        weight_j = 1 - batch['sampled_labels'] / self.propensity(batch['sampled_items'])
        return torch.mean(weight_i * weight_j * loss['UBPR'])
""" - def forward(self, label, pos_score, all_score): + def forward(self, label, pos_score, all_score, reduction='mean'): r""" """ pass class PairwiseLoss(torch.nn.Module): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): pass class PointwiseLoss(torch.nn.Module): - def forward(self, label, pos_score): + def forward(self, label, pos_score, reduction='mean'): raise NotImplementedError(f'{type(self).__name__} is an abstrat class, \ this method would not be implemented') -class SquareLoss(PointwiseLoss): - def forward(self, label, pos_score): - if label.dim() > 1: - return torch.mean(torch.mean(torch.square(label - pos_score), dim=-1)) - else: - return torch.mean(torch.square(label - pos_score)) - class SoftmaxLoss(FullScoreLoss): - def forward(self, label, pos_score, all_score): + def forward(self, label, pos_score, all_score, reduction='mean'): if all_score.dim() > pos_score.dim(): - return torch.mean(torch.logsumexp(all_score, dim=-1) - pos_score) + loss = torch.logsumexp(all_score, dim=-1) - pos_score else: output = torch.logsumexp(all_score, dim=-1, keepdim=True) - pos_score notpadnum = torch.logical_not(torch.isinf(pos_score)).float().sum(-1) - output = torch.nan_to_num(output, posinf=0).sum(-1) / notpadnum - return torch.mean(output) + loss = torch.nan_to_num(output, posinf=0).sum(-1) / notpadnum + + if reduction == 'mean': + return torch.mean(loss) + else: + return loss class BPRLoss(PairwiseLoss): @@ -52,33 +49,40 @@ def __init__(self, dns=False): super().__init__() self.dns = dns - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): if not self.dns: loss = F.logsigmoid(pos_score.view(*pos_score.shape, 1) - neg_score) weight = F.softmax(torch.ones_like(neg_score), -1) - return -torch.mean((loss * weight).sum(-1)) + 
loss = -loss * weight + if reduction == 'mean': + return torch.mean(loss.sum(-1)) else: - loss = -torch.mean( - F.logsigmoid(pos_score - torch.max(neg_score, dim=-1))) - return loss + loss = -F.logsigmoid(pos_score - torch.max(neg_score, dim=-1)) + if reduction == 'mean': + return torch.mean(loss) + return loss class Top1Loss(BPRLoss): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): if not self.dns: loss = torch.sigmoid(neg_score - pos_score.view(*pos_score.shape, 1)) loss += torch.sigmoid(neg_score ** 2) weight = F.softmax(torch.ones_like(neg_score), -1) - return torch.mean((loss * weight).sum(-1)) + loss = (loss * weight).sum(-1) else: max_neg_score = torch.max(neg_score, dim=-1) loss = torch.sigmoid(max_neg_score-pos_score) loss = loss + torch.sigmoid(max_neg_score ** 2) - return loss + + if reduction == 'mean': + return torch.mean(loss) + else: + return loss class SampledSoftmaxLoss(PairwiseLoss): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): new_pos = pos_score - log_pos_prob new_neg = neg_score - log_neg_prob if new_pos.dim() < new_neg.dim(): @@ -86,15 +90,23 @@ def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): new_neg = torch.cat([new_pos, new_neg], dim=-1) output = torch.logsumexp(new_neg, dim=-1, keepdim=True) - new_pos notpadnum = torch.logical_not(torch.isinf(new_pos)).float().sum(-1) - output = torch.nan_to_num(output, posinf=0).sum(-1) / notpadnum - return torch.mean(output) + loss = torch.nan_to_num(output, posinf=0).sum(-1) / notpadnum + + if reduction == 'mean': + return torch.mean(loss) + else: + return loss class WeightedBPRLoss(PairwiseLoss): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, 
neg_score, log_neg_prob, reduction='mean'): loss = F.logsigmoid(pos_score.view(*pos_score.shape, 1) - neg_score) weight = F.softmax(neg_score - log_neg_prob, -1) - return -torch.mean((loss * weight).sum(-1)) + loss = -(loss * weight).sum(-1) + if reduction == 'mean': + return torch.mean(loss) + else: + return loss class BinaryCrossEntropyLoss(PairwiseLoss): @@ -102,7 +114,7 @@ def __init__(self, dns=False): super().__init__() self.dns = dns - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): # pos_score: B | B x L | B x L # neg_score: B x neg | B x L x neg | B x neg assert ((pos_score.dim() == neg_score.dim()-1) and (pos_score.shape == @@ -133,7 +145,7 @@ def _cal_weight(self, neg_score, log_neg_prob): class WeightedBinaryCrossEntropyLoss(BinaryCrossEntropyLoss): - def _cal_weight(self, neg_score, log_neg_prob): + def _cal_weight(self, neg_score, log_neg_prob, reduction='mean'): return F.softmax(neg_score - log_neg_prob, -1) @@ -143,29 +155,34 @@ def __init__(self, margin=2, num_items=None): self.margin = margin self.n_items = num_items - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): loss = torch.maximum(torch.max(neg_score, dim=-1).values - pos_score + self.margin, torch.tensor([0]).type_as(pos_score)) if self.n_items is not None: impostors = neg_score - pos_score.view(-1, 1) + self.margin > 0 rank = torch.mean(impostors, -1) * self.n_items - return torch.mean(loss * torch.log(rank + 1)) - else: + loss = loss * torch.log(rank + 1) + if reduction == 'mean': return torch.mean(loss) + else: + return loss class InfoNCELoss(SampledSoftmaxLoss): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): return 
super().forward(label, pos_score, torch.zeros_like(pos_score), - neg_score, torch.zeros_like(neg_score)) + neg_score, torch.zeros_like(neg_score), reduction) class NCELoss(PairwiseLoss): - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): new_pos = pos_score - log_pos_prob new_neg = neg_score - log_neg_prob - loss = F.logsigmoid(new_pos) + (new_neg - F.softplus(new_neg)).sum(1) - return -loss.mean() + loss = -F.logsigmoid(new_pos) + (new_neg - F.softplus(new_neg)).sum(1) + if reduction == 'mean': + return torch.mean(loss) + else: + return loss class CCLLoss(PairwiseLoss): @@ -174,7 +191,7 @@ def __init__(self, margin=0.8, neg_weight=0.3) -> None: self.margin = margin self.neg_weight = neg_weight - def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): + def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob, reduction='mean'): # pos_score: [B,] or [B, N] # neg_score: [B, num_neg] or [B, N, num_neg] pos_score = torch.sigmoid(pos_score) @@ -194,24 +211,57 @@ def l2_reg_loss_fn(*args): class BCEWithLogitLoss(PointwiseLoss): - def __init__(self, reduction: str='mean') -> None: - super().__init__() - self.reduction = reduction - - def forward(self, label, pos_score): - loss = torch.nn.functional.binary_cross_entropy_with_logits( - pos_score, label, reduction=self.reduction) + def forward(self, label, pos_score, reduction='mean'): + loss = F.binary_cross_entropy_with_logits( + pos_score, label, reduction=reduction) return loss +class BCELoss(PointwiseLoss): + def forward(self, label, pos_score, reduction='mean'): + return F.binary_cross_entropy(pos_score, label, reduction=reduction) + + class MSELoss(PointwiseLoss): - def __init__(self, threshold: float=None, reduction: str='mean') -> None: + def __init__(self, threshold: float=None) -> None: + super().__init__() + self.threshold = threshold + + def 
forward(self, label, pos_score, reduction='mean'): + if self.threshold is not None: + label = (label > self.threshold).float() + loss = F.mse_loss(pos_score, label, reduction=reduction) + return loss + + +class L1Loss(PointwiseLoss): + def __init__(self, threshold: float=None) -> None: super().__init__() self.threshold = threshold - self.reduction = reduction - def forward(self, label, pos_score): + def forward(self, label, pos_score, reduction='mean'): if self.threshold is not None: label = (label > self.threshold).float() - loss = torch.nn.functional.mse_loss(pos_score, label) + loss = F.l1_loss(pos_score, label, reduction=reduction) return loss + + +class dCorLoss(PointwiseLoss): + def forward(self, label, pos_score, reduction='mean'): + """returns a value in [0, 1]""" + assert reduction == 'mean', '`reduction` in dCorLoss must be `mean`.' + pairwise_dis0 = torch.norm(label[:, None] - label, p = 2, dim = 2) + pairwise_dis1 = torch.norm(pos_score[:, None] - pos_score, p = 2, dim = 2) + + center_dis_mat0 = pairwise_dis0 - pairwise_dis0.mean(dim=0)[None, :] - \ + pairwise_dis0.mean(dim=1)[:, None] + pairwise_dis0.mean() + center_dis_mat1 = pairwise_dis1 - pairwise_dis1.mean(dim=0)[None, :] - \ + pairwise_dis1.mean(dim=1)[:, None] + pairwise_dis1.mean() + + n = label.size(0) + + dcov2_01 = (center_dis_mat0 * center_dis_mat1).sum() / n**2 + dcov2_00 = (center_dis_mat0 * center_dis_mat0).sum() / n**2 + dcov2_11 = (center_dis_mat1 * center_dis_mat1).sum() / n**2 + dcor = -torch.sqrt(dcov2_01) / torch.sqrt(torch.sqrt(dcov2_00) * torch.sqrt(dcov2_11)) + return dcor \ No newline at end of file diff --git a/recstudio/model/mf/__init__.py b/recstudio/model/mf/__init__.py index c20777f..0151836 100644 --- a/recstudio/model/mf/__init__.py +++ b/recstudio/model/mf/__init__.py @@ -7,4 +7,5 @@ from .ncf import NCF from .slim import SLIM from .wrmf import WRMF -from .dssm import DSSM \ No newline at end of file +from .dssm import DSSM +from .pmf import PMF \ No newline at end of 
import torch
from recstudio.data import dataset
from recstudio.model import basemodel, scorer, loss_func


class PMF(basemodel.BaseRetriever):
    r"""Probabilistic Matrix Factorization: inner-product MF trained as rating
    regression (MSE) on observed entries only — hence no negative sampler.
    """

    @staticmethod
    def _get_dataset_class():
        # Restored @staticmethod for consistency with the other hook methods.
        return dataset.TripletDataset

    def _get_item_encoder(self, train_data):
        # NOTE(review): this body is hidden by the hunk header; an item
        # embedding table mirroring the query encoder is assumed — confirm.
        return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0)

    def _get_query_encoder(self, train_data):
        return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0)

    def _get_score_func(self):
        return scorer.InnerProductScorer()

    def _get_loss_func(self):
        return loss_func.MSELoss()

    def _get_sampler(self, train_data):
        # Rating regression on observed entries: no negatives are drawn.
        return None
import torch
from torch.utils.data import Dataset


class Popularity(torch.nn.Module):
    """Propensity model based on item popularity.

    Args:
        eta: exponent applied to the normalized popularity (power smoothing).
        truncation: lower clip for the propensities; None/0 disables clipping
            and a small eps is added instead to avoid division by zero.
        eps: additive floor used when truncation is disabled.
    """

    def __init__(self, eta=1.0, truncation=0.1, eps=1e-7):
        super().__init__()
        self.eta = eta
        # FIX: attribute was misspelled `trucation`.
        self.truncation = truncation
        self.eps = eps

    def fit(self, train_data: Dataset):
        """Estimate per-item propensities from `train_data.item_freq`.

        Laplace-smoothed frequencies are min-max normalized to [0, 1], raised
        to `eta`, then floored (truncation or eps).
        """
        freq = train_data.item_freq
        pop = (freq + 1) / (torch.sum(freq) + train_data.num_items)
        pop = (pop - pop.min()) / (pop.max() - pop.min())
        pop = pop ** self.eta
        if self.truncation is None or self.truncation == 0:
            pop = pop + self.eps
        else:
            # Elementwise floor at `truncation` (same as the old vstack+max).
            pop = torch.clamp(pop, min=self.truncation)
        self.register_buffer('pop', pop)

    def forward(self, batch):
        """batch (torch.Tensor): item ids -> their propensities."""
        return self.pop[batch]


class FromCoatFile(torch.nn.Module):
    """Propensities read from the file shipped with the Coat dataset.

    Args:
        prop (torch.Tensor): a U x I propensity matrix.
    """

    def __init__(self, prop):
        super().__init__()
        self.prop = prop

    def fit(self, train_data):
        # Only the field names are needed to index the matrix later.
        self.fuid = train_data.fuid
        self.fiid = train_data.fiid

    def forward(self, batch):
        """batch (dict): maps field names to id tensors."""
        return self.prop[batch[self.fuid], batch[self.fiid]]


class NaiveBayes(torch.nn.Module):
    """Propensity via naive Bayes: P(o=1 | y) = P(y | o=1) * P(o=1) / P(y).

    Args:
        train_data: missing-not-at-random data (trains recommender + propensity).
        unif_data: missing-completely-at-random data (estimates P(y)).
    """

    def fit(self, train_data: Dataset, unif_data: Dataset):
        # NOTE(review): `inter_feat.get_col` is assumed subscriptable, as in
        # the original code — confirm against the Dataset implementation.
        y, y_cnt_given_o = torch.unique(
            train_data.inter_feat.get_col[train_data.frating], return_counts=True)
        p_y_given_o = y_cnt_given_o / torch.sum(y_cnt_given_o)
        # FIX: P(o=1) = #interactions / (#users * #items); the original
        # `a / b * c` divided first and then multiplied.
        p_o = train_data.num_inters / (train_data.num_users * train_data.num_items)

        y_unif, y_unif_cnt = torch.unique(
            unif_data.inter_feat.get_col[unif_data.frating], return_counts=True)
        p_y = y_unif_cnt / torch.sum(y_unif_cnt)
        y_unif = y_unif.tolist()

        # FIX: register_buffer only accepts tensors; a plain dict attribute is
        # the right container for the per-rating propensities.
        self.y_dict = {k: v * p_o / p_y[y_unif.index(k)]
                       for k, v in zip(y.tolist(), p_y_given_o)}

    def forward(self, batch):
        """batch (torch.Tensor): ratings -> propensity per entry."""
        p = torch.zeros_like(batch)
        for i, y in enumerate(batch):
            # FIX: .item() — a 0-dim tensor never equals the float dict keys.
            p[i] = self.y_dict[y.item()]
        return p
class CosineScorer(InnerProductScorer):
    """Cosine-similarity scorer with an optional 'mean'/'sum' reduction
    applied to the resulting score tensor."""

    def __init__(self, reduction=None):
        super().__init__()
        self.reduction = reduction

    def forward(self, query, items):
        dot = super().forward(query, items)
        # keepdim only when query and items are not aligned one-to-one.
        keep = (query.dim() != items.dim()) or (query.size(0) != items.size(0))
        cos = dot / torch.norm(items, dim=-1)
        cos = cos / torch.norm(query, dim=-1, keepdim=keep)
        if self.reduction == 'mean':
            return cos.mean()
        if self.reduction == 'sum':
            return cos.sum()
        return cos