論文の被引用数予測

研究価値を最大化させるワーディングとは

賞金: 100,000 参加ユーザー数: 141 終了まで: 23日

LightGBM Base line (CV: 0.560154 / LB 0.564024)

gensimでベクトル化した'abstract'と'doi_cites'を特徴量とする LightGBM のベースラインコードを共有いたします。ご活用いただければと思います。

# ------------------------------------------------------------------------------
# ProbSpace: 論文の被引用数予測
# ------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import json
import os

from pathlib import Path
from tqdm import tqdm

import gensim
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# ------------------------------------------------------------------------------
# Start-up
# ------------------------------------------------------------------------------
NFOLDS = 5  # number of cross-validation folds
SEED = 42  # seed shared by set_seed(), the KFold split and LightGBM


def set_seed(seed=42):
    """Seed the Python and NumPy global random number generators.

    Also writes ``seed`` to the ``PYTHONHASHSEED`` environment variable.
    Note that this variable is read at interpreter start-up, so setting it
    here only influences interpreters launched afterwards (child processes).

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``.
    """
    # Imported locally because the module-level imports do not provide
    # `random`; without this the call below fails with NameError.
    import random

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)


set_seed(SEED)

# ------------------------------------------------------------------------------
# File and model path definition
# ------------------------------------------------------------------------------
# Root directories, given relative to the directory the script is run from.
DATA_PATH = Path("../../input/Predicting_the_number_of_citations_to_a_paper/")
FEATURE_PATH = Path("../../features/")
MODEL_PATH = Path("../../models/")

# Competition inputs; both are read one JSON document per line.
train_file = DATA_PATH.joinpath("train_data.json")
test_file = DATA_PATH.joinpath("test_data.json")

# ------------------------------------------------------------------------------
# Define gensim model
# ------------------------------------------------------------------------------
# Word vectors stored in the binary word2vec format under MODEL_PATH.
word2vec_file = MODEL_PATH / "GoogleNews-vectors-negative300.bin"
emb_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=True)


# ------------------------------------------------------------------------------
# Define data iterator
# ------------------------------------------------------------------------------
def get_data_iter(file_path):
    """Lazily yield the raw lines of a text file, one at a time.

    The file is opened on the first iteration and closed once the generator
    is exhausted (or garbage-collected), so large files are never held in
    memory as a whole.

    Args:
        file_path: Path of the file to read.

    Yields:
        Each line as a ``str``, including its trailing newline. Decoding the
        line (e.g. with ``json.loads``) is left to the caller.
    """
    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        yield from f


# ------------------------------------------------------------------------------
# Read train and test data
# ------------------------------------------------------------------------------
def _embed_abstract(abstract):
    """Return the mean word vector of the in-vocabulary words of *abstract*.

    The text is split on single spaces and words that are not in
    ``emb_model`` are skipped. When no word is in the vocabulary, a NaN
    vector of the embedding width is returned instead of the scalar NaN that
    ``np.mean`` produces for an empty list, so every row keeps the same
    length and the rows can still be stacked into a 2-D array.
    """
    vectors = [emb_model[w] for w in abstract.split(' ') if w in emb_model]
    if not vectors:
        # NOTE(review): float32 assumes the vectors were loaded with gensim's
        # default dtype -- confirm if the loader is ever given `datatype=`.
        return np.full(emb_model.vector_size, np.nan, dtype=np.float32)
    return np.mean(vectors, axis=0)


train = []       # one abstract embedding per labelled record
train_feat = []  # [log1p(doi_cites)] per labelled record
target = []      # raw 'cites' count per labelled record
train_iter = get_data_iter(train_file)
for line in tqdm(train_iter, desc="train", total=851_524):
    data = json.loads(line)
    # Skip records that have no 'cites' key: there is no target to learn from.
    if 'cites' not in data:
        continue
    train.append(_embed_abstract(data['abstract']))
    train_feat.append([np.log1p(int(data['doi_cites']))])
    target.append(int(data['cites']))

test = []        # one abstract embedding per test record
test_feat = []   # [log1p(doi_cites)] per test record
test_index = []  # record ids, in file order, for the submission file
test_iter = get_data_iter(test_file)
for line in tqdm(test_iter, desc="test", total=59_084):
    data = json.loads(line)
    test.append(_embed_abstract(data['abstract']))
    test_feat.append([np.log1p(int(data['doi_cites']))])
    test_index.append(data['id'])

# Feature matrix = embedding columns followed by the log1p(doi_cites) column.
train = np.concatenate([np.array(train), np.array(train_feat)], axis=1)
# The model is trained on log1p(cites).
target = np.array(np.log1p(target))
test = np.concatenate([np.array(test), np.array(test_feat)], axis=1)

# ------------------------------------------------------------------------------
# Check the data
# ------------------------------------------------------------------------------
# Sanity check: print the shape of the train matrix, the target and the test matrix.
for _arr in (train, target, test):
    print(_arr.shape)

# --------------------------------------
# LightGBM hyperparameters
# --------------------------------------
# Keyword arguments forwarded to lgb.LGBMRegressor.
lgb_params = dict(
    objective='root_mean_squared_error',
    boosting_type='gbdt',
    n_estimators=50000,     # upper bound on boosting rounds; training uses early stopping
    learning_rate=0.001,
    num_leaves=64,
    min_child_samples=30,
    colsample_bytree=0.5,   # fraction of features sampled per tree
    subsample=0.5,          # fraction of rows sampled when bagging
    subsample_freq=3,       # re-sample the rows every 3 iterations
    reg_alpha=8,            # L1 regularization
    reg_lambda=2,           # L2 regularization
    random_state=SEED,
)

# --------------------------------------
# Training and prediction
# --------------------------------------
# K-fold cross-validation: every fold trains a fresh model, fills in its
# out-of-fold predictions and contributes 1/NFOLDS of the test prediction.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
lgb_oof = np.zeros(train.shape[0])  # out-of-fold predictions (log1p scale)
lgb_pred = 0                        # running mean of the per-fold test predictions

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)):
    X_train, y_train = train[trn_idx], target[trn_idx]
    X_valid, y_valid = train[val_idx], target[val_idx]
    X_test = test

    # LightGBM
    model = lgb.LGBMRegressor(**lgb_params)
    # Early stopping and log silencing are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` arguments of fit() were deprecated
    # in LightGBM 3.3 and removed in 4.0. Requires LightGBM >= 3.3.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='rmse',
              callbacks=[
                  lgb.early_stopping(stopping_rounds=500, verbose=False),
                  lgb.log_evaluation(period=0),
              ],
              )

    lgb_oof[val_idx] = model.predict(X_valid)
    lgb_pred += model.predict(X_test) / NFOLDS
    # The target is log1p(cites), so RMSE here is RMSLE on the raw counts.
    # np.sqrt(MSE) is used because `squared=False` was removed from
    # mean_squared_error in recent scikit-learn releases.
    rmsle = np.sqrt(mean_squared_error(y_valid, lgb_oof[val_idx]))
    print(f"fold {fold} lgb score: {rmsle}")

# Overall CV score from the out-of-fold predictions of all folds.
rmsle = np.sqrt(mean_squared_error(target, lgb_oof))
print("+-" * 40)
print(f"score: {rmsle}")
# NOTE: `model` is the estimator of the last fold only, and it is scored on
# the full training matrix, most of which it was trained on.
print(f"model score: {model.score(train, target)}")

# ------------------------------------------------------------------------------
# submit the results
# ------------------------------------------------------------------------------
# Undo the log1p transform that was applied to the training target.
test_predicted = np.expm1(lgb_pred)

# Negative predictions are replaced by 0 before writing the submission.
submit_df = pd.DataFrame({
    'id': test_index,
    'cites': np.where(test_predicted < 0, 0, test_predicted),
})
submit_df.to_csv("submission.csv", index=False)

Favicon
new user
コメントするには 新規登録 もしくは ログイン が必要です。