From 09b31528f93cdf1d8656676f9e94d93138755db4 Mon Sep 17 00:00:00 2001 From: localhost_frssoft Date: Sat, 3 Sep 2022 01:51:59 +0300 Subject: [PATCH] More fast convertation datasets to sqlite --- src/imdb_datasets_worker.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/imdb_datasets_worker.py b/src/imdb_datasets_worker.py index 678c8d6..f73879a 100644 --- a/src/imdb_datasets_worker.py +++ b/src/imdb_datasets_worker.py @@ -12,8 +12,10 @@ conn.commit() def convert_tsv_to_db(title_basics_tsv): - '''Конвертирование основного датасета в sqlite базу, выполняется весьма долго (5-10 минут)''' + '''Конвертирование основного датасета в sqlite базу, выполняется долго (~5 минут)''' with gzip.open(title_basics_tsv, mode='rt') as file: + write_dataset = [] + counter = 0 for line in file: line = line.split("\t") try: @@ -33,16 +35,20 @@ def convert_tsv_to_db(title_basics_tsv): year = None else: year = int(year) - c.execute("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", - (tt_id, tt_type, original_name, ru_name, year)) + write_dataset.append((tt_id, tt_type, original_name, ru_name, year)) + counter += 1 + if counter >= 1000: + c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset) + write_dataset = [] except Exception as E: print(E) pass - conn.commit() + conn.commit() def extract_ru_locale_from_tsv(title_akas_tsv): '''Конвертирование датасета с локализованными названиями и последующее добавление в базу''' with gzip.open(title_akas_tsv, mode='rt') as file: + ru_name_writer = [] for line in file: line = line.split("\t") try: @@ -56,11 +62,13 @@ def extract_ru_locale_from_tsv(title_akas_tsv): continue ru_name = line[2] print(ru_name, tt_type) - c.execute("UPDATE titles SET ru_name = ? WHERE tt_id = ?", (ru_name, tt_id)) + ru_name_writer.append((ru_name, tt_id)) except Exception as E: print(E) pass + + c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer) conn.commit() def convert_datasets_to_db():