Fix datasets converter (year partially broken)

This commit is contained in:
localhost_frssoft 2022-09-03 18:35:26 +03:00
parent e10b6da0e0
commit 6d30acedd0

View File

@ -16,6 +16,8 @@ def convert_tsv_to_db(title_basics_tsv):
with gzip.open(title_basics_tsv, mode='rt') as file: with gzip.open(title_basics_tsv, mode='rt') as file:
write_dataset = [] write_dataset = []
counter = 0 counter = 0
chunk = 1000
progress_counter = 0
for line in file: for line in file:
line = line.split("\t") line = line.split("\t")
try: try:
@ -24,23 +26,23 @@ def convert_tsv_to_db(title_basics_tsv):
original_name = line[3] original_name = line[3]
ru_name = None ru_name = None
year = line[5] year = line[5]
if year.startswith(r"\N"):
if tt_type not in ("movie", "video"):
original_name = None
year = "\\N"
else:
print(tt_id, tt_type, original_name, ru_name, year)
if year == "\\N":
year = None year = None
else: else:
year = int(year) year = int(year)
if tt_type not in ("movie", "video"):
original_name = None
year = None
write_dataset.append((tt_id, tt_type, original_name, ru_name, year)) write_dataset.append((tt_id, tt_type, original_name, ru_name, year))
counter += 1 counter += 1
if counter >= 1000: if counter >= chunk:
c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset) c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
write_dataset = [] write_dataset = []
counter = 0 counter = 0
progress_counter += chunk
print(f'Обработано: {progress_counter}')
except Exception as E: except Exception as E:
print(E) print(E)
pass pass
@ -50,6 +52,7 @@ def extract_ru_locale_from_tsv(title_akas_tsv):
'''Конвертирование датасета с локализованными названиями и последующее добавление в базу''' '''Конвертирование датасета с локализованными названиями и последующее добавление в базу'''
with gzip.open(title_akas_tsv, mode='rt') as file: with gzip.open(title_akas_tsv, mode='rt') as file:
ru_name_writer = [] ru_name_writer = []
counter = 0
for line in file: for line in file:
line = line.split("\t") line = line.split("\t")
try: try:
@ -62,8 +65,9 @@ def extract_ru_locale_from_tsv(title_akas_tsv):
if tt_type not in ("movie", "video"): if tt_type not in ("movie", "video"):
continue continue
ru_name = line[2] ru_name = line[2]
print(ru_name, tt_type)
ru_name_writer.append((ru_name, tt_id)) ru_name_writer.append((ru_name, tt_id))
counter += 1
print(f'Обработано ru_name: {counter}')
except Exception as E: except Exception as E:
print(E) print(E)