FMN_bot/src/imdb_datasets_worker.py

93 lines
3.3 KiB
Python
Raw Normal View History

2022-08-31 13:20:49 +03:00
import sqlite3
import gzip
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
'''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''')
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()
def convert_tsv_to_db(title_basics_tsv):
'''Конвертирование основного датасета в sqlite базу, выполняется долго (~5 минут)'''
2022-08-31 13:20:49 +03:00
with gzip.open(title_basics_tsv, mode='rt') as file:
write_dataset = []
counter = 0
chunk = 1000
progress_counter = 0
2022-08-31 13:20:49 +03:00
for line in file:
line = line.split("\t")
try:
tt_id = int(line[0].split("tt")[1])
tt_type = line[1]
original_name = line[3]
ru_name = None
year = line[5]
if year.startswith(r"\N"):
2022-08-31 13:20:49 +03:00
year = None
else:
year = int(year)
if tt_type not in ("movie", "video"):
original_name = None
year = None
write_dataset.append((tt_id, tt_type, original_name, ru_name, year))
counter += 1
if counter >= chunk:
c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
write_dataset = []
counter = 0
progress_counter += chunk
print(f'Обработано: {progress_counter}')
2022-08-31 13:20:49 +03:00
except Exception as E:
print(E)
pass
conn.commit()
2022-08-31 13:20:49 +03:00
def extract_ru_locale_from_tsv(title_akas_tsv):
'''Конвертирование датасета с локализованными названиями и последующее добавление в базу'''
with gzip.open(title_akas_tsv, mode='rt') as file:
ru_name_writer = []
counter = 0
2022-08-31 13:20:49 +03:00
for line in file:
line = line.split("\t")
try:
tt_region = line[3]
if tt_region != "RU":
continue
tt_id = int(line[0].split("tt")[1])
tt_type = c.execute(f"SELECT type FROM titles WHERE tt_id={tt_id}").fetchone()[0]
if tt_type not in ("movie", "video"):
continue
ru_name = line[2]
ru_name_writer.append((ru_name, tt_id))
counter += 1
print(f'Обработано ru_name: {counter}')
2022-08-31 13:20:49 +03:00
except Exception as E:
print(E)
pass
c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
2022-08-31 13:20:49 +03:00
conn.commit()
def convert_datasets_to_db():
print("Converting tsv dataset to sqlite...")
convert_tsv_to_db("title.basics.tsv.gz")
print("Unpack ru locale...")
extract_ru_locale_from_tsv("title.akas.tsv.gz")
def get_title_by_id(films_ids=list):
tt_list = []
for i in films_ids:
tt_film = c.execute(f"SELECT * FROM titles WHERE tt_id={i}").fetchone()
tt_list.append(tt_film)
return tt_list