2022-08-31 13:20:49 +03:00
|
|
|
|
import sqlite3
|
|
|
|
|
import gzip
|
|
|
|
|
|
|
|
|
|
# Module-wide database handle and cursor used by the functions in this file.
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()

# Leaving the ``with`` block commits the connection, which takes the place
# of a trailing explicit ``conn.commit()``.
with conn:
    # One row per IMDb title; ``tt_id`` is the numeric part of the identifier.
    c.execute(
        '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)'''
    )
    # Do not wait for data to reach the disk on every write.
    c.execute("PRAGMA synchronous = OFF")
    c.execute("PRAGMA optimize")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_tsv_to_db(title_basics_tsv, chunk=1000):
    """Load the IMDb ``title.basics`` dataset into the ``titles`` table.

    The gzip-compressed TSV is read line by line and rows are written in
    batches through the module-level cursor ``c``; the transaction is
    committed once at the end on ``conn``. The original author notes that
    a full import is slow (about 5 minutes).

    For every row the numeric part of the ``tt`` identifier and the title
    type are stored. The original name and the year are kept only for
    ``movie`` and ``video`` titles, and ``ru_name`` is always written as
    NULL. Because the statement is ``INSERT OR REPLACE``, importing the
    same id again overwrites the existing row.

    Args:
        title_basics_tsv: Path to the gzip-compressed TSV file.
        chunk: Number of rows buffered before each ``executemany`` call.

    Raises:
        OSError: If the file cannot be opened or read.
        sqlite3.Error: If a batch cannot be written to the database.
    """
    insert_sql = "INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)"
    write_dataset = []
    progress_counter = 0

    # The encoding is given explicitly so that decoding does not depend on
    # the locale of the machine running the import.
    with gzip.open(title_basics_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split("\t")
            try:
                tt_id = int(fields[0].split("tt")[1])
                tt_type = fields[1]
                original_name = fields[3]
                year = fields[5]
                # ``\N`` marks a missing value in the dataset.
                year = None if year.startswith(r"\N") else int(year)
            except (IndexError, ValueError) as error:
                # The header row and malformed lines are reported and skipped.
                print(error)
                continue

            if tt_type not in ("movie", "video"):
                original_name = None
                year = None

            write_dataset.append((tt_id, tt_type, original_name, None, year))
            if len(write_dataset) >= chunk:
                c.executemany(insert_sql, write_dataset)
                progress_counter += len(write_dataset)
                write_dataset = []
                print(f'Обработано: {progress_counter}')

    # Write the rows left over after the last full batch; without this the
    # tail of the file would never reach the database.
    if write_dataset:
        c.executemany(insert_sql, write_dataset)
        progress_counter += len(write_dataset)
        print(f'Обработано: {progress_counter}')

    conn.commit()
|
2022-08-31 13:20:49 +03:00
|
|
|
|
|
|
|
|
|
def extract_ru_locale_from_tsv(title_akas_tsv):
    """Fill ``titles.ru_name`` from the IMDb ``title.akas`` dataset.

    The gzip-compressed TSV is scanned for rows whose region column is
    ``RU``. For each such row the title type is looked up in the
    ``titles`` table through the module-level cursor ``c``; only titles
    already stored with type ``movie`` or ``video`` receive the localized
    name. All updates are applied in a single ``executemany`` after the
    file has been read and are then committed on ``conn``.

    Args:
        title_akas_tsv: Path to the gzip-compressed TSV file.

    Raises:
        OSError: If the file cannot be opened or read.
        sqlite3.Error: If a lookup or the final update fails.
    """
    ru_name_writer = []

    # The localized titles are non-ASCII, so the encoding is given
    # explicitly rather than left to the platform default.
    with gzip.open(title_akas_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split("\t")
            try:
                if fields[3] != "RU":
                    continue
                tt_id = int(fields[0].split("tt")[1])
            except (IndexError, ValueError) as error:
                # Malformed lines are reported and skipped.
                print(error)
                continue

            row = c.execute(
                "SELECT type FROM titles WHERE tt_id = ?", (tt_id,)
            ).fetchone()
            # Titles absent from the table, or of another type, get no name.
            if row is None or row[0] not in ("movie", "video"):
                continue

            ru_name_writer.append((fields[2], tt_id))
            print(f'Обработано ru_name: {len(ru_name_writer)}')

    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
    conn.commit()
|
|
|
|
|
|
|
|
|
|
def convert_datasets_to_db():
    """Run the whole import: the basics dataset first, then the RU names.

    Both archives are read from the current working directory under
    their fixed file names.
    """
    # (progress message, loader, archive) in the order they must run:
    # the localized names are matched against rows the first step inserts.
    steps = (
        ("Converting tsv dataset to sqlite...", convert_tsv_to_db, "title.basics.tsv.gz"),
        ("Unpack ru locale...", extract_ru_locale_from_tsv, "title.akas.tsv.gz"),
    )
    for message, loader, archive in steps:
        print(message)
        loader(archive)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_title_by_id(films_ids=()):
    """Fetch rows of the ``titles`` table for the given numeric ids.

    Args:
        films_ids: Iterable of ``tt_id`` values to look up. Defaults to
            an empty sequence; ``None`` is treated the same way.

    Returns:
        A list with one entry per requested id, in input order. Each
        entry is the full row tuple
        ``(tt_id, type, original_name, ru_name, year)``, or ``None``
        when no row has that id.
    """
    if films_ids is None:
        return []

    # The id is bound as a parameter rather than formatted into the SQL
    # text, so a caller-supplied value cannot alter the statement.
    query = "SELECT * FROM titles WHERE tt_id = ?"
    return [c.execute(query, (film_id,)).fetchone() for film_id in films_ids]
|
|
|
|
|
|