import gzip
import sqlite3

# Title types for which names and year are stored; every other type is kept
# in the table with NULL original_name / ru_name / year.
_NAMED_TYPES = ("movie", "video")

# Module-level connection and cursor shared by every function in this file.
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
    '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''')
# Bulk-load tuning: with synchronous OFF SQLite does not wait for data to
# reach the disk, which speeds up the import at the cost of crash safety.
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()


def _parse_title_id(tconst):
    """Return the integer after the ``tt`` prefix of an IMDb id.

    Raises:
        IndexError: if *tconst* contains no ``tt`` separator.
        ValueError: if the part after ``tt`` is not an integer.
    """
    return int(tconst.split("tt")[1])


def _parse_basics_line(line):
    """Turn one tab-separated line of the basics dataset into a table row.

    Returns:
        A ``(tt_id, type, original_name, ru_name, year)`` tuple. ``ru_name``
        is always ``None`` here; ``original_name`` and ``year`` are ``None``
        for types outside ``_NAMED_TYPES``.

    Raises:
        IndexError: if the line has too few fields or no ``tt`` prefix.
        ValueError: if the id or the year is not numeric.
    """
    fields = line.split("\t")
    tt_id = _parse_title_id(fields[0])
    tt_type = fields[1]
    original_name = fields[3]
    year_raw = fields[5]
    # "\N" is the dataset's marker for a missing value.
    year = None if year_raw.startswith(r"\N") else int(year_raw)
    if tt_type not in _NAMED_TYPES:
        original_name = None
        year = None
    return (tt_id, tt_type, original_name, None, year)


def _insert_titles(rows):
    """Insert (or replace, keyed by the unique tt_id) a batch of title rows."""
    c.executemany(
        "INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)",
        rows,
    )


def convert_tsv_to_db(title_basics_tsv, chunk=1000):
    """Load the gzipped basics TSV into the ``titles`` table.

    The original author notes that this takes a long time (~5 minutes) on the
    full dataset.

    Args:
        title_basics_tsv: path to the gzip-compressed, tab-separated file.
        chunk: number of parsed rows buffered before each batch insert.

    Lines that cannot be parsed (too few fields, non-numeric id or year,
    first field without a ``tt`` prefix) are reported with ``print`` and
    skipped. Database errors are not swallowed and propagate to the caller.
    """
    pending = []
    progress_counter = 0
    with gzip.open(title_basics_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            try:
                pending.append(_parse_basics_line(line))
            except (IndexError, ValueError) as E:
                print(E)
                continue
            if len(pending) >= chunk:
                _insert_titles(pending)
                progress_counter += len(pending)
                pending = []
                print(f'Обработано: {progress_counter}')
    # Flush the final partial chunk; without this the last rows of the file
    # (or the whole file, if shorter than one chunk) would never be written.
    if pending:
        _insert_titles(pending)
        progress_counter += len(pending)
        print(f'Обработано: {progress_counter}')
    conn.commit()


def extract_ru_locale_from_tsv(title_akas_tsv):
    """Read localized titles and store the ``RU`` ones in ``titles.ru_name``.

    Args:
        title_akas_tsv: path to the gzip-compressed, tab-separated akas file.

    Only lines whose region field equals ``"RU"`` are considered, and only
    for titles already present in the table with a type in ``_NAMED_TYPES``.
    All updates are applied in one batch after the file has been read, so if
    a title has several RU lines the last one wins. Unparseable lines are
    reported with ``print`` and skipped.
    """
    ru_name_writer = []
    with gzip.open(title_akas_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split("\t")
            try:
                if fields[3] != "RU":
                    continue
                tt_id = _parse_title_id(fields[0])
                ru_name = fields[2]
            except (IndexError, ValueError) as E:
                print(E)
                continue
            found = c.execute(
                "SELECT type FROM titles WHERE tt_id = ?", (tt_id,)
            ).fetchone()
            # Skip titles that were never imported as well as non-film types.
            if found is None or found[0] not in _NAMED_TYPES:
                continue
            ru_name_writer.append((ru_name, tt_id))
            print(f'Обработано ru_name: {len(ru_name_writer)}')
    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
    conn.commit()


def convert_datasets_to_db():
    """Run the full import from the two dataset files in the working directory."""
    print("Converting tsv dataset to sqlite...")
    convert_tsv_to_db("title.basics.tsv.gz")
    print("Unpack ru locale...")
    extract_ru_locale_from_tsv("title.akas.tsv.gz")


def get_title_by_id(films_ids=()):
    """Look up rows of ``titles`` by numeric id.

    Args:
        films_ids: iterable of numeric title ids (without the ``tt`` prefix).

    Returns:
        A list with one entry per requested id, in the same order: the full
        table row as a tuple, or ``None`` when the id is not in the table.
    """
    # Bound parameters keep caller-supplied values out of the SQL text.
    return [
        c.execute("SELECT * FROM titles WHERE tt_id = ?", (film_id,)).fetchone()
        for film_id in films_ids
    ]