Source code for rootski.services.database.non_orm.db_service

# TODO use regex to make sure query strings are safe from SQL injection
from typing import Any, Dict, List

import pandas as pd
from loguru import logger
from sqlalchemy.engine import Engine
from sqlalchemy.orm import Session

from rootski.services.database.models.models import (
    Breakdown,
    BreakdownItem,
    Morpheme,
    MorphemeFamily,
    MorphemeFamilyMeaning,
    Word,
)
from rootski.services.database.non_orm import sql_statements
from rootski.services.database.non_orm.utils import collapse_df


[docs]class RootskiDBService: def __init__(self, engine: Engine): """ Args: db_type: one of ["postgres", "sqlite"]. Will format queries accordingly. """ self._engine = engine
[docs] def run_query(self, query, as_df=False, **kwargs): """ Args: query (str): pre-validated SQL query ready to run against db as_df (bool): return dataframe object if true, otherwise, rows as dictionaries kwargs : parameters to be forwarded to pandas.read_sql_query Returns: list[dict]: SQL result set (if not as_df) OR pd.DataFrame: SQL result set (if as_df) """ try: logger.debug(f"Running query: {query}") # run query and parse results to dataframe result_set = pd.read_sql_query(query, con=self._engine, **kwargs) # if desired, convert dataframe rows to list of dicts if not as_df: result_set = result_set.to_dict(orient="records") logger.debug(f"Fetched results: {str(result_set)}") return result_set except Exception as e: logger.warning(f"Query failed with exception: {str(e)}") return []
def query_word_by_id(self, word_id): # TODO: decide how to handle errors for example when the given word_id does not exist query = sql_statements.WORD_BY_ID.format(word_id=word_id) result_set = self.run_query(query) if len(result_set) > 0: return result_set[0] return None def search_words(self, search_key, limit=100): query = sql_statements.SEARCH_WORDS.format(search_key=search_key, limit=limit) return self.run_query(query) def search_morphemes(self, search_key, limit=100): query = sql_statements.SEARCH_MORPHEMES.format(search_key=search_key, limit=limit) return self.run_query(query)
[docs] def query_definitions(self, word_id): """ Returns .. code:: text [ { "word_type": str, "definitions": [ { "definition_id": int, "def_position": int, # the definition objs are sorted by def_position asc "sub_defs": [ { "sub_def_id": int, "sub_def_position": int, # sub_def objs are sorted by sub_def_position asc "definition": str, "notes": str | None }, ... ] }, ... ] }, ... ] """ query = sql_statements.DEFINITIONS.format(word_id=word_id) result_set = self.run_query(query, as_df=True) if len(result_set) == 0: return [] # nest the sub definitions under the definitions result_set = collapse_df( result_set, groupby_col="definition_id", group_cols=["definition_id", "def_position", "pos"], child_cols=["sub_def_id", "sub_def_position", "definition", "notes"], child_name="sub_defs", grp_sort_col="def_position", ch_sort_col="sub_def_position", ) logger.debug("Result set for definitions" + str(result_set)) # nest the definitions under the word types result_set = pd.DataFrame(result_set) result_set = collapse_df( result_set, groupby_col="pos", group_cols=["pos"], child_cols=["def_position", "definition_id", "sub_defs"], child_name="definitions", ) logger.debug(str(result_set)) # NOTE: this is a hack, rather than play with sql queries, we are going # to manually de-duplicate the subdefinitions here def get_deduped_sub_defs(sub_defs: List[Dict[str, Any]]): deduped_sub_defs = [] for sub_def in sub_defs: if len(deduped_sub_defs) == 0: deduped_sub_defs.append(sub_def) else: # if the subdef is already in the list, don't add it if sub_def["sub_def_id"] in [sub_def["sub_def_id"] for sub_def in deduped_sub_defs]: continue deduped_sub_defs.append(sub_def) return deduped_sub_defs def get_result_item_with_deduped_sub_defs(result_item: Dict[str, Any]): to_return = result_item.copy() to_return["definitions"] = [ { **definition, "sub_defs": get_deduped_sub_defs(definition["sub_defs"]), } for definition in result_item["definitions"] ] return to_return deduped_result_set = [get_result_item_with_deduped_sub_defs(definition) for definition in result_set] # END HACK :) return deduped_result_set
[docs] def query_morpheme_breakdown(self, word_id): """ Returns a breakdown of the following form .. code:: text [ { 'morpheme': 'год', 'morpheme_id': 294.0, 'level': 3.0, 'position': 0, # morphemes sorted by position asc 'type': 'root', 'family_id': 134, 'family': 'гож,год,гожд', 'meanings': [ { 'meaning': 'pleasing:1' }, ... ] }, ... ] """ # query = sql_statements.MORPHEME_BREAKDOWN.format(word_id) # result_set_df = self.run_query(query, as_df=True) # result_set = collapse_df(result_set_df, # groupby_col="position", # group_cols=["morpheme", "morpheme_id", "level", "position", "type", "family_id", "family"], # child_cols=["meaning"], # child_name="meanings", grp_sort_col="position") # return result_set # query sqlalchemy table for Breakdown where id is morpheme_id with Session(self._engine) as session: to_return = [] breakdown: Breakdown = session.query(Breakdown).where(Breakdown.word_id == word_id).limit(1).one() for breakdown_item in breakdown.breakdown_items: breakdown_item: BreakdownItem breakdown_item_fields = { "morpheme_id": breakdown_item.morpheme_id, "breakdown_id": breakdown_item.breakdown_id, "morpheme": breakdown_item.morpheme, "type": breakdown_item.type, "position": breakdown_item.position, "family_id": None, "family": None, "meanings": [], } if breakdown_item.morpheme_id: family: MorphemeFamily = breakdown_item.morpheme_.family breakdown_item_fields.update( { "family_id": family.id, "family": family.family, } ) if breakdown_item.morpheme_ and breakdown_item.morpheme_.family: family: MorphemeFamily = breakdown_item.morpheme_.family meanings = [{"meaning": m.meaning} for m in family.meanings] breakdown_item_fields.update({"meanings": meanings}) to_return.append(breakdown_item_fields) return to_return
def query_adjective_forms(self, word_id): query = sql_statements.ADJECTIVE_FORMS.format(word_id) return self.run_query(query) def query_verb_conjugations(self, word_id): query = sql_statements.VERB_CONJUGATIONS.format(word_id) return self.run_query(query) def query_aspectual_pairs(self, word_id): query = sql_statements.ASPECTUAL_PAIRS.format(word_id) return self.run_query(query) def query_noun_declensions(self, word_id): query = sql_statements.NOUN_DECLENSIONS.format(word_id) return self.run_query(query) def query_example_sentences(self, word_id): query = sql_statements.EXAMPLE_SENTENCES.format(word_id) return self.run_query(query)
[docs] def fetch_word_data(self, word_id, main_word_type): """ 1. Breakdown 2. Definitions 3. Example Sentences 4. POS specific results i.e. verb, noun, etc. 5. The word itself Args: word_id (int): id of a word in the "words" table main_word_type (str): one of ["noun", "adjective", "verb", "particle", "adverb", "preposition", "pronoun"] Returns: dict: payload-like object of all data to display on word page """ data = { "word": self.query_word_by_id(word_id), # "breakdown": self.query_morpheme_breakdown(word_id), "definitions": self.query_definitions(word_id), "sentences": self.query_example_sentences(word_id), } pos_specific_data = dict() if main_word_type == "noun": pos_specific_data = self.fetch_noun_data(word_id) elif main_word_type == "verb": pos_specific_data = self.fetch_verb_data(word_id) elif main_word_type == "adjective": pos_specific_data = self.fetch_adjective_data(word_id) data.update(pos_specific_data) return data
[docs] def fetch_adjective_data(self, word_id): """ 1. Adjective short forms """ short_forms = self.query_adjective_forms(word_id) if len(short_forms) > 0: short_forms = short_forms[0] else: short_forms = None return {"short_forms": short_forms}
[docs] def fetch_noun_data(self, word_id): """ 1. Declensions """ declensions = self.query_noun_declensions(word_id) if len(declensions) > 0: declensions = declensions[0] else: declensions = None return {"declensions": declensions}
[docs] def fetch_verb_data(self, word_id): """ 1. Conjugations 2. Aspectual Pairs """ conjugations = self.query_verb_conjugations(word_id) if len(conjugations) > 0: conjugations = conjugations[0] else: conjugations = None return { "conjugations": conjugations, "aspectual_pairs": self.query_aspectual_pairs(word_id), }