"""
TODO
(1) validate that this DDL works with sqlite
(2) validate that this DDL works with postgres
(3) use it on a postgres database
(4) adjust the `gather_data.py` script to load data into tables already created using this DDL
(5) run some queries in the populated database to make sure it all works right
"""
from sqlalchemy import (
TIMESTAMP,
Column,
DateTime,
Enum,
ForeignKey,
PrimaryKeyConstraint,
String,
Text,
UniqueConstraint,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.schema import CreateTable
from sqlalchemy.sql import expression, func
from sqlalchemy.types import Boolean, Integer
# Base object which will contain all proto table declarations
Base = declarative_base()
[docs]class GenericToString:
def __repr__(self):
return str(self.__dict__)
def __str__(self):
return self.__repr__()
# re-used types
POS_ENUM = Enum(
"preposition",
"interjection",
"participle",
"particle",
"adjective",
"pronoun",
"verb",
"noun",
"conjunction",
"adverb",
name="parts_of_speech",
)
MORPHEME_TYPE_ENUM = Enum("prefix", "suffix", "root", "link", name="morpheme_types")
[docs]def print_ddl(table_class, connection_or_engine):
"""Print the CREATE TABLE DDL for the given table class
Args:
table_class: class inheriting from Base
connection_or_engine: connection/engine
"""
print(CreateTable(table_class.__table__).compile(connection_or_engine))
#################
# --- Users --- #
#################
[docs]class User(Base, GenericToString):
__tablename__ = "users"
# WARNING! We are using the user email as the primary key.
# This is considered to be bad practice. We may change this
# in the future, but for now I'm more concerned about protecting
# users from losing their data if we delete/lose the existing
# cognito user pool for any reason. By using the email, if the
# cognito user pool gets deleted, users don't lose their data
# as long as they re-register with the same email as before.
email = Column(String(512), primary_key=True, nullable=True)
is_admin = Column(Boolean, nullable=False, server_default=expression.false())
# one-to-many
submitted_breakdowns = relationship(
"Breakdown",
uselist=True,
primaryjoin="User.email==Breakdown.submitted_by_user_email",
back_populates="submitted_by_user",
)
verified_breakdowns = relationship(
"Breakdown",
uselist=True,
primaryjoin="User.email==Breakdown.verified_by_user_email",
back_populates="verified_by_user",
)
################
# --- Word --- #
################
[docs]class Word(Base, GenericToString):
__tablename__ = "words"
id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
word = Column(String(256), nullable=False, comment="russian word in cyrillic characters")
accent = Column(
String(256),
nullable=False,
comment="russian word in cyrillic characters, which may include a ''' or 'ё' on the stressed syllable",
)
pos = Column(POS_ENUM, nullable=False, comment="part of speech (noun, adjective, etc.")
frequency = Column(Integer, nullable=True, comment="frequency rank of the word")
# one-to-many
breakdowns = relationship(
"Breakdown",
uselist=True,
# foreign_keys=["word_to_breakdowns.breakdown_id"],
# foreign_keys=[ForeignKey("word_to_breakdowns.breakdown_id")],
back_populates="word_",
)
######################
# --- Breakdowns --- #
######################
[docs]class MorphemeFamily(Base, GenericToString):
"""This table is necessary because some families have multiple meanings"""
__tablename__ = "morpheme_families"
id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
# TODO make 'family' a @property calculated by SQLAlchemy, rather than an actual field in the database
family = Column(
String(256),
nullable=True,
comment="comma separated list of morphemes in the morpheme family (cyrillic)",
)
level = Column(
Integer,
nullable=True,
comment="difficulty level of the morpheme--level 1 should be learned first",
)
# one-to-many
morphemes = relationship("Morpheme", uselist=True, back_populates="family")
meanings = relationship("MorphemeFamilyMeaning", uselist=True, back_populates="family")
[docs]class MorphemeFamilyMeaning(Base, GenericToString):
__tablename__ = "morpheme_family_meanings"
id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
family_id = Column(Integer, ForeignKey("morpheme_families.id"), nullable=False)
meaning = Column(String(256), nullable=True, comment="meaning of the morpheme family if any")
# many-to-one
family = relationship("MorphemeFamily", uselist=False, back_populates="meanings")
[docs]class Morpheme(Base, GenericToString):
__tablename__ = "morphemes"
morpheme_id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
morpheme = Column(
String(256),
nullable=False,
comment="one morpheme variant in a morpheme family (cyrillic)",
)
type = Column(MORPHEME_TYPE_ENUM, nullable=False)
word_pos = Column(
Enum("any", "adjective", "noun", "verb", name="morpheme_word_pos"),
nullable=False,
comment="some morphemes only appear in certain types (part of speech) of words",
) # formerly word_type
family_id = Column(Integer, ForeignKey("morpheme_families.id"), nullable=False)
# many-to-one
family: MorphemeFamily = relationship("MorphemeFamily", uselist=False, back_populates="morphemes")
[docs]class Breakdown(Base, GenericToString):
__tablename__ = "word_to_breakdowns"
word_id = Column(Integer, ForeignKey("words.id"), nullable=False)
breakdown_id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
word = Column(String(256), nullable=True, comment="Optional: word associated with the word_id")
submitted_by_user_email = Column(
String(512),
ForeignKey("users.email"),
nullable=True,
comment="id of the user who submitted the breakdown--if null, the submission is anonymous",
)
verified_by_user_email = Column(
String(512),
ForeignKey("users.email"),
nullable=True,
comment="id of the admin user who verified the breakdown--should be null exactly when the breakdown is unverified",
)
is_verified = Column(
Boolean,
nullable=False,
server_default=expression.false(),
comment="if True, the frontent should use this record--only one breakdown should be allowed to have True for this value",
)
is_inference = Column(
Boolean,
nullable=False,
server_default=expression.false(),
comment="Whether the breakdown came from a model. If True, submitted_by_user_email is null.",
)
date_submitted = Column(DateTime, nullable=False, server_default=func.current_timestamp())
date_verified = Column(DateTime, nullable=True, onupdate=func.current_timestamp())
# one-to-many
breakdown_items = relationship("BreakdownItem", uselist=True, back_populates="breakdown")
# many-to-one
word_ = relationship("Word", uselist=False, foreign_keys=[word_id], back_populates="breakdowns")
verified_by_user = relationship(
"User",
uselist=False,
foreign_keys=[verified_by_user_email],
back_populates="verified_breakdowns",
)
submitted_by_user = relationship(
"User",
uselist=False,
foreign_keys=[submitted_by_user_email],
back_populates="submitted_breakdowns",
)
[docs]class BreakdownItem(Base, GenericToString):
__tablename__ = "breakdowns"
# __table_args__ = (
# # a morpheme should never be used twice in the same word (unless it is the NULL morpheme)
# # so we set these columns up as the unique primary keys of this table
# PrimaryKeyConstraint("breakdown_id", "morpheme_id"),
# )
# this id column is required by sqlalchemy
id = Column(
Integer,
primary_key=True,
autoincrement=True,
nullable=False,
comment="This id is not used in rootski logic, SQLAlchemy requires a primary key",
)
morpheme_id = Column(
Integer,
ForeignKey("morphemes.morpheme_id"),
nullable=True,
comment="if not null, this word substring is an identified morpheme",
)
breakdown_id = Column(Integer, ForeignKey("word_to_breakdowns.breakdown_id"), nullable=True)
morpheme = Column(
String(256),
nullable=False,
comment="a substring of a word--if the morpheme_id is not null, this substring is an identified morpheme",
)
type = Column(
MORPHEME_TYPE_ENUM,
nullable=True,
comment="matches `type` in the morpheme table if the morpheme is identified, else null",
)
position = Column(
Integer,
nullable=False,
comment="where this word substring belongs in the order of all the word substrings that make up the word",
)
# many-to-one
morpheme_ = relationship("Morpheme", uselist=False)
breakdown = relationship(
"Breakdown",
uselist=False,
primaryjoin=breakdown_id == Breakdown.breakdown_id,
back_populates="breakdown_items",
)
###########################
# --- Parts of Speech --- #
###########################
[docs]class Noun(Base, GenericToString):
__tablename__ = "nouns"
word_id = Column(Integer, ForeignKey("words.id"), nullable=False, primary_key=True)
word = Column(String(256), nullable=False, comment="the original word in cyrillic characters")
gender = Column(
Enum("m", "f", "n", name="noun_genders"),
nullable=True,
comment="word gender -- masculine, femenine, or neuter",
)
animate = Column(
Boolean,
nullable=True,
comment="true if the noun is animate, false if inanimate",
)
indeclinable = Column(
Boolean,
nullable=True,
comment="true if the noun is indeclinable, false if declinable",
)
# singular cases
nom = Column(String(256), nullable=True, comment="nominative case")
acc = Column(String(256), nullable=True, comment="accusative case")
prep = Column(String(256), nullable=True, comment="prepositional case")
gen = Column(String(256), nullable=True, comment="genetive case")
dat = Column(String(256), nullable=True, comment="dative case")
inst = Column(String(256), nullable=True, comment="instrumental case")
# plural cases
nom_pl = Column(String(256), nullable=True, comment="nominative plural case")
acc_pl = Column(String(256), nullable=True, comment="accusative plural case")
prep_pl = Column(String(256), nullable=True, comment="prepositional plural case")
gen_pl = Column(String(256), nullable=True, comment="genetive plural case")
dat_pl = Column(String(256), nullable=True, comment="dative plural case")
inst_pl = Column(String(256), nullable=True, comment="instrumental plural case")
# one-to-one
_word = relationship("Word", uselist=False)
[docs]class Adjective(Base, GenericToString):
__tablename__ = "adjectives"
word_id = Column(Integer, ForeignKey("words.id"), nullable=False, primary_key=True)
comp = Column(String(256), nullable=True, comment="comparative form")
fem_short = Column(String(256), nullable=True, comment="femenine short form with accent")
masc_short = Column(String(256), nullable=True, comment="masculine short form with accent")
neut_short = Column(String(256), nullable=True, comment="neuter short form with accent")
plural_short = Column(String(256), nullable=True, comment="plural short form with accent")
# one-to-one
word = relationship("Word", uselist=False)
[docs]class Conjugation(Base, GenericToString):
__tablename__ = "verbs"
word_id = Column(Integer, ForeignKey("words.id"), nullable=False, primary_key=True)
aspect = Column(
Enum("perf", "impf", name="verb_aspects"),
nullable=False,
comment="verb aspect -- imperfective or perfective",
)
# present/future tense
_1st_per_sing = Column("1st_per_sing", String(256), comment="I")
_2nd_per_sing = Column("2nd_per_sing", String(256), comment="you (informal)")
_3rd_per_sing = Column("3rd_per_sing", String(256), comment="he/she/it")
_1st_per_pl = Column("1st_per_pl", String(256), comment="we")
_2nd_per_pl = Column("2nd_per_pl", String(256), comment="you all (you formal)")
_3rd_per_pl = Column("3rd_per_pl", String(256), comment="they")
# past tense
past_m = Column(String(256), comment="past masculine")
past_f = Column(String(256), comment="past femenine")
past_n = Column(String(256), comment="past neuter")
past_pl = Column(String(256), comment="past plural")
# participle forms
actv_part = Column(String(256), comment="active particible")
pass_part = Column(String(256), comment="passive participle")
actv_past_part = Column(String(256), comment="active past participle")
pass_past_part = Column(String(256), comment="passive past participle")
gerund = Column(String(256), comment="gerund")
impr = Column(String(256), comment="imperative")
impr_pl = Column(String(256), comment="imperative plural")
# one-to-one
word = relationship("Word", uselist=False)
[docs]class VerbPair(Base, GenericToString):
# TODO, redo this so that the columns are "verb_id", "other_verb_id", so there will be
# AT LEAST 2 * n rows in this table where n is the number of verbs
__tablename__ = "verb_pairs"
# __table_args__ = (
# # since this is an "Association" table, we set both of the foreign keys
# # as the the compound-primary-key of this table
# PrimaryKeyConstraint("imp_word_id", "pfv_word_id"),
# )
_id = Column(
Integer,
primary_key=True,
autoincrement=True,
nullable=False,
comment="Ignore this, it's just for SQLAlchemy",
)
imp_word_id = Column(Integer, ForeignKey("words.id"))
pfv_word_id = Column(Integer, ForeignKey("words.id"))
#############################
# --- Example Sentences --- #
#############################
[docs]class Sentence(Base, GenericToString):
__tablename__ = "sentences"
sentence_id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
sentence = Column(String(2048), nullable=False, comment="Russian example sentence (cyrillic)")
# one-to-one
translation = relationship("SentenceTranslation", uselist=False, back_populates="sentence")
[docs]class SentenceTranslation(Base, GenericToString):
__tablename__ = "sentence_translations"
translation_id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
sentence_id = Column(Integer, ForeignKey("sentences.sentence_id"))
translation = Column(
String(2048),
nullable=False,
comment="English translation of one of the russian example sentences",
)
# one-to-one
sentence = relationship("Sentence", uselist=False, back_populates="translation")
[docs]class WordToSentence(Base, GenericToString):
"""Bridge table between words and sentences"""
__tablename__ = "word_to_sentence"
__table_args__ = (
# since this is an "Association" table, we set both of the foreign keys
# as the the compound-primary-key of this table
PrimaryKeyConstraint("word_id", "sentence_id"),
)
word_id = Column(Integer, ForeignKey("words.id"))
sentence_id = Column(Integer, ForeignKey("sentences.sentence_id"), nullable=False)
exact_match = Column(
Boolean,
nullable=False,
comment="whether the example sentence contains exactly the word",
)
######################################
# --- Definitions / Translations --- #
######################################
[docs]class Definition(Base, GenericToString):
"""
There are 2 types of definition in this table: parent and child.
Parent definitions have a null "definition" field. Parent definitions
can be joined with the "definition_contents" table to access the child/sub definitions
of those rows.
"""
__tablename__ = "definitions"
id = Column(Integer, primary_key=True, nullable=False, autoincrement=True)
pos = Column(POS_ENUM, nullable=True) # formerly called "word_type"
definition = Column(String(512), nullable=True, comment="null if this is a parent definition")
notes = Column(String(256), nullable=True)
# one-to-many
# contents = relationship("DefinitionItem", uselist=True, back_populates="definition")
# we would need this if we used DefinitionExample since we'd have 2 tables pointing to
# the Definition.id foreign key (that's polymorphism since there can be 2 types of child)
# __mapper_args__ = {
# "polymorphic_on": id,
# "polymorphic_identity":"definitions",
# "with_polymorphic":"*"
# }
# currently unused
# class DefinitionExample(Base, GenericToString):
# __tablename__ = "definition_examples"
# __mapper_args__ = {"polymorphic_identity":"definition_examples"}
# definition_id = Column(Integer, ForeignKey("definitions.id"))
# id = Column(Integer, primary_key=True, nullable=False, autoincrement=True)
# russ = Column(String(2048), nullable=False, comment="russian translation of example specific to this definition")
# eng = Column(String(2048), nullable=False, comment="english translation of example specific to this definition")
[docs]class DefinitionItem(Base, GenericToString):
"""
This is a bridge table. Each row in this table represents either
a sub-definition or an example. The join to get a list of sub-definitions from
a word looks like this:
word -> word_defs -> definition_contents -> definitions or examples
"""
__tablename__ = "definition_contents"
# __mapper_args__ = {"polymorphic_identity":"definition_contents"}
__table_args__ = (PrimaryKeyConstraint("definition_id", "child_id"),)
definition_id = Column(Integer, ForeignKey("definitions.id"))
child_type = Column(Enum("definition", "example", name="definition_child_types"), nullable=False)
# technically this ForeignKey can be to both definitions and definition_examples, but
# the definition_examples table isn't currently being used.
child_id = Column(Integer, ForeignKey("definitions.id"), nullable=False)
position = Column(
Integer,
nullable=False,
comment="position in the order of sub-definitions or examples belonging to a definition",
)
# many-to-one
definition = relationship("Definition", uselist=False, foreign_keys=[definition_id])
child = relationship("Definition", uselist=False, foreign_keys=[child_id])
if __name__ == "__main__":
from rootski.services.database.non_orm.sql_client import SqlClient
russian_db = "/Users/eric/Desktop/rootski/data/russian/master/russian.db"
sql_client = SqlClient(
sql_dialect="sqlite",
driver_lib=None,
username=None,
password=None,
host=None,
port=None,
database=russian_db,
engine_kwargs=dict(),
)
session = sql_client.get_session()
# Base.metadata.bind = session
print(session.query(Word).limit(10).one())