#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CDSL Corpus Management
"""
###############################################################################
import logging
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Generator, List, Optional, Tuple, Union

import bs4
import requests

from .models import (
    Lexicon, Entry,
    MWLexicon, MWEntry,
    AP90Lexicon, AP90Entry
)
from .lexicon import CDSLDict
from .constants import (
    DEFAULT_SEARCH_MODE,
    SERVER_URL,
    DEFAULT_CORPUS_DIR,
    DEFAULT_DICTIONARIES,
    ENGLISH_DICTIONARIES,
    DEFAULT_SCHEME,
)
###############################################################################
# Dictionary ID -> (lexicon model, entry model).
# Looked up by `CDSLCorpus.setup()` when the caller supplies no `model_map`;
# IDs missing from this map fall back to `(None, None)` there.
DEFAULT_MODEL_MAP = {
    "MW": (MWLexicon, MWEntry),
    "AP90": (AP90Lexicon, AP90Entry),
}

###############################################################################

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
###############################################################################
@dataclass
class CDSLCorpus:
    """
    CDSL Corpus Class

    Refers to a CDSL installation instance at the location `data_dir`.

    Parameters
    ----------
    data_dir : str or Path or None, optional
        Root directory of the installation.
        If None, `DEFAULT_CORPUS_DIR` is used.
        The default is None.
    search_mode : str, optional
        Search mode handed to every `CDSLDict` created by the corpus.
        The default is `DEFAULT_SEARCH_MODE`.
    input_scheme : str, optional
        Input transliteration scheme handed to every `CDSLDict`.
        The default is `DEFAULT_SCHEME`.
    output_scheme : str, optional
        Output transliteration scheme handed to every `CDSLDict`.
        The default is `DEFAULT_SCHEME`.
    transliterate_keys : bool, optional
        Passed on as `transliterate_keys` to every `CDSLDict`, except for
        the dictionaries listed in `ENGLISH_DICTIONARIES`, which always
        receive False.
        The default is True.

    Attributes
    ----------
    dict_dir : Path
        `data_dir / "dict"`; `setup()` uses one sub-directory per dictionary.
    db_dir : Path
        `data_dir / "db"`; passed to `CDSLDict.setup()` as `symlink_dir`.
    dicts : dict
        Dictionaries that have been set up successfully, keyed by ID.
    available_dicts : dict
        Dictionaries listed on the CDSL homepage, keyed by ID.

    Notes
    -----
    Creating an instance calls `get_available_dicts()`, which performs an
    HTTP request to `SERVER_URL`.
    """

    data_dir: Optional[Union[str, Path]] = field(default=None)
    search_mode: str = field(repr=False, default=DEFAULT_SEARCH_MODE)
    input_scheme: str = field(repr=False, default=DEFAULT_SCHEME)
    output_scheme: str = field(repr=False, default=DEFAULT_SCHEME)
    transliterate_keys: bool = field(repr=False, default=True)

    # ----------------------------------------------------------------------- #

    def __post_init__(self):
        """Resolve the installation paths and fetch available dictionaries"""
        self.data_dir = (
            Path(DEFAULT_CORPUS_DIR)
            if self.data_dir is None
            else Path(self.data_dir)
        )
        self.dict_dir = self.data_dir / "dict"
        self.db_dir = self.data_dir / "db"
        self.dicts = {}
        self.get_available_dicts()

    # ----------------------------------------------------------------------- #

    def __getattr__(self, attr: str) -> CDSLDict:
        """Expose dictionaries that have been set up as attributes

        Only invoked when the regular attribute lookup fails.
        """
        # Read the registry through `__dict__`: going through `self.dicts`
        # would re-enter `__getattr__` and recurse without bound whenever
        # `dicts` has not been assigned yet (e.g. while an instance is being
        # copied or unpickled).
        dicts = self.__dict__.get("dicts", {})
        try:
            return dicts[attr]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{attr}'"
            ) from None

    def __getitem__(self, item: str) -> CDSLDict:
        """Return the dictionary with ID `item`, if it has been set up"""
        try:
            return self.dicts[item]
        except KeyError:
            raise KeyError(f"Dictionary '{item}' is not setup.") from None

    def __iter__(self) -> Generator[CDSLDict, None, None]:
        """Iterate over the dictionaries that have been set up"""
        yield from self.dicts.values()

    # ----------------------------------------------------------------------- #

    def setup(
        self,
        dict_ids: Optional[list] = None,
        update: bool = False,
        model_map: Optional[Dict[str, Tuple[Lexicon, Entry]]] = None
    ) -> bool:
        """Setup CDSL dictionaries in bulk

        Calls `CDSLDict.setup()` on every `CDSLDict`, and if successful, also
        calls `CDSLDict.connect()` to establish a connection to the database

        Parameters
        ----------
        dict_ids : list or None, optional
            List of dictionary IDs to setup.
            IDs are matched case-insensitively; IDs that are not present in
            `self.available_dicts` are skipped with a warning.
            If None, the dictionaries from `DEFAULT_DICTIONARIES` as well as
            locally installed dictionaries will be setup.
            The default is None.
        update : bool, optional
            If True, an update check is performed for every dictionary in
            `dict_ids`, and if available, the updated version is installed
            The default is False.
        model_map : dict, optional
            Map of dictionary ID to a tuple of lexicon model and entry model.
            The argument is used to specify `lexicon_model` and `entry_model`
            arguments passed to `CDSLDict.connect()`.
            Dictionaries missing from the map are connected with
            `(None, None)`.
            If None, the default map `DEFAULT_MODEL_MAP` will be used.
            The default is None.

        Returns
        -------
        bool
            True, if at least one dictionary was attempted and every
            `CDSLDict.setup()` call returned a true value.

        Raises
        ------
        ValueError
            If `dict_ids` is not a `list` or `None`.
        """
        if dict_ids is None:
            dict_ids = [*DEFAULT_DICTIONARIES, *self.get_installed_dicts()]
        if not isinstance(dict_ids, list):
            raise ValueError("`dict_ids` must be a `list` or `None`")

        requested = {dict_id.upper() for dict_id in dict_ids}
        setup_dicts = {
            dict_id: cdsl_dict
            for dict_id, cdsl_dict in self.available_dicts.items()
            if dict_id in requested
        }

        # Surface typos / unknown IDs instead of dropping them silently.
        unknown = requested.difference(setup_dicts)
        if unknown:
            LOGGER.warning(
                "Ignoring unavailable dictionaries: %s",
                ", ".join(sorted(unknown))
            )

        # Resolved once; the default does not change between iterations.
        if model_map is None:
            model_map = DEFAULT_MODEL_MAP

        status = []
        for dict_id, cdsl_dict in setup_dicts.items():
            success = cdsl_dict.setup(
                data_dir=self.dict_dir / dict_id.upper(),
                symlink_dir=self.db_dir,
                update=update
            )
            status.append(success)
            if not success:
                continue

            lexicon_model, entry_model = model_map.get(
                cdsl_dict.id,
                (None, None)
            )
            cdsl_dict.connect(
                lexicon_model=lexicon_model,
                entry_model=entry_model
            )
            # Only successfully set up dictionaries are registered.
            self.dicts[dict_id] = cdsl_dict

        # An empty `status` means nothing was attempted: report failure.
        return bool(status) and all(status)

    # ----------------------------------------------------------------------- #

    def search(
        self,
        pattern: str,
        dict_ids: Optional[List[str]] = None,
        mode: Optional[str] = None,
        input_scheme: Optional[str] = None,
        output_scheme: Optional[str] = None,
        ignore_case: bool = False,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        omit_empty: bool = True
    ) -> Dict[str, List[Entry]]:
        """Search in multiple dictionaries from the corpus

        Parameters
        ----------
        pattern : str
            Search pattern, may contain wildcards (`*`).
        dict_ids : list or None
            List of dictionary IDs to search in.
            IDs are matched case-insensitively and duplicates are ignored.
            Only the `dict_ids` that exist in `self.dicts` will be used.
            If None, all the dictionaries that have been setup,
            i.e., the dictionaries from `self.dicts` will be used.
            The default is None.
        mode : str or None, optional
            Search mode to query by `key`, `value` or `both`.
            The default is None.
        input_scheme : str or None, optional
            Input transliteration scheme
            If None, `self.input_scheme` will be used.
            The default is None.
        output_scheme : str or None, optional
            Output transliteration scheme
            If None, `self.output_scheme` will be used.
            The default is None.
        ignore_case : bool, optional
            Ignore case while performing lookup.
            The default is False.
        limit : int or None, optional
            Limit the number of search results to `limit`.
            The default is None.
        offset : int or None, optional
            Offset the search results by `offset`.
            The default is None
        omit_empty : bool, optional
            If True, only the non-empty search results will be included.
            The default is True.

        Returns
        -------
        dict
            Dictionary of (dict_id, list of matching entries), in the order
            in which the dictionaries were requested.

        Raises
        ------
        ValueError
            If `dict_ids` is not a `list` or `None`.
        """
        if dict_ids is None:
            dict_ids = list(self.dicts)
        if not isinstance(dict_ids, list):
            raise ValueError("`dict_ids` must be a `list` or `None`")

        # `dict.fromkeys` removes duplicates while keeping the caller's
        # order, so the layout of the result is reproducible across runs.
        normalized_ids = dict.fromkeys(
            dict_id.upper() for dict_id in dict_ids
        )

        all_results = {}
        for dict_id in normalized_ids:
            if dict_id not in self.dicts:
                continue
            dict_results = self.dicts[dict_id].search(
                pattern=pattern,
                mode=mode,
                input_scheme=input_scheme,
                output_scheme=output_scheme,
                ignore_case=ignore_case,
                limit=limit,
                offset=offset
            )
            if not omit_empty or dict_results:
                all_results[dict_id] = dict_results

        return all_results

    # ----------------------------------------------------------------------- #

    def get_available_dicts(
        self,
        timeout: float = 30.0
    ) -> Dict[str, CDSLDict]:
        """
        Fetch a list of dictionaries available for download from CDSL

        Homepage of CDSL Project (`SERVER_URL`) is fetched and parsed to obtain
        this list. The result is also stored in `self.available_dicts`.

        Parameters
        ----------
        timeout : float, optional
            Number of seconds passed to `requests.get()` as `timeout`, so
            that an unresponsive server cannot block forever.
            The default is 30.0.

        Returns
        -------
        dict
            Dictionary of (dict_id, `CDSLDict`)
        """
        html = requests.get(SERVER_URL, timeout=timeout).content.decode()
        soup = bs4.BeautifulSoup(html, "html.parser")

        dictionaries = {}
        for dl_tag in soup.find_all("a", attrs={"title": "Downloads"}):
            row = dl_tag.find_parent("tr")
            cells = [] if row is None else row.find_all("td")
            # Explicit check rather than `assert`, which is stripped when
            # Python runs with -O. A row with an unexpected layout is
            # skipped so that it does not abort the whole listing.
            if len(cells) != 4:
                LOGGER.warning(
                    "Skipping download link '%s': expected a table row "
                    "with 4 cells, found %d",
                    dl_tag.get("href"),
                    len(cells)
                )
                continue

            dict_id = cells[0].get_text(" ").strip().split()[0]
            dict_date = cells[1].get_text(" ").strip().split()[0]
            dict_name = cells[2].find("a").get_text(" ").strip()
            dict_download = f"{SERVER_URL}{dl_tag['href']}"
            # Dictionaries in `ENGLISH_DICTIONARIES` never get their keys
            # transliterated, regardless of the corpus-level setting.
            dict_transliterate_keys = (
                dict_id not in ENGLISH_DICTIONARIES
                and
                self.transliterate_keys
            )
            dictionaries[dict_id] = CDSLDict(
                id=dict_id,
                date=dict_date,
                name=dict_name,
                url=dict_download,
                search_mode=self.search_mode,
                input_scheme=self.input_scheme,
                output_scheme=self.output_scheme,
                transliterate_keys=dict_transliterate_keys
            )

        if not dictionaries:
            LOGGER.warning("No dictionaries found at '%s'", SERVER_URL)

        self.available_dicts = dictionaries
        return dictionaries

    # ----------------------------------------------------------------------- #

    def get_installed_dicts(self) -> Dict[str, CDSLDict]:
        """Fetch the dictionaries installed locally

        An entry of `self.dict_dir` counts as installed when its name is a
        known dictionary ID and the file
        `<dict_dir>/<ID>/web/sqlite/<id>.sqlite` exists.

        Returns
        -------
        dict
            Dictionary of (dict_id, `CDSLDict`)
        """
        dictionaries = {}
        # Sorted so that the result order does not depend on the filesystem.
        for path in sorted(self.dict_dir.glob("*")):
            dict_id = path.name
            if dict_id not in self.available_dicts:
                LOGGER.error("Invalid dictionary '%s'", dict_id)
                continue

            db_filename = f"{dict_id.lower()}.sqlite"
            db_path = path / "web" / "sqlite" / db_filename
            if db_path.is_file():
                dictionaries[dict_id] = self.available_dicts[dict_id]

        return dictionaries
###############################################################################