#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CDSL Lexicon Management
"""
import json
import logging
import zipfile
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import Dict, Generator, List, Optional, Union

import bs4
import requests
from indic_transliteration.sanscript import transliterate
from peewee import fn
from playhouse.db_url import connect
from requests_downloader.downloader import download

from .constants import (
    DEFAULT_SEARCH_MODE,
    ENGLISH_DICTIONARIES,
    INTERNAL_SCHEME,
    SEARCH_MODE_BOTH,
    SEARCH_MODE_KEY,
    SEARCH_MODE_VALUE,
)
from .models import (
    Entry,
    Lexicon,
    entry_constructor,
    lexicon_constructor,
)
from .utils import validate_scheme, validate_search_mode
###############################################################################
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
###############################################################################
@dataclass(eq=False)
class CDSLDict:
    """Dictionary from CDSL

    A single dictionary whose data is downloaded as a zip archive, extracted
    to disk and queried through an SQLite database.

    ``eq=False`` keeps the default identity-based ``__hash__``, which the
    ``lru_cache``-decorated methods (`stats`, `search`) rely on because
    ``self`` is part of their cache key.

    Attributes
    ----------
    id : str
        Dictionary ID. Used to build file names and passed to the model
        constructors.
    date : str
        Date associated with the dictionary.
    name : str
        Name of the dictionary.
    url : str
        URL of the dictionary page, fetched by `download()` to find the
        last-modified information and the download link.
    db : str or None
        Path to the SQLite database. Set by `setup()`.
    search_mode : str or None
        Default search mode. Normalised by `set_search_mode()`.
    input_scheme : str or None
        Default input transliteration scheme. Normalised by `set_scheme()`.
    output_scheme : str or None
        Default output transliteration scheme. Normalised by `set_scheme()`.
    transliterate_keys : bool or None
        Whether lexicon keys are transliterated. Normalised by
        `set_scheme()`.
    """

    id: str
    date: str
    name: str
    url: str = field(repr=False)
    db: Optional[str] = field(repr=False, default=None)
    search_mode: Optional[str] = field(repr=False, default=None)
    input_scheme: Optional[str] = field(repr=False, default=None)
    output_scheme: Optional[str] = field(repr=False, default=None)
    transliterate_keys: Optional[bool] = field(repr=False, default=None)

    # ----------------------------------------------------------------------- #

    def __post_init__(self):
        """Normalise the scheme and search mode given to the constructor"""
        self.set_scheme(
            input_scheme=self.input_scheme,
            output_scheme=self.output_scheme,
            transliterate_keys=self.transliterate_keys
        )
        self.set_search_mode(self.search_mode)

    # ----------------------------------------------------------------------- #

    def __getitem__(self, item: str) -> Entry:
        """Get an entry by ID, raising `KeyError` if `entry()` finds none"""
        result = self.entry(item)
        if result is None:
            raise KeyError(f"Entry with ID '{item}' not found.")
        return result

    def __iter__(self) -> Generator[Entry, None, None]:
        """Iterate over all entries using the instance's output scheme"""
        for entry in self._lexicon.select():
            yield self._entry(
                entry,
                lexicon_id=self.id,
                scheme=self.output_scheme,
                transliterate_keys=self.transliterate_keys
            )

    # ----------------------------------------------------------------------- #

    def download(self, download_dir: Union[str, Path]) -> bool:
        """Download and extract dictionary data

        Parameters
        ----------
        download_dir : str or Path
            Full path of directory where the dictionary data should be
            downloaded and extracted

        Returns
        -------
        bool
            True if successfully downloaded or already up-to-date
        """
        # A timeout prevents an unresponsive server from blocking forever.
        response = requests.get(self.url, timeout=60)
        html = response.content.decode()
        soup = bs4.BeautifulSoup(html, "html.parser")

        footer = soup.find("div", attrs={"id": "footer"})
        footer_text = footer.find("p") if footer is not None else None
        if footer_text is None:
            LOGGER.error(
                f"No last modified information found for '{self.id}'."
            )
            return False

        last_modified = footer_text.get_text().split(":", 1)[-1].strip()
        LOGGER.debug(f"Last modified at {last_modified}")

        download_dir = Path(download_dir)
        download_dir.mkdir(parents=True, exist_ok=True)
        last_modified_file = download_dir / "last_modified.txt"

        if last_modified_file.exists():
            local_last_modified = last_modified_file.read_text().strip()
            if local_last_modified == last_modified:
                LOGGER.info(f"Data for dictionary '{self.id}' is up-to-date.")
                return True

        # Find the download link before touching any file on disk, so that
        # a missing link cannot leave the current archive renamed to a backup.
        anchor = None
        for li in soup.find_all("li"):
            if "Directory 'web' containing displays" in li.get_text():
                anchor = li.find("a")
                break

        if anchor is None or not anchor.get("href"):
            LOGGER.error("No download link for 'web' displays was found.")
            return False

        web_url = requests.compat.urljoin(self.url, anchor["href"])

        # Back up the current archive. `replace` overwrites an existing
        # target on every platform.
        download_path = download_dir / f"{self.id}.web.zip"
        backup_path = download_dir / f"{self.id}.web.zip.bak"
        if download_path.exists():
            download_path.replace(backup_path)

        success = download(web_url, download_path=download_path)
        if not success:
            # download failed - restore backup
            LOGGER.error("Something went wrong.")
            if backup_path.exists():
                LOGGER.debug("Restoring ..")
                backup_path.replace(download_path)
            return False

        # Extract first. The data is only recorded as current once extraction
        # succeeded, so a failed extraction is retried on the next call.
        with zipfile.ZipFile(download_path, "r") as zipref:
            zipref.extractall(download_dir)

        # download and extraction were successful - remove backup
        if backup_path.exists():
            backup_path.unlink()

        # update last modified info
        last_modified_file.write_text(last_modified)
        return True

    def setup(
        self,
        data_dir: Union[str, Path],
        symlink_dir: Optional[Union[str, Path]] = None,
        update: bool = False
    ) -> bool:
        """Setup the dictionary database path

        Parameters
        ----------
        data_dir : str or Path
            Full path of directory where the dictionary data is stored
        symlink_dir : str or Path, optional
            Full path of the directory where the symbolic links to the
            SQLite database of dictionary will be created
            If None, symbolic links aren't created.
            The default is None.
        update : bool, optional
            If True, an attempt to update dictionary data will be made.
            The default is False.

        Returns
        -------
        bool
            True if the setup was successful
        """
        # setup database path
        data_dir = Path(data_dir)
        database_filename = f"{self.id.lower()}.sqlite"
        database_path = data_dir / "web" / "sqlite" / database_filename
        self.db = str(database_path)

        # Download only if an update is requested or the database is missing.
        already_available = not update and database_path.exists()
        if not (already_available or self.download(download_dir=data_dir)):
            LOGGER.error(f"Couldn't setup dictionary '{self.id}'.")
            return False

        # create symlink
        if symlink_dir is not None:
            symlink_dir = Path(symlink_dir)
            symlink_dir.mkdir(parents=True, exist_ok=True)
            symlink_path = symlink_dir / f"{self.id}.db"

            # `exists()` is False for a dangling link, and `symlink_to` would
            # then fail because the link itself is still there.
            if symlink_path.is_symlink() and not symlink_path.exists():
                symlink_path.unlink()

            if not symlink_path.exists():
                # A relative target would be interpreted relative to
                # `symlink_dir`, so link to the absolute path.
                symlink_path.symlink_to(database_path.resolve())

            self.db = str(symlink_path)

        return True

    # ----------------------------------------------------------------------- #

    def set_scheme(
        self,
        input_scheme: Optional[str] = None,
        output_scheme: Optional[str] = None,
        transliterate_keys: Optional[bool] = None
    ):
        """Set transliteration scheme for the dictionary instance

        The `search` and `stats` caches are cleared if any of the three
        settings changes.

        Parameters
        ----------
        input_scheme : str, optional
            Input transliteration scheme.
            If None, `INTERNAL_SCHEME` is used.
            The default is None.
        output_scheme : str, optional
            Output transliteration scheme.
            If None, `INTERNAL_SCHEME` is used.
            The default is None.
        transliterate_keys : bool, optional
            Determines whether the keys in lexicon should be transliterated
            to `scheme` or not.
            If None, the value is True when the dictionary ID is listed in
            `ENGLISH_DICTIONARIES` and False otherwise.
            The default is None.
        """
        input_scheme = validate_scheme(input_scheme) or INTERNAL_SCHEME
        output_scheme = validate_scheme(output_scheme) or INTERNAL_SCHEME

        if transliterate_keys is None:
            transliterate_keys = (self.id in ENGLISH_DICTIONARIES)

        if (
            self.input_scheme != input_scheme or
            self.output_scheme != output_scheme or
            self.transliterate_keys != transliterate_keys
        ):
            # Cached results were computed with the previous defaults.
            self.search.cache_clear()
            self.stats.cache_clear()

        self.input_scheme = input_scheme
        self.output_scheme = output_scheme
        self.transliterate_keys = transliterate_keys

    def set_search_mode(self, mode: Optional[str]):
        """Set search mode

        Parameters
        ----------
        mode : str
            Valid values are 'key', 'value', 'both'
            Recommended to use the convenience variables SEARCH_MODE_KEY,
            SEARCH_MODE_VALUE or SEARCH_MODE_BOTH.
            If validation yields nothing, `DEFAULT_SEARCH_MODE` is used.
        """
        search_mode = validate_search_mode(mode) or DEFAULT_SEARCH_MODE

        # `search()` falls back to `self.search_mode` when called without a
        # mode, but the default is not part of its cache key. Clear the cache
        # so those calls do not return results of the previous mode.
        if self.search_mode != search_mode:
            self.search.cache_clear()

        self.search_mode = search_mode

    # ----------------------------------------------------------------------- #

    def connect(
        self,
        lexicon_model: Optional[Lexicon] = None,
        entry_model: Optional[Entry] = None
    ):
        """
        Connect to the SQLite database

        If both `lexicon_model` and `entry_model` are specified,
        they are used as the ORM layer.

        If any of `lexicon_model` or `entry_model` is None, both arguments
        are ignored and the models are created by calling
        `models.lexicon_constructor` and `models.entry_constructor` with the
        dictionary ID.

        The lexicon model is bound to the database at `self.db`, and the
        `search` and `stats` caches are cleared.

        Parameters
        ----------
        lexicon_model : object, optional
            Lexicon model.
            The default is None.
        entry_model : object, optional
            Entry model.
            The default is None.
        """
        if lexicon_model is not None and entry_model is not None:
            self._lexicon = lexicon_model
            self._entry = entry_model
        else:
            self._lexicon = lexicon_constructor(self.id)
            self._entry = entry_constructor(self.id)

        db_url = f"sqlite:///{self.db}"
        self._lexicon.bind(connect(db_url))

        # Results cached before (re)connecting may belong to other data.
        self.search.cache_clear()
        self.stats.cache_clear()

    # ----------------------------------------------------------------------- #

    @lru_cache(maxsize=1)
    def stats(self, top: int = 10, output_scheme: Optional[str] = None) -> Dict:
        """Display statistics about the lexicon

        Only the result of the most recent distinct call is cached
        (``maxsize=1``).

        Parameters
        ----------
        top : int, optional
            Display top `top` entries having most different meanings.
            The default is 10.
        output_scheme: str, optional
            Output transliteration scheme
            If None, `self.output_scheme` will be used.
            The default is None.

        Returns
        -------
        dict
            Statistics about the dictionary, with keys

            * ``total`` : number of rows in the lexicon
            * ``distinct`` : number of distinct keys
            * ``top`` : list of ``(key, count)`` for the most frequent keys
        """
        output_scheme = validate_scheme(output_scheme) or self.output_scheme

        lex = self._lexicon
        total_count = lex.select().count()

        # One row per key, the most frequent keys first.
        distinct_query = (
            lex
            .select(
                lex.id, lex.key, lex.data, fn.COUNT(lex.key).alias("count")
            )
            .group_by(lex.key)
            .order_by(fn.COUNT(lex.key).desc())
        )

        top_entries = [
            (
                (
                    transliterate(
                        item.key,
                        INTERNAL_SCHEME,
                        output_scheme
                    ) if self.transliterate_keys else item.key
                ),
                item.count
            )
            for item in distinct_query.limit(top)
        ]
        distinct_count = distinct_query.count()

        return {
            "total": total_count,
            "distinct": distinct_count,
            "top": top_entries
        }

    # ----------------------------------------------------------------------- #

    @lru_cache(maxsize=4096)
    def search(
        self,
        pattern: str,
        mode: Optional[str] = None,
        input_scheme: Optional[str] = None,
        output_scheme: Optional[str] = None,
        ignore_case: bool = False,
        limit: Optional[int] = None,
        offset: Optional[int] = None
    ) -> List[Entry]:
        """Search in the dictionary

        Parameters
        ----------
        pattern : str
            Search pattern, may contain wildcards (`*`).
        mode : str or None, optional
            Search mode to query by `key`, `value` or `both`.
            If None, `self.search_mode` will be used.
            The default is None.
        input_scheme : str or None, optional
            Input transliteration scheme
            If None, `self.input_scheme` will be used.
            The default is None.
        output_scheme : str or None, optional
            Output transliteration scheme
            If None, `self.output_scheme` will be used.
            The default is None.
        ignore_case : bool, optional
            Ignore case while performing lookup.
            The default is False.
        limit : int or None, optional
            Limit the number of search results to `limit`.
            The default is None.
        offset : int or None, optional
            Offset the search results by `offset`.
            The default is None

        Returns
        -------
        list
            List of matching entries

        Raises
        ------
        ValueError
            If the resolved search mode is not one of `SEARCH_MODE_KEY`,
            `SEARCH_MODE_VALUE` or `SEARCH_MODE_BOTH`.
        """
        input_scheme = validate_scheme(input_scheme) or self.input_scheme
        output_scheme = validate_scheme(output_scheme) or self.output_scheme
        mode = validate_search_mode(mode) or self.search_mode

        pattern = transliterate(pattern, input_scheme, INTERNAL_SCHEME)
        # Value search looks for the pattern anywhere inside the <body> tag.
        value_pattern = f"*<body>*{pattern.strip('*')}*</body>*"

        lex = self._lexicon
        if ignore_case:
            # On SQLite, peewee's `**` operator is rendered as LIKE, whose
            # wildcards are `%` and `_` instead of the `*` and `?` of GLOB.
            key_match = lex.key ** self._glob_to_like(pattern)
            value_match = lex.data ** self._glob_to_like(value_pattern)
        else:
            # On SQLite, peewee's `%` operator is rendered as the
            # case-sensitive GLOB, which understands `*` directly.
            key_match = lex.key % pattern
            value_match = lex.data % value_pattern

        if mode == SEARCH_MODE_KEY:
            condition = key_match
        elif mode == SEARCH_MODE_VALUE:
            condition = value_match
        elif mode == SEARCH_MODE_BOTH:
            condition = key_match | value_match
        else:
            raise ValueError(f"Invalid search mode '{mode}'.")

        search_query = lex.select().where(condition)
        return [
            self._entry(
                result,
                lexicon_id=self.id,
                scheme=output_scheme,
                transliterate_keys=self.transliterate_keys
            )
            for result in search_query.limit(limit).offset(offset)
        ]

    @staticmethod
    def _glob_to_like(pattern: str) -> str:
        """Translate the GLOB wildcards `*` and `?` to LIKE's `%` and `_`"""
        return pattern.replace("*", "%").replace("?", "_")

    def entry(
        self,
        entry_id: str,
        output_scheme: Optional[str] = None
    ) -> Optional[Entry]:
        """Get an entry by ID

        Parameters
        ----------
        entry_id : str
            Entry ID to lookup
        output_scheme : str or None, optional
            Output transliteration scheme
            If None, `self.output_scheme` will be used.
            The default is None.

        Returns
        -------
        object
            If the `entry_id` is valid, `Entry` with the matching ID
            otherwise, None.
        """
        output_scheme = validate_scheme(output_scheme) or self.output_scheme
        try:
            return self._entry(
                self._lexicon.get(self._lexicon.id == entry_id),
                lexicon_id=self.id,
                scheme=output_scheme,
                transliterate_keys=self.transliterate_keys
            )
        except Exception:
            # Best-effort lookup: any failure is reported as "not found".
            # The actual exception is kept at debug level for diagnosis.
            LOGGER.error(f"Entry with ID '{entry_id}' not found.")
            LOGGER.debug("Entry lookup failed.", exc_info=True)
            return None

    def dump(
        self,
        output_path: Optional[Union[str, Path]] = None,
        output_scheme: Optional[str] = None
    ) -> List[Dict[str, str]]:
        """
        Dump data as JSON

        Parameters
        ----------
        output_path : str or Path, optional
            Path to the output JSON file.
            If None, the data isn't written to the disk, only returned.
            The default is None.
        output_scheme : str or None, optional
            Output transliteration scheme
            If None, `self.output_scheme` will be used.
            The default is None

        Returns
        -------
        list
            List of all the entries in the dictionary. Every entry is a `dict`.
            If `output_path` is provided, the same list is written as JSON.
        """
        output_scheme = validate_scheme(output_scheme) or self.output_scheme

        data = [
            self._entry(
                result,
                lexicon_id=self.id,
                scheme=output_scheme,
                transliterate_keys=self.transliterate_keys
            ).to_dict()
            for result in self._lexicon.select()
        ]

        if output_path is not None:
            with open(output_path, mode="w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False)

        return data
###############################################################################