calibre-web/cps/metadata_provider/scholar.py

84 lines
3.1 KiB
Python
Raw Normal View History

2021-07-05 18:55:54 +02:00
# -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
# Copyright (C) 2021 OzzieIsaacs
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
2021-12-13 17:21:41 +01:00
import itertools
from typing import Dict, List, Optional
from urllib.parse import quote, unquote
2021-07-05 18:55:54 +02:00
# fake_useragent is an optional dependency. When it cannot be imported, the
# name FakeUserAgentError is bound to BaseException instead, so the guard
# around the scholarly import below then catches every exception type.
try:
    from fake_useragent.errors import FakeUserAgentError
except ImportError:
    FakeUserAgentError = BaseException

# A FakeUserAgentError raised while importing scholarly is re-raised as an
# ImportError, i.e. the module is reported as not importable.
try:
    from scholarly import scholarly
except FakeUserAgentError:
    raise ImportError("No module named 'scholarly'")
2021-07-05 18:55:54 +02:00
from cps import logger
2021-12-13 17:21:41 +01:00
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
2021-07-06 20:24:27 +02:00
# Module-level logger obtained from cps.logger; used below to record
# failures of the Google Scholar lookup as warnings.
log = logger.create()
2021-07-06 20:24:27 +02:00
class scholar(Metadata):
    """Metadata provider that looks up publications on Google Scholar.

    Searches are executed through the ``scholarly`` package and each hit is
    converted into a ``MetaRecord``.
    """

    __name__ = "Google Scholar"
    __id__ = "googlescholar"
    META_URL = "https://scholar.google.com/"

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search Google Scholar for ``query`` and return the parsed hits.

        Args:
            query: Free-text search string. If ``get_title_tokens`` yields
                any tokens for it, the query is replaced by those tokens,
                URL-quoted and joined with single spaces.
            generic_cover: Accepted for interface compatibility; see the
                review note in the body.
            locale: Passed through to ``_parse_search_result``.

        Returns:
            A list with at most 10 ``MetaRecord`` objects, an empty list when
            the provider is not active, or ``None`` when the lookup raised an
            exception (the exception is logged as a warning).
        """
        val = list()
        if self.active:
            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
            if title_tokens:
                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
                query = " ".join(tokens)
            try:
                scholarly.set_timeout(20)
                scholarly.set_retries(2)
                # islice() is lazy: the results are only fetched while they
                # are consumed. Materialise them inside the try block so that
                # errors raised during fetching are handled here as well
                # instead of escaping to the caller.
                results = list(
                    itertools.islice(scholarly.search_pubs(query), 10)
                )
            except Exception as e:
                log.warning(e)
                return None
            for result in results:
                # NOTE(review): the caller-supplied generic_cover is not
                # forwarded; an empty string is passed instead. Presumably
                # intentional - confirm before changing.
                match = self._parse_search_result(
                    result=result, generic_cover="", locale=locale
                )
                val.append(match)
        return val

    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
        """Convert one ``scholarly`` search result into a ``MetaRecord``.

        Args:
            result: Result dictionary; must contain a ``"bib"`` mapping.
                ``"pub_url"``, ``"eprint_url"`` and ``"image"`` are optional.
            generic_cover: Cover used when the result has no
                ``image["original_url"]`` entry.
            locale: Currently unused by this method.

        Returns:
            The populated ``MetaRecord``.

        Raises:
            KeyError: If ``result`` has no ``"bib"`` entry.
        """
        bib = result["bib"]
        # The publication URL (falling back to the e-print URL) serves as
        # record id, record URL and identifier value alike.
        link = result.get("pub_url", result.get("eprint_url", ""))
        match = MetaRecord(
            id=link,
            title=bib.get("title"),
            authors=bib.get("author", []),
            url=link,
            source=MetaSourceInfo(
                id=self.__id__, description=self.__name__, link=scholar.META_URL
            ),
        )

        match.cover = result.get("image", {}).get("original_url", generic_cover)
        match.description = unquote(bib.get("abstract", ""))
        match.publisher = bib.get("venue", "")
        # Only the year is read from the result, so January 1st is used for
        # the rest of the date. A missing year previously raised a TypeError
        # (None + str); an empty date is stored in that case instead.
        pub_year = bib.get("pub_year")
        match.publishedDate = f"{pub_year}-01-01" if pub_year else ""
        match.identifiers = {"scholar": match.id}
        return match