# Source code for assembl.models.hypothesis_source

from builtins import object
from datetime import datetime
import logging

from sqlalchemy import (
    Column, ForeignKey, Integer, DateTime, Table,
    UniqueConstraint, Unicode, String, Boolean,
    CheckConstraint, event, Index, func)
from sqlalchemy.orm import relationship, reconstructor
from future.utils import string_types
import simplejson as json
from rdflib_jsonld.context import Context
from pyramid.threadlocal import get_current_registry
import requests

from . import DiscussionBoundBase, Base
from .generic import ContentSource
from ..lib.parsedatetime import parse_datetime
from ..lib.sqla import get_named_class, get_named_object
from ..lib.generic_pointer import (
    UniversalTableRefColType, generic_relationship)
from .auth import IdentityProvider, AgentProfile
from .langstrings import LangString
from .social_auth import SocialAuthAccount
from .annotation import Webpage
from .idea_content_link import Extract
from .import_record_source import ImportRecord, ImportRecordSource
from ..tasks.source_reader import ClientError, ReaderError


log = logging.getLogger(__name__)


class HypothesisExtractSource(ImportRecordSource):
    """Import source that reads annotations from a Hypothesis-style API.

    Rows returned by the ``search`` endpoint under ``source_uri`` are
    converted by :meth:`process_data` into ``Excerpt`` dictionaries, and
    :meth:`class_from_data` maps every row to :class:`Extract`.
    """
    __tablename__ = 'hypothesis_extract_source'

    id = Column(Integer, ForeignKey(ImportRecordSource.id), primary_key=True)
    # Sent as a Bearer token in the Authorization header of each request.
    api_key = Column(String)
    # search criteria
    user = Column(String)
    group = Column(String)
    tag = Column(Unicode)
    document_url = Column(Unicode)

    __mapper_args__ = {
        'polymorphic_identity': 'hypothesis_source',
    }

    def __init__(self, *args, **kwargs):
        """Create the source, defaulting ``source_uri`` to the public API."""
        kwargs['source_uri'] = kwargs.pop(
            'source_uri', 'https://hypothes.is/api/')
        super(HypothesisExtractSource, self).__init__(*args, **kwargs)

    def class_from_data(self, data):
        """Return the model class used for an imported row (always Extract)."""
        return Extract

    def init_importer(self):
        """Run the parent initialisation and look up the Hypothesis provider."""
        super(HypothesisExtractSource, self).init_importer()
        self.hypothesis_provider = IdentityProvider.get_by_type("Hypothesis")

    def read(self, admin_user_id=None):
        """Fetch annotations page by page and hand them to the importer.

        The search is sorted by ascending ``updated`` time and resumes after
        the most recent ``last_import_time`` recorded for this source. Each
        non-empty page is passed to ``read_data_gen`` and the paging cursor
        is advanced to the largest ``updated`` value of that page.

        :param admin_user_id: user id passed to ``read_data_gen``; defaults
            to the creator of the discussion.
        :raises ClientError: if no API key is configured.
        :raises ReaderError: if the API answers with a non-OK status.
        """
        self.init_importer()
        self.load_previous_records()
        if not self.api_key:
            raise ClientError("Missing the api key")
        admin_user_id = admin_user_id or self.discussion.creator_id
        headers = {"Authorization": "Bearer "+self.api_key}
        uri = self.source_uri + "search"
        (latest,) = self.db.query(
            func.max(ImportRecord.last_import_time)
        ).filter_by(source_id=self.id).first()
        params = {'sort': 'updated', 'order': 'asc'}
        if self.user:
            params['user'] = 'acct:' + self.user
        if self.group:
            params['group'] = self.group
        if self.tag:
            params['tag'] = self.tag
        if self.document_url:
            params['uri'] = self.document_url
        while True:
            if latest:
                # NOTE(review): appending "Z" assumes `latest` is a naive
                # datetime; confirm what parse_datetime returns.
                params['search_after'] = latest.isoformat()+"Z"
            # NOTE(review): no timeout is passed, so a stalled server would
            # block this call indefinitely.
            result = requests.get(uri, params=params, headers=headers)
            if not result.ok:
                raise ReaderError()
            rows = result.json().get('rows', None)
            if not rows:
                break
            latest = parse_datetime(max([x['updated'] for x in rows]))
            self.read_data_gen(rows, admin_user_id, True)
            self.db.flush()

    def id_from_data(self, data):
        """Return the string id of a row, or None if none can be found.

        :param data: either a dict carrying an ``id`` key or the id itself.
        """
        if isinstance(data, dict):
            data = data.get('id', None)
        if isinstance(data, string_types):
            return data
        return None

    def process_data(self, data):
        """Convert one annotation row into an ``Excerpt`` dictionary.

        As side effects, the annotated web page and the author's profile are
        looked up (or created) and cached on ``self`` under the page URI and
        the account id respectively.

        :param data: one element of the ``rows`` list returned by the API.
        :returns: the ``Excerpt`` dict, or None when the row has no target.
        """
        uri = data['uri']
        page = self[uri]
        if page is None:
            page = Webpage.get_instance(uri, self.discussion_id)
            self[uri] = page
        name = data.get("user_info", {}).get("display_name", None)
        # Fall back to [None] for a missing, null or empty title list so
        # that indexing cannot fail.
        titles = data.get('document', {}).get('title') or [None]
        doc_title = titles[0]
        if not page.subject:
            # A falsy but non-None subject is dropped before being replaced.
            if page.subject is not None:
                page.subject.delete()
            page.subject = LangString.create(doc_title)
        external_url = data.get('links', {}).get('incontext', None)
        account_id = data['user']
        account = self[account_id]
        if account is None:
            # Reuse an account already stored for this provider; the result
            # must be bound to `account` so the creation branch below is
            # skipped when a match exists.
            account = self.db.query(SocialAuthAccount).filter_by(
                uid=account_id,
                provider_id=self.hypothesis_provider.id).first()
            if account is None:
                profile = AgentProfile(name=name)
                account = SocialAuthAccount(
                    uid=account_id,
                    identity_provider=self.hypothesis_provider,
                    profile=profile,
                    verified=True)
                self.db.add(account)
                self.db.flush()
            self[account_id] = account.profile
        targets = data.get('target', [])
        if not targets:
            log.error("Empty targets in hypothesis")
            return
        if len(targets) > 1:
            log.warning("Multiple targets in hypothesis")
        # Only the first target is imported.
        target = targets[0]
        tselectors = target.get('selector', [])
        selectors = []
        for selector in tselectors:
            stype = selector["type"]
            if stype == "TextQuoteSelector":
                selectors.append({
                    "@type": "TextQuoteSelector",
                    "prefix": selector['prefix'],
                    "suffix": selector['suffix'],
                    "body": selector['exact'],
                })
            elif stype == "RangeSelector":
                selectors.append({
                    "@type": "RangeSelector",
                    "end": selector['endContainer'],
                    "endOffset": selector['endOffset'],
                    "start": selector['startContainer'],
                    "startOffset": selector['startOffset'],
                })
            elif stype == "TextPositionSelector":
                selectors.append({
                    "@type": "TextPositionSelector",
                    "end": selector['end'],
                    "start": selector['start'],
                })
            elif stype == "FragmentSelector":
                selectors.append({
                    "@type": "FragmentSelector",
                    "value": selector['value'],
                })
            # Selectors of any other type are ignored.
        return {
            "@type": "Excerpt",
            "idPost": uri,
            "idCreator": account_id,
            "external_url": external_url,
            "text": data.get('text', None),
            "ranges": selectors,
        }