adsabs/biblib-service

View on GitHub
biblib/views/library_view.py

Summary

Maintainability
D
2 days
Test Coverage
"""
Library view
"""

from biblib.views import USER_ID_KEYWORD
from biblib.utils import err, check_boolean
from biblib.models import Library, Notes
from biblib.client import client
from biblib.views.base_view import BaseView 
from datetime import datetime
from flask import request, current_app
from flask_discoverer import advertise
from sqlalchemy.orm.attributes import flag_modified
from biblib.views.http_errors import SOLR_RESPONSE_MISMATCH_ERROR, \
    MISSING_LIBRARY_ERROR, MISSING_USERNAME_ERROR, BAD_LIBRARY_ID_ERROR, NO_PERMISSION_ERROR
from biblib.biblib_exceptions import BibcodeNotFoundError, DuplicateNoteError



class LibraryView(BaseView):
    """
    End point to interact with a specific library, only returns library content
    if the user has the correct privileges.

    The GET requests are separate from the POST, DELETE requests as this class
    must be scopeless, whereas the others will have scope.
    """
    # Pass the names of the two class attributes below to flask_discoverer's
    # `advertise` decorator.
    decorators = [advertise('scopes', 'rate_limit')]
    # No scopes are declared for this view (the class docstring requires it
    # to be scopeless).
    scopes = []
    # NOTE(review): presumably [max requests, window in seconds], i.e. 1000
    # requests per 24 hours -- confirm against the consumer of this attribute.
    rate_limit = [1000, 60*60*24]
    
    @classmethod
    def get_alternate_bibcodes(cls, solr_docs):
        """
        Gets all the alternate bibcodes from solr docs 
        :param solr_docs: solr docs from the bigquery response
    

        :return: dict of alternate bibcodes and their corresponding canonical bibcodes {alternate_bibcode: canonical_bibcode}
        """
        alternate_bibcodes = {} 
        for doc in solr_docs:
            canonical_bibcode = doc['bibcode']
            if doc.get('alternate_bibcode'):
                alternate_bibcodes.update(
                    {alternate_bibcode: canonical_bibcode for alternate_bibcode in doc['alternate_bibcode']}
                )
        return alternate_bibcodes
    
    @classmethod
    def update_notes(cls, session, library, updated_list):
        """
        Carry notes over from replaced bibcodes to their canonical bibcodes.

        For every note of the library whose bibcode is a key in updated_list,
        the note content is either stored in a new note created for the
        canonical bibcode, or appended (separated by one space) to the note
        that already exists for the canonical bibcode. The note attached to
        the replaced bibcode is not deleted here.

        :param session: session used for all the queries and commits
        :param library: library whose notes are processed
        :param updated_list: list of changed bibcodes [{'before': 'after'}]

        :return: list of note dictionaries (note.as_dict()) covering every
                 note that was matched, created or extended; each note is
                 listed at most once
        """
        library_notes = session.query(Notes).filter(Notes.library_id == library.id).all()

        # Collapse the list of {before: after} dicts into one lookup table;
        # a later entry overrides an earlier one with the same key.
        replacements = {}
        for change in updated_list:
            replacements.update(change)

        reported_ids = set()
        updated_notes = []

        def report(entry):
            # Record a note once, keyed by its id.
            if entry.id not in reported_ids:
                reported_ids.add(entry.id)
                updated_notes.append(entry.as_dict())

        for note in library_notes:
            if note.bibcode not in replacements:
                continue

            canonical_bibcode = replacements[note.bibcode]
            existing = session.query(Notes).filter(
                Notes.library_id == library.id,
                Notes.bibcode == canonical_bibcode
            ).one_or_none()
            report(note)

            if existing:
                # The canonical bibcode already has a note: append the content.
                existing.content = '{0} {1}'.format(existing.content, note.content)
                session.add(existing)
                session.commit()
                report(existing)
                continue

            # No note for the canonical bibcode yet: create one with the
            # same content.
            try:
                created = Notes.create_unique(session=session,
                                              content=note.content,
                                              bibcode=canonical_bibcode,
                                              library=library)
                session.add(created)
                session.commit()
                report(created)
            except (BibcodeNotFoundError, DuplicateNoteError) as error:
                current_app.logger.error('Error while creating new note with canonical bibcode {0}: {1}'
                                         .format(canonical_bibcode, error))
        return updated_notes

    @classmethod
    def update_library(cls, session, library):
        """
        Persist the library, including changes made to its ``bibcode`` column.

        Any exception raised while saving is logged as a warning and not
        re-raised. The session is rolled back in that case so that it can
        still be used by the caller afterwards.

        :param session: session used for the update
        :param library: library to persist

        :return: None
        """
        try:
            session.add(library)
            # Mark the column as modified explicitly so that it is part of
            # the UPDATE even when the stored mapping was edited in place.
            flag_modified(library, "bibcode")
            session.commit()
        except Exception as error:
            current_app.logger.warning('Could not update library: {0}'
                                       .format(error))
            # A failed flush/commit leaves the session in a state where
            # further queries fail until rollback() is called.
            session.rollback()
            
    @classmethod
    def solr_update_library(cls, library_id, solr_docs, session):
        """
        Updates the library based on the solr canonical bibcodes response.

        Alternate bibcodes stored in the library are replaced by their
        canonical bibcode. Entries without a "timestamp" receive the library
        creation time. When several stored bibcodes collapse into the same
        canonical bibcode only one entry is kept: the entry of the canonical
        bibcode itself if the library has one, otherwise the first alternate
        encountered.

        :param library_id: id of the library to update
        :param solr_docs: solr docs from the bigquery response
        :param session: session used to load and save the library

        :return: dictionary with details of files modified
                 num_updated: number of documents modified
                 duplicates_removed: number of files removed for duplication
                 update_list: list of changed bibcodes [{'before': 'after'}]
                 updated_notes: list of notes touched by the update
        """
        # Bibcodes of the library after canonicalisation
        new_library_bibcodes = {}

        # Output dictionary
        updates = dict(
            num_updated=0,
            duplicates_removed=0,
            update_list=[],
            updated_notes=[]
        )
        # Hashmap in which the alternate bibcode is the key and the
        # canonical bibcode is the value
        alternate_bibcodes = cls.get_alternate_bibcodes(solr_docs)

        library = session.query(Library).filter(Library.id == library_id).one()
        default_timestamp = datetime.timestamp(library.date_created)
        updated_timestamp = False
        for bibcode in library.bibcode:
            entry = library.bibcode[bibcode]

            if "timestamp" not in entry:
                entry["timestamp"] = default_timestamp
                updated_timestamp = True

            if bibcode in alternate_bibcodes:
                # Update if its an alternate
                canonical = alternate_bibcodes[bibcode]
                updates['num_updated'] += 1
                updates['update_list'].append({bibcode: canonical})

                # Only add the bibcode to the library if it is not there
                if canonical not in new_library_bibcodes:
                    new_library_bibcodes[canonical] = entry
                else:
                    updates['duplicates_removed'] += 1
            else:
                if bibcode in new_library_bibcodes:
                    # An alternate seen earlier already filled this slot.
                    # The canonical entry replaces it, so count the
                    # alternate as a removed duplicate; this keeps the count
                    # independent of the iteration order.
                    updates['duplicates_removed'] += 1
                new_library_bibcodes[bibcode] = entry

        if updates['update_list']:
            library.bibcode = new_library_bibcodes
            cls.update_library(session, library)
            updates['updated_notes'] = cls.update_notes(session, library, updates['update_list'])
        elif updated_timestamp:
            # Only timestamps were back-filled; save them.
            cls.update_library(session, library)

        return updates
        
    def load_parameters(self, request):
        """
        Loads parameters necessary for the Solr search.

        If 'start', 'rows', the rate limit header or 'raw' cannot be parsed,
        start, rows and raw_library all fall back to their defaults
        (0, 20, False). Negative start/rows values are clamped to 0 because
        they would otherwise be used as negative list slice bounds further
        down the line.

        :param request: request object

        :return: tuple (start, rows, sort, fl, raw_library, add_sort)
                 start: int representing the start of the pagination
                 rows: int representing the number of rows to be returned,
                       capped at BIBLIB_MAX_ROWS (default 100) multiplied by
                       the X-Adsws-Ratelimit-Level header (default 1.0)
                 sort: str with the sort order forwarded to Solr
                 fl: str representing the field to be returned
                 raw_library: boolean parsed from the 'raw' argument
                 add_sort: 'asc' or 'desc' when the user asked for
                           'time asc'/'time desc', None otherwise
        """
        try:
            start = int(request.args.get('start', 0))
            max_rows = current_app.config.get('BIBLIB_MAX_ROWS', 100)
            max_rows *= float(
                request.headers.get('X-Adsws-Ratelimit-Level', 1.0)
            )
            max_rows = int(max_rows)
            rows = min(int(request.args.get('rows', 20)), max_rows)
            raw_library = check_boolean(request.args.get('raw', 'false'))

        except (ValueError, OverflowError):
            # OverflowError: int() of an infinite rate limit level
            current_app.logger.debug("Raised value error")
            start = 0
            rows = 20
            raw_library = False

        # Negative offsets/counts are never meaningful for pagination
        start = max(start, 0)
        rows = max(rows, 0)

        sort = request.args.get('sort', 'date desc')
        add_sort = None
        # timestamp sorting is handled in biblib so we need to change the sort to something SOLR understands.
        if sort in ('time asc', 'time desc'):
            current_app.logger.debug("sort order is set to {}".format(sort))
            add_sort = 'desc' if sort == 'time desc' else 'asc'
            sort = 'date desc'

        fl = request.args.get('fl', 'bibcode')
        current_app.logger.info('User gave pagination parameters:'
                                'start: {}, '
                                'rows: {}, '
                                'sort: "{}", '
                                'fl: "{}", '
                                'raw: "{}"'.format(start, rows, sort, fl, raw_library))
        return start, rows, sort, fl, raw_library, add_sort
    
        
    def has_read_access(self, service_uid, library):
        """
        Checks if the user has read access 
        :param service_uid: user service id 
        :param library: 
        

        :return: uid: if it exists and None otherwise
        """
        if not self.read_access(service_uid=service_uid,
                                library_id=library.id):
            current_app.logger.error(
                'User: {0} does not have access to library: {1}. DENIED'
                .format(service_uid, library.id)
            ) 
            return False 
        return True
    
    def process_solr(self, library, start, rows, sort, fl, session, add_sort):
        """
        Processes the request to solr big query and, when solr answers with
        documents, updates the stored library from that answer.

        :param library: library object whose bibcodes are sent to solr
        :param start: <int> used to delimit the start of pagination
        :param rows: <int> number of rows requested
        :param sort: <str> sort order forwarded to solr
        :param fl: <str> fields requested from solr, usually 'bibcode'
        :param session: session used for the library queries/updates
        :param add_sort: 'asc', 'desc' or None; when set, documents are
                         ordered by the timestamp stored in the library

        :return: solr: <dict> the solr response, or the body of
                       SOLR_RESPONSE_MISMATCH_ERROR when solr returned
                       nothing usable
                 updates: <dictionary> result of solr_update_library, or {}
                          when solr returned nothing usable
                 documents: <list> bibcodes to show to the user
        """
        try:
            solr = self.process_solr_big_query(
                bibcodes=library.bibcode,
                start=start,
                rows=rows,
                sort=sort,
                fl=fl
            ).json()
        except Exception as error:
            current_app.logger.warning('Could not parse solr data: {0}'
                                       .format(error))
            solr = {'error': 'Could not parse solr data'}

        reverse = add_sort == 'desc'
        # Now check if we can update the library database based on the
        # returned canonical bibcodes
        if solr.get('response'):
            # Update bibcodes based on solr's response
            updates = self.solr_update_library(
                library_id=library.id,
                solr_docs=solr['response']['docs'],
                session=session
            )
            if add_sort:
                solr = self.timestamp_sort(solr, library, reverse=reverse)

            documents = [doc['bibcode'] for doc in solr['response']['docs']]
        else:
            # Some problem occurred, we will just ignore it, but will
            # definitely log it. Log what solr actually returned before it
            # is replaced by the canned error body.
            current_app.logger.warning('Problem with solr response: {0}'
                                       .format(solr))
            solr = SOLR_RESPONSE_MISMATCH_ERROR['body']
            updates = {}
            if add_sort is not None:
                # Find the specified library (we have to do this to have full access to the library)
                temp_library = session.query(Library).filter_by(id=library.id).one()

                def stored_timestamp(bibcode):
                    # Timestamps are only back-filled by solr_update_library,
                    # which did not run on this path; fall back to the
                    # library creation time, the same default used there.
                    entry = temp_library.bibcode[bibcode]
                    if "timestamp" in entry:
                        return entry["timestamp"]
                    return datetime.timestamp(temp_library.date_created)

                sortable_list = [(bibcode, stored_timestamp(bibcode))
                                 for bibcode in temp_library.get_bibcodes()]
                sortable_list.sort(key=lambda stamped: stamped[1], reverse=reverse)
                documents = [stamped[0] for stamped in sortable_list]
            else:
                documents = library.get_bibcodes()
                documents.sort()
            documents = documents[start:start+rows]
        return solr, updates, documents
    
    def process_raw_library(self, user, library, start, rows):
        """
        Build the answer for a request that asked for the raw library only,
        i.e. without contacting solr.

        :param user: user ID, only used for logging
        :param library: library object providing get_bibcodes()
        :param start: int, index of the first bibcode of the page
        :param rows: int, number of bibcodes in the page

        :return: solr: str, fixed message stating that only the raw library
                       was requested
                 updates: empty dictionary
                 documents: list with the sorted bibcodes of the library from
                            start to start + rows
        """
        current_app.logger.info('User: {0} requested only raw library output'
                                .format(user))

        bibcodes = library.get_bibcodes()
        bibcodes.sort()
        page_end = start + rows

        return 'Only the raw library was requested.', {}, bibcodes[start:page_end]
    
    @classmethod
    def get_notes_from_library(cls, library, session):
        """
        Get all notes (including orphan notes) from the library.

        A note is an orphan when its bibcode is not among the bibcodes
        returned by library.get_bibcodes(). If several notes share a bibcode,
        the last one returned by the query is the one reported.

        :param library: library object whose notes are requested
        :param session: current session necessary to get notes

        :return: dict of the form
                 {'notes': {bibcode: note_dict},
                  'orphan_notes': {bibcode: note_dict}}
                 where note_dict is note.as_dict(); both inner dicts are
                 empty when the library has no notes
        """
        # Get all notes from library
        notes = session.query(Notes).filter(Notes.library_id == library.id).all()

        # Built once: the set of bibcodes does not change while looping
        library_bibcodes = set(library.get_bibcodes())

        # A note is valid if its bibcode is in the library; if it's not
        # we're looking at an orphan note.
        response = {'notes': {}, 'orphan_notes': {}}
        for note in notes:
            bucket = 'notes' if note.bibcode in library_bibcodes else 'orphan_notes'
            response[bucket][note.bibcode] = note.as_dict()
        return response

    def get_library_data(self, data):
        """
        Processes the get request for the library and assembles a response.

        :param data: <dict> with the keys user, service_uid, library_id,
                     start, rows, sort, fl, raw_library, notes and add_sort

        :return: tuple of three elements
                 - on success: (library, response, None)
                 - if processing the solr request raised:
                   (library_id, None, err(MISSING_LIBRARY_ERROR))

        Content of response:
        --------------------
        documents:    <list>   Currently, a list containing the bibcodes.
        solr:         <dict>   The response from the solr bigquery end point
                               (a plain message when the raw library was
                               requested)
        metadata:     <dict>   contains the following:

          name:                 <string>  Name of the library
          id:                   <string>  ID of the library
          description:          <string>  Description of the library
          num_documents:        <int>     Number of documents in the library
          date_created:         <string>  ISO date library was created
          date_last_modified:   <string>  ISO date library was last modified
          permission:           <string>  Permission type, can be: 'read',
                                          'write', 'admin', or 'owner'
          public:               <boolean> True means it is public
          num_users:            <int>     Number of users with permissions to
                                          this library
          owner:                <string>  Identifier of the user who created
                                          the library

        updates:      <dict>   empty for raw requests, otherwise contains:

          num_updated:          <int>     Number of documents modified based on
                                          the response from solr
          duplicates_removed:   <int>     Number of files removed because
                                          they are duplications
          update_list:          <list>[<dict>]
                                          Each element maps the original
                                          bibcode (key) to the updated bibcode
                                          now being stored (value)
          updated_notes:        <list>    Notes that were updated
        library_notes: <dict>  Only present when notes were requested and at
                               least one note or orphan note exists
        """
        with current_app.session_scope() as session:
            library, metadata = BaseView.get_library_and_metadata(
                library_id=data["library_id"],
                service_uid=data["service_uid"],
                session=session
            )

            if not data["raw_library"]:
                try:
                    solr, updates, documents = self.process_solr(
                        library,
                        data["start"],
                        data["rows"],
                        data["sort"],
                        data["fl"],
                        session,
                        data["add_sort"]
                    )
                except Exception as error:
                    current_app.logger.warning(
                        'Library missing or solr endpoint failed: {0}'
                        .format(error)
                    )
                    return data["library_id"], None, err(MISSING_LIBRARY_ERROR)
            else:
                solr, updates, documents = self.process_raw_library(
                    data["user"],
                    library,
                    data["start"],
                    data["rows"]
                )

            if data["notes"]:
                library_notes = self.get_notes_from_library(library, session)
            else:
                library_notes = {}

            # Make the response dictionary
            response = {
                'documents': documents,
                'solr': solr,
                'metadata': metadata,
                'updates': updates,
            }

            # Notes are only attached when there is something to show
            if library_notes:
                if library_notes.get('notes', {}) or library_notes.get('orphan_notes', {}):
                    response['library_notes'] = library_notes

            return library, response, None

            
            
    @staticmethod
    def timestamp_sort(solr, library, reverse=False):
        """
        Take a solr response and sort it based on the timestamps contained in the library
        :input: response: response from SOLR bigquery
        :input: library: The original library
        :input: reverse: returns library by `time desc` if true, `time asc` otherwise.
        
        :return: response: SOLR response sorted by when each item was added.
        """
        if "error" not in solr['response'].keys():
            try:
                #First we generate a list of timestamps for the valid bibcodes
                timestamp = [library.bibcode[doc['bibcode']]['timestamp'] for doc in solr['response']['docs']]
                #Then we sort the SOLR response by the generated timestamp list
                solr['response']['docs'] = [\
                        doc for (doc, timestamp) in sorted(zip(solr['response']['docs'], timestamp), reverse=reverse, key = lambda stamped: stamped[1])\
                    ]
            except Exception as e:
                current_app.logger.warn("Failed to retrieve timestamps for {} with exception: {}. Returning default sorting.".format(library.id, e))
        else:
            current_app.logger.warn("SOLR bigquery returned status code {}. Stopping.".format(solr['response'].status_code))

        return solr

    # Methods
    def get(self, library):
        """
        HTTP GET request that returns all the documents inside a given
        user's library

        :param library: library slug

        Query string arguments (all optional):
        --------------------------------------
        start: int, start of pagination
        rows: int, how many rows should be fetched
        sort: enum, type of sort
        fl: list<str>, set of fields to return
        raw: bool, parsed by load_parameters and passed on as 'raw_library'
        notes: bool, True if notes should be returned (defaults to True)

        :return: (response, 200) when the library can be shown to the user,
                 otherwise the value built by err(...) for the error that
                 was hit (missing user header, bad library id, missing
                 library, failure while assembling the data, no permission)

        Header:
        -------
        Must contain the API forwarded user ID of the user accessing the end
        point

        Post body:
        ----------
        No post content accepted.

        Return data:
        -----------
        documents:    <list>   Currently, a list containing the bibcodes.
        solr:         <dict>   The response from the solr bigquery end point
        metadata:     <dict>   contains the following:

          name:                 <string>  Name of the library
          id:                   <string>  ID of the library
          description:          <string>  Description of the library
          num_documents:        <int>     Number of documents in the library
          date_created:         <string>  ISO date library was created
          date_last_modified:   <string>  ISO date library was last modified
          permission:           <string>  Permission type, can be: 'read',
                                          'write', 'admin', or 'owner'
          public:               <boolean> True means it is public
          num_users:            <int>     Number of users with permissions to
                                          this library
          owner:                <string>  Identifier of the user who created
                                          the library

        updates:      <dict>   contains the following

          num_updated:          <int>     Number of documents modified based on
                                          the response from solr
          duplicates_removed:   <int>     Number of files removed because
                                          they are duplications
          update_list:          <list>[<dict>]
                                          List of dictionaries such that a
                                          single element describes the original
                                          bibcode (key) and the updated bibcode
                                          now being stored (item)
          updated_notes:        <list>  A list of all the notes that were updated
        library_notes:        <dict>    Dictionary of library notes, including orphan
                                        notes (those not associated with a bibcode in the library)

        Permissions:
        -----------
        The following type of user can read a library:
          - owner
          - admin
          - write
          - read

        Default Pagination Values:
        -----------
        - start: 0
        - rows: 20 (max 100)
        - sort: 'date desc'
        - fl: 'bibcode'

        Additional Pagination options:
        ------------
        - sort:
            - "time asc" sort by time added to library, with the least recently added documents being listed first.
            - "time desc" sort by time added to library, with the most recently added documents being listed first.

        """

        # If set to True, return notes in library
        notes = request.args.get('notes', type=check_boolean, default=True)

        # Get user
        try:
            user = self.helper_get_user_id()
        except KeyError:
            return err(MISSING_USERNAME_ERROR)

        # Get library id
        current_app.logger.info('User: {0} requested library: {1}'
                                .format(user, library))
        try:
            # From here on `library` holds the converted id, not the slug
            library = self.helper_slug_to_uuid(library)
        except TypeError:
            return err(BAD_LIBRARY_ID_ERROR)

        if not self.helper_library_exists(library):

            return err(MISSING_LIBRARY_ERROR)

        # Get user id for service
        service_uid = self.helper_absolute_uid_to_service_uid(absolute_uid=user)

        # Parameters to be forwarded to Solr: pagination, and fields
        start, rows, sort, fl, raw_library, add_sort = self.load_parameters(request)

        # Data needed to process the library request
        data = {"user": user,
                "service_uid": service_uid,
                "library_id": library,
                "start": start,
                "rows": rows,
                "sort": sort,
                "fl": fl,
                "raw_library": raw_library,
                "notes": notes,
                "add_sort": add_sort}

        # NOTE(review): the library data is assembled here, before the
        # permission checks below. For non-raw requests this path goes through
        # process_solr, which can update the stored bibcodes and notes --
        # confirm that running it ahead of the access checks is intended.
        # On success `library` is rebound to the library object returned by
        # get_library_data; on failure the error value is returned as is.
        library, response, solr_error = self.get_library_data(data)
        if solr_error:
            return solr_error

        # Skip any more logic if the library is public or the exception token is present
        if self.helper_is_library_public_or_has_special_token(library, request):
            current_app.logger.info('Library: {0} is public'
                                    .format(library))
            return response, 200

        current_app.logger.info('Library: {0} is private'.format(library))

        # If user does not exist they don't have access to this private library
        if not self.helper_user_exists(user):
            current_app.logger.error(
                'User: {0} does not exist in the database. '
                'Therefore will not have extra privileges to view the library: {1}'
                .format(user, library.id)
            )
            return err(NO_PERMISSION_ERROR)

        # Check if the user has read access to this private library
        # NOTE(review): helper_check_user_has_read_access is not defined in
        # this file (this class defines has_read_access instead); presumably
        # it is inherited from BaseView -- verify.
        if not self.helper_check_user_has_read_access(service_uid, library):
            return err(NO_PERMISSION_ERROR)

        # If they have access, let them obtain the requested content
        current_app.logger.info('User: {0} has access to library: {1}. '
                                'ALLOWED'
                                .format(user, library))
        return response, 200