publication/documents_citations.py

-Original file line number
+Diff line change
@@ -0,0 +1,242 @@
+    # coding: utf-8
+    """
+    Este processamento gera uma tabulação de citações de cada artigo da
+    coleção SciELO.
+    """
+    import argparse
+    import logging
+    import codecs
+    import datetime
+    import utils
+    import choices
+    logger = logging.getLogger(__name__)
+    def _config_logging(logging_level='INFO', logging_file=None):
+        """
+        Configure the module-level logger and return it.
+        :param logging_level: level name ('DEBUG', 'INFO', 'WARNING', 'ERROR'
+            or 'CRITICAL'); any other value falls back to INFO.
+        :param logging_file: path of a file that receives the log records in
+            append mode; when falsy a ``logging.StreamHandler`` is used.
+        :returns: the configured module-level ``logger``.
+        """
+        allowed_levels = {
+            'DEBUG': logging.DEBUG,
+            'INFO': logging.INFO,
+            'WARNING': logging.WARNING,
+            'ERROR': logging.ERROR,
+            'CRITICAL': logging.CRITICAL
+        }
+        # Resolve the level once and fall back to the numeric constant, so the
+        # logger and the handler always receive the same kind of value as the
+        # entries of the mapping above.
+        level = allowed_levels.get(logging_level, logging.INFO)
+        logger.setLevel(level)
+        if logging_file:
+            handler = logging.FileHandler(logging_file, mode='a')
+        else:
+            handler = logging.StreamHandler()
+        handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        )
+        handler.setLevel(level)
+        logger.addHandler(handler)
+        return logger
+    class Dumper(object):
+        """
+        Export one CSV row per citation of each document of a collection.
+        Rows are written to ``output_file`` when a path is given, otherwise
+        they are printed UTF-8 encoded.
+        """
+        # Column names of the CSV, in output order.
+        _HEADER = (
+            u"pid",
+            u"scielo_issn",
+            u"publication_year",
+            u"volume",
+            u"source",
+            u"doi",
+            u"publication_type",
+            u"number_or_suppl",
+            u"reference_pid",
+            u"part_title",
+        )
+        def __init__(self, collection, issns=None, output_file=None):
+            """
+            Open the output (if any) and write the header row immediately.
+            :param collection: collection acronym used when querying documents.
+            :param issns: optional list of ISSNs used to restrict the query.
+            :param output_file: optional path of the CSV file to create.
+            """
+            self._ratchet = utils.ratchet_server()
+            self._articlemeta = utils.articlemeta_server()
+            self.collection = collection
+            self.issns = issns
+            self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file
+            # Same quoting rules as every data row.
+            self.write(self.join_line(self._HEADER))
+        def write(self, lines):
+            """
+            Write a single unicode line or an iterable of lines.
+            Each line is terminated with CRLF in the output file, or printed
+            UTF-8 encoded when no output file was given.
+            """
+            if isinstance(lines, unicode):
+                lines = [lines]
+            for line in lines:
+                if self.output_file:
+                    self.output_file.write(u'%s\r\n' % line)
+                else:
+                    print(line.encode('utf-8'))
+        def close(self):
+            """Close the output file, if one was opened."""
+            if self.output_file:
+                self.output_file.close()
+        def run(self):
+            """Export every row produced by ``items`` and close the output."""
+            try:
+                for item in self.items():
+                    self.write(item)
+            finally:
+                # Release the file handle even when the export fails midway.
+                self.close()
+            logger.info('Export finished')
+        def items(self):
+            """
+            Yield the CSV rows of every document returned by the ArticleMeta
+            client for the collection, one query per requested ISSN.
+            """
+            # A single ``None`` ISSN is used when no ISSN was requested; the
+            # instance attribute is left untouched.
+            issns = self.issns or [None]
+            for issn in issns:
+                for data in self._articlemeta.documents(collection=self.collection, issn=issn):
+                    logger.debug('Reading document: %s', data.publisher_id)
+                    for item in self.fmt_csv(data):
+                        yield item
+        def join_line(self, line):
+            """
+            Return the values of ``line`` as one CSV row (unicode).
+            Every value is wrapped in double quotes with embedded quotes
+            doubled; ``None`` becomes an empty cell and non-string values are
+            converted to text instead of raising.
+            """
+            cells = []
+            for value in line:
+                text = u'' if value is None else u'%s' % (value,)
+                cells.append(u'"%s"' % text.replace(u'"', u'""'))
+            return u','.join(cells)
+        @staticmethod
+        def _subfield(record, field, key='_'):
+            """Return ``record[field][0][key]``, or u'' when any level is missing."""
+            try:
+                value = record[field][0][key]
+            except (KeyError, IndexError, TypeError):
+                return u''
+            return value or u''
+        def _publication_year(self, citation):
+            """Return the first four characters of publication_date, else v64."""
+            year = u''
+            if citation.publication_date:
+                try:
+                    year = u'%s' % (citation.publication_date[0:4],)
+                except (TypeError, ValueError, AttributeError):
+                    # Best effort: an unusable date falls through to v64.
+                    year = u''
+            return year or self._subfield(citation.data, 'v64')
+        def _number_or_suppl(self, citation):
+            """Return the citation issue, else subfield ``n`` or ``s`` of v882."""
+            if citation.issue:
+                return citation.issue
+            return (
+                self._subfield(citation.data, 'v882', 'n') or
+                self._subfield(citation.data, 'v882', 's')
+            )
+        def fmt_csv(self, data):
+            """
+            Format citation data into CSV rows.
+            Columns:
+            - pid: Document PID
+            - scielo_issn: SciELO ISSN from the document's journal
+            - publication_year: Citation publication year
+            - volume: Citation volume
+            - source: Citation source (journal title, book title, thesis title, etc.)
+            - doi: Citation DOI
+            - publication_type: Citation publication type
+            - number_or_suppl: Citation issue/number or supplement
+            - reference_pid: Citation PID (v880)
+            - part_title: Citation article_title or chapter_title
+            Yields one row per citation, or a single row with only the two
+            document columns filled when the document has no citations.
+            """
+            pid = data.publisher_id or u''
+            issn = data.journal.scielo_issn or u''
+            if not data.citations:
+                # Two document columns followed by eight empty citation columns.
+                yield self.join_line([pid, issn] + [u''] * 8)
+                return
+            for citation in data.citations:
+                yield self.join_line([
+                    pid,
+                    issn,
+                    self._publication_year(citation),
+                    citation.volume or u'',
+                    citation.source or u'',
+                    citation.doi or u'',
+                    citation.publication_type or u'',
+                    self._number_or_suppl(citation),
+                    self._subfield(citation.data, 'v880'),
+                    citation.article_title or citation.chapter_title or u'',
+                ])
+    def main():
+        """
+        Command-line entry point: parse the arguments, configure logging and
+        run a ``Dumper`` with the given collection, ISSNs and output file.
+        """
+        cli = argparse.ArgumentParser(description='Dump citations distribution by article')
+        cli.add_argument('issns', nargs='*', help='ISSN\'s separated by spaces')
+        cli.add_argument('--collection', '-c', help='Collection Acronym')
+        cli.add_argument('--output_file', '-r', help='File to receive the dumped data')
+        cli.add_argument('--logging_file', '-o', help='Full path to the log file')
+        cli.add_argument(
+            '--logging_level',
+            '-l',
+            default='DEBUG',
+            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+            help='Logging level'
+        )
+        options = cli.parse_args()
+        _config_logging(options.logging_level, options.logging_file)
+        logger.info('Dumping data for: %s' % options.collection)
+        # Without positional ISSNs the Dumper receives None.
+        selected = utils.ckeck_given_issns(options.issns) if options.issns else None
+        Dumper(options.collection, selected, options.output_file).run()

publication/dumper.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -13,7 +13,8 @@ @@
         documents_languages,
         documents_licenses,
         documents_authors,
-        documents_dates
+        documents_dates,
+        documents_citations
     )
@@ Expand Down Expand Up @@
             self.documents_licenses = documents_licenses.Dumper(collection, output_file='documents_licenses.csv')
             self.documents_authors = documents_authors.Dumper(collection, output_file='documents_authors.csv')
             self.documents_dates = documents_dates.Dumper(collection, output_file='documents_dates.csv')
+            self.documents_citations = documents_citations.Dumper(collection, output_file='documents_citations.csv')
             if self.home_nationality:
                 self.documents_affiliations_nationality = documents_affiliations_nationality.Dumper(home_nationality, collection, output_file='documents_affiliation_nationality.csv')
@@ Expand All / @@ -79,6 +81,7 @@ def run(self): @@
                     self.documents_licenses.write(self.documents_licenses.fmt_csv(data))
                     self.documents_authors.write(self.documents_authors.fmt_csv(data))
                     self.documents_dates.write(self.documents_dates.fmt_csv(data))
+                    self.documents_citations.write(self.documents_citations.fmt_csv(data))
                     if self.home_nationality:
                         self.documents_affiliations_nationality.write(self.documents_affiliations_nationality.fmt_csv(data))
@@ Expand Down @@

Add documents_citations.csv export for tabs_collection.zip #89

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

Copilot wants to merge 3 commits into master from copilot/create-documents-citations-csv

+246 −1

-Original file line number
+Diff line change
@@ -0,0 +1,242 @@
+    # coding: utf-8
+    """
+    Este processamento gera uma tabulação de citações de cada artigo da
+    coleção SciELO.
+    """
+    import argparse
+    import logging
+    import codecs
+    import datetime
+    import utils
+    import choices
+    logger = logging.getLogger(__name__)
+    def _config_logging(logging_level='INFO', logging_file=None):
+        """
+        Configure the module-level logger and return it.
+        :param logging_level: level name ('DEBUG', 'INFO', 'WARNING', 'ERROR'
+            or 'CRITICAL'); any other value falls back to INFO.
+        :param logging_file: path of a file that receives the log records in
+            append mode; when falsy a ``logging.StreamHandler`` is used.
+        :returns: the configured module-level ``logger``.
+        """
+        allowed_levels = {
+            'DEBUG': logging.DEBUG,
+            'INFO': logging.INFO,
+            'WARNING': logging.WARNING,
+            'ERROR': logging.ERROR,
+            'CRITICAL': logging.CRITICAL
+        }
+        # Resolve the level once and fall back to the numeric constant, so the
+        # logger and the handler always receive the same kind of value as the
+        # entries of the mapping above.
+        level = allowed_levels.get(logging_level, logging.INFO)
+        logger.setLevel(level)
+        if logging_file:
+            handler = logging.FileHandler(logging_file, mode='a')
+        else:
+            handler = logging.StreamHandler()
+        handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        )
+        handler.setLevel(level)
+        logger.addHandler(handler)
+        return logger
+    class Dumper(object):
+        """
+        Export one CSV row per citation of each document of a collection.
+        Rows are written to ``output_file`` when a path is given, otherwise
+        they are printed UTF-8 encoded.
+        """
+        # Column names of the CSV, in output order.
+        _HEADER = (
+            u"pid",
+            u"scielo_issn",
+            u"publication_year",
+            u"volume",
+            u"source",
+            u"doi",
+            u"publication_type",
+            u"number_or_suppl",
+            u"reference_pid",
+            u"part_title",
+        )
+        def __init__(self, collection, issns=None, output_file=None):
+            """
+            Open the output (if any) and write the header row immediately.
+            :param collection: collection acronym used when querying documents.
+            :param issns: optional list of ISSNs used to restrict the query.
+            :param output_file: optional path of the CSV file to create.
+            """
+            self._ratchet = utils.ratchet_server()
+            self._articlemeta = utils.articlemeta_server()
+            self.collection = collection
+            self.issns = issns
+            self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file
+            # Same quoting rules as every data row.
+            self.write(self.join_line(self._HEADER))
+        def write(self, lines):
+            """
+            Write a single unicode line or an iterable of lines.
+            Each line is terminated with CRLF in the output file, or printed
+            UTF-8 encoded when no output file was given.
+            """
+            if isinstance(lines, unicode):
+                lines = [lines]
+            for line in lines:
+                if self.output_file:
+                    self.output_file.write(u'%s\r\n' % line)
+                else:
+                    print(line.encode('utf-8'))
+        def close(self):
+            """Close the output file, if one was opened."""
+            if self.output_file:
+                self.output_file.close()
+        def run(self):
+            """Export every row produced by ``items`` and close the output."""
+            try:
+                for item in self.items():
+                    self.write(item)
+            finally:
+                # Release the file handle even when the export fails midway.
+                self.close()
+            logger.info('Export finished')
+        def items(self):
+            """
+            Yield the CSV rows of every document returned by the ArticleMeta
+            client for the collection, one query per requested ISSN.
+            """
+            # A single ``None`` ISSN is used when no ISSN was requested; the
+            # instance attribute is left untouched.
+            issns = self.issns or [None]
+            for issn in issns:
+                for data in self._articlemeta.documents(collection=self.collection, issn=issn):
+                    logger.debug('Reading document: %s', data.publisher_id)
+                    for item in self.fmt_csv(data):
+                        yield item
+        def join_line(self, line):
+            """
+            Return the values of ``line`` as one CSV row (unicode).
+            Every value is wrapped in double quotes with embedded quotes
+            doubled; ``None`` becomes an empty cell and non-string values are
+            converted to text instead of raising.
+            """
+            cells = []
+            for value in line:
+                text = u'' if value is None else u'%s' % (value,)
+                cells.append(u'"%s"' % text.replace(u'"', u'""'))
+            return u','.join(cells)
+        @staticmethod
+        def _subfield(record, field, key='_'):
+            """Return ``record[field][0][key]``, or u'' when any level is missing."""
+            try:
+                value = record[field][0][key]
+            except (KeyError, IndexError, TypeError):
+                return u''
+            return value or u''
+        def _publication_year(self, citation):
+            """Return the first four characters of publication_date, else v64."""
+            year = u''
+            if citation.publication_date:
+                try:
+                    year = u'%s' % (citation.publication_date[0:4],)
+                except (TypeError, ValueError, AttributeError):
+                    # Best effort: an unusable date falls through to v64.
+                    year = u''
+            return year or self._subfield(citation.data, 'v64')
+        def _number_or_suppl(self, citation):
+            """Return the citation issue, else subfield ``n`` or ``s`` of v882."""
+            if citation.issue:
+                return citation.issue
+            return (
+                self._subfield(citation.data, 'v882', 'n') or
+                self._subfield(citation.data, 'v882', 's')
+            )
+        def fmt_csv(self, data):
+            """
+            Format citation data into CSV rows.
+            Columns:
+            - pid: Document PID
+            - scielo_issn: SciELO ISSN from the document's journal
+            - publication_year: Citation publication year
+            - volume: Citation volume
+            - source: Citation source (journal title, book title, thesis title, etc.)
+            - doi: Citation DOI
+            - publication_type: Citation publication type
+            - number_or_suppl: Citation issue/number or supplement
+            - reference_pid: Citation PID (v880)
+            - part_title: Citation article_title or chapter_title
+            Yields one row per citation, or a single row with only the two
+            document columns filled when the document has no citations.
+            """
+            pid = data.publisher_id or u''
+            issn = data.journal.scielo_issn or u''
+            if not data.citations:
+                # Two document columns followed by eight empty citation columns.
+                yield self.join_line([pid, issn] + [u''] * 8)
+                return
+            for citation in data.citations:
+                yield self.join_line([
+                    pid,
+                    issn,
+                    self._publication_year(citation),
+                    citation.volume or u'',
+                    citation.source or u'',
+                    citation.doi or u'',
+                    citation.publication_type or u'',
+                    self._number_or_suppl(citation),
+                    self._subfield(citation.data, 'v880'),
+                    citation.article_title or citation.chapter_title or u'',
+                ])
+    def main():
+        """
+        Command-line entry point: parse the arguments, configure logging and
+        run a ``Dumper`` with the given collection, ISSNs and output file.
+        """
+        cli = argparse.ArgumentParser(description='Dump citations distribution by article')
+        cli.add_argument('issns', nargs='*', help='ISSN\'s separated by spaces')
+        cli.add_argument('--collection', '-c', help='Collection Acronym')
+        cli.add_argument('--output_file', '-r', help='File to receive the dumped data')
+        cli.add_argument('--logging_file', '-o', help='Full path to the log file')
+        cli.add_argument(
+            '--logging_level',
+            '-l',
+            default='DEBUG',
+            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+            help='Logging level'
+        )
+        options = cli.parse_args()
+        _config_logging(options.logging_level, options.logging_file)
+        logger.info('Dumping data for: %s' % options.collection)
+        # Without positional ISSNs the Dumper receives None.
+        selected = utils.ckeck_given_issns(options.issns) if options.issns else None
+        Dumper(options.collection, selected, options.output_file).run()

-Original file line number
+Diff line change
@@ Expand Up / @@ -13,7 +13,8 @@ @@
         documents_languages,
         documents_licenses,
         documents_authors,
-        documents_dates
+        documents_dates,
+        documents_citations
     )
@@ Expand Down Expand Up @@
             self.documents_licenses = documents_licenses.Dumper(collection, output_file='documents_licenses.csv')
             self.documents_authors = documents_authors.Dumper(collection, output_file='documents_authors.csv')
             self.documents_dates = documents_dates.Dumper(collection, output_file='documents_dates.csv')
+            self.documents_citations = documents_citations.Dumper(collection, output_file='documents_citations.csv')
             if self.home_nationality:
                 self.documents_affiliations_nationality = documents_affiliations_nationality.Dumper(home_nationality, collection, output_file='documents_affiliation_nationality.csv')
@@ Expand All / @@ -79,6 +81,7 @@ def run(self): @@
                     self.documents_licenses.write(self.documents_licenses.fmt_csv(data))
                     self.documents_authors.write(self.documents_authors.fmt_csv(data))
                     self.documents_dates.write(self.documents_dates.fmt_csv(data))
+                    self.documents_citations.write(self.documents_citations.fmt_csv(data))
                     if self.home_nationality:
                         self.documents_affiliations_nationality.write(self.documents_affiliations_nationality.fmt_csv(data))
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add documents_citations.csv export for tabs_collection.zip #89

Diff view

Diff view

There are no files selected for viewing

Add documents_citations.csv export for tabs_collection.zip #89

Are you sure you want to change the base?

Add documents_citations.csv export for tabs_collection.zip #89

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing