import React, { memo, useState } from 'react';
import styled from 'styled-components';
import { Grid, TextField, MenuItem, Button } from '@material-ui/core';
import Collapse from '../../common/Collapse';
import Spinner from '../../common/loading/Spinner';
import {
  Paragraph,
  StyledButton,
  DemoContainer,
  LinkContainer,
} from '../../../styles/common';
import OpenInNewIcon from "@material-ui/icons/OpenInNew";
import SolutionContainer from '../../common/SolutionContainerWrapper';

// images
import sol1 from '../../../assets/images/app/genomicMetaDataSol1.webp';

const GenomicMetadataIntegration = () => {

  const [notification, setNotification] = useState('');
  

  return (
    <SolutionContainer snackbar={notification}>
      <Collapse text="Description">
        <Paragraph>
          <p>
            Public repositories of genomic datasets such as Gene Expression Omnibus, Sequence Read Archive and ArrayExpress have become a fundamental source of knowledge that helps the scientific community to accelerate biological investigations. The analysis of its rich data corpus (including gene expression, mutation profiles and chromatin configuration) is useful to provide new insights into understanding disease and protein evolution. In particular, GEO is one of the largest public repositories of genomic data with {'>'}4 million experimental samples that are growing at an exponential rate in recent years.
          </p>
          <p>
            This happened also thanks to next-generation sequencing technologies , which have greatly reduced the cost of genome sequencing. Each experimental sample contained in GEO is composed of two parts: the region data and its associated metadata. In order to classify, compare and find relevant information at scale from such a large amount of genomic data, it is essential to have a well-structured metadata content that uniquely specifies attributes such as tissue type, cell type, sex, age, disease and species.
          </p>
          <p>Unfortunately, GEO metadata lack structure because they are provided in the form of a textual description of the experiment. Such text cannot be easily processed, because each information piece contained in the description may be missing, misspelled or expressed using synonyms. This issue prevents researchers from completely exploiting the knowledge contained in GEO, as the meta-analysis and the integration of multiple genomic datasets are infeasible due to the lack of machine-readable standardized metadata. </p>

          <center>
            {' '}
            <img src={sol1} width="40%" alt="" />
          </center>
          <p>For this reason, annotating genomic datasets at scale is a challenging problem for bioinformaticians . Three approaches are usually employed to address this problem: manual curation, metadata inference directly from gene expression profiles (or other genomic signals) and automated natural language processing (NLP). In this work, we investigated the latter, focusing on applying the last developments of deep learning transformer-based NLP. In particular, we combined Generative Pre-trained Transformer 2 (GPT2) models for attribute extraction from metadata with the implementation of an Active Learning (AL) Framework and gradient-based deep learning interpretation technique. </p>
          <p>Recent breakthroughs in NLP with pretrained generative models as GPT2 allowed to build multitask learners using fewer data than classic supervised Machine Learning (ML) techniques. In our work, we used GPT2 to integrate the sets of attributes provided by the datasets generated from two important genomic data sources, namely Cistrome and the Encyclopedia of DNA Elements . However, the huge number and diversity of unlabeled samples in GEO still make it necessary to manually annotate new samples, as such data can be used to make a model able to learn dynamically over time and improve its accuracy.</p>

          <LinkContainer>
            <Grid container spacing={2}>
            <Grid item>
                <StyledButton
                  variant="outlined"
                  color="primary"
                  size="large"
                  startIcon={<OpenInNewIcon />}
                >
                  <a
                    href="https://github.com/armando2603/GeMI"
                    target="_blank"
                    rel="noreferrer"
                  >
                    Notebook
                  </a>
                </StyledButton>
              </Grid>

              <Grid item>
                <StyledButton
                  variant="outlined"
                  color="primary"
                  size="large"
                  startIcon={<OpenInNewIcon />}
                >
                  <a
                    href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9216561/"
                    target="_blank"
                    rel="noreferrer"
                  >
                    For more information
                  </a>
                </StyledButton>
              </Grid>
            </Grid>
          </LinkContainer>
        </Paragraph>
      </Collapse>
      <Collapse text="Demo">
        <DemoContainer>
          <center>
            <iframe src="https://apps.vlifevirtusa.com:6035" width="100%" height="650" />
          </center>
        </DemoContainer>
      </Collapse>
    </SolutionContainer>
  );
};

// Export the component wrapped in React.memo. The component declares no
// props, so a parent re-render gives memo no changed props to react to.
export default memo(GenomicMetadataIntegration);
