diff --git a/.circleci/config.yml b/.circleci/config.yml index cdffb5f1ddd..81ca8c7d962 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: - run: name: "Install yarn at specific version" command: - sudo npm install --global yarn@1.13.0 + sudo npm install --global yarn@1.21.1 - run: name: "Show yarn and node versions" command: | @@ -106,9 +106,10 @@ jobs: cd $TEST_HOME/local/runtime-config && \ ./setup_environment.sh && ./setup_environment.sh >> $BASH_ENV - run: - name: Generate checksum of data that populates the test database + name: Build image and generate checksum of data that populates the test database command: | $TEST_HOME/local/runtime-config/db_content_fingerprint.sh > /tmp/db_data_md5key + no_output_timeout: 1h - restore_cache: keys: - v4-e2e-database-files-{{ checksum "/tmp/db_data_md5key" }} diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bbcc863454d..5f3d953a9d8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,7 +12,7 @@ Describe changes proposed in this pull request: # Any screenshots or GIFs? If this is a new visual feature please add a before/after screenshot or gif -here with e.g. [GifGrabber](http://www.gifgrabber.com/). +here with e.g. 
[Giphy CAPTURE](https://giphy.com/apps/giphycapture) or [Peek](https://github.com/phw/peek) # Notify reviewers Read our [Pull request merging diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index e99c3bd2098..b00c827e6b2 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -4,46 +4,67 @@ categories: - title: '🧬 Features' labels: - 'feature' + - 'cl-feature' + - title: '✨ Enhancements' + labels: - 'enhancement' + - 'cl-enhancement' - title: 'πŸ› Bug Fixes' labels: - 'fix' - 'bugfix' - 'bug' + - 'cl-bug' - title: '🏎 Performance Tweaks' labels: - 'performance' + - 'cl-performance' - title: '🎨 Style Tweaks' labels: - 'style tweak' + - 'cl-style-tweak' - title: 'βš™οΈ REST API Changes' labels: - 'api' - title: 'βš™οΈ Changes to handle external APIs' labels: - 'external api' + - 'cl-external-api' - title: 'πŸ—„ Database Migration' labels: - 'includes db changes' - 'migration' + - 'cl-db-migration' - title: '⚠️ Deprecation' labels: - 'deprecation' + - 'cl-deprecation' - title: 'πŸ“˜ Documentation' labels: - 'documentation' - - title: '🧹 Cleanup' + - 'cl-documentation' + - title: '🧹 Cleanup & Refactoring πŸ”§' labels: - 'cleanup' + - 'refactoring' + - 'cl-refactoring' - title: 'πŸ‘·β€β™€οΈ Testing, Configuration & Deployment' labels: - 'devops' + - 'test' + - 'cl-test' + - 'cl-devops' - title: '🧰 Maintenance' labels: - 'chore' - 'dependencies' + - 'cl-chore' + - title: 'πŸ“ Prototype (Internal Use)' + labels: + - 'cl-prototype' exclude-labels: - 'skip-changelog' + - 'cl-skip-changelog' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' template: | ## Changes @@ -53,3 +74,6 @@ template: | - Backend: https://github.com/cBioPortal/cbioportal/compare/$PREVIOUS_TAG...v$NEXT_PATCH_VERSION - Frontend: https://github.com/cBioPortal/cbioportal-frontend/compare/$PREVIOUS_TAG...v$NEXT_PATCH_VERSION + + ## 🏷Notes on versioning and release procedure + https://docs.cbioportal.org/4.-development/release-procedure#a-note-on-versioning 
diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000000..378ebeb74b5 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,21 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 90 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 14 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security + - critical + - priority +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. Set to `false` to disable +closeComment: false +# Limit the number of actions per hour, from 1-30. Default is 30 +limitPerRun: 1 diff --git a/.github/workflows/dockerimage.yml b/.github/workflows/dockerimage.yml new file mode 100644 index 00000000000..86e62c25b7a --- /dev/null +++ b/.github/workflows/dockerimage.yml @@ -0,0 +1,59 @@ +name: Docker Image CI +on: + push: + branches: + - master + - release-* + - rc + tags: '*' + +jobs: + build_and_publish_web_and_data: + if: github.repository == 'cBioPortal/cbioportal' + runs-on: ubuntu-latest + steps: + - name: 'Checkout git repo' + uses: actions/checkout@v1 + - name: Extract branch or tag name + # The GITHUB_REF variable is like "refs/heads/branch_name" or + # "refs/tags/tag_name". If the tag is prefixed with v, this is a new + # version and we want to push it with the tag "latest" as well because + # that is the version we refer to in the documentation.
One can give + # the same image multiple tags by using "," + run: echo "##[set-output name=image_tag_names;]$(echo ${GITHUB_REF##*/} | sed 's/^v/latest,/g')" + id: extract_tags + - name: 'Docker build with cache' + uses: whoan/docker-build-with-cache-action@v4 + with: + username: "${{ secrets.DOCKER_USERNAME }}" + password: "${{ secrets.DOCKER_PASSWORD }}" + image_name: cbioportal/cbioportal + image_tag: ${{ steps.extract_tags.outputs.image_tag_names }} + context: . + dockerfile: docker/web-and-data/Dockerfile + pull_image_and_stages: false + + build_and_publish_web: + if: github.repository == 'cBioPortal/cbioportal' + runs-on: ubuntu-latest + steps: + - name: 'Checkout git repo' + uses: actions/checkout@v1 + - name: Extract branch or tag name + # For the web docker image we don't publish it as latest + # just extract branch/tag name and strip v prefix + run: echo "##[set-output name=image_tag_names;]$(echo ${GITHUB_REF##*/} | sed 's/^v//g')" + id: extract_tags + - name: 'Docker build with cache' + uses: whoan/docker-build-with-cache-action@v4 + with: + username: "${{ secrets.DOCKER_USERNAME }}" + password: "${{ secrets.DOCKER_PASSWORD }}" + image_name: cbioportal/cbioportal + image_tag: ${{ steps.extract_tags.outputs.image_tag_names }}-web-shenandoah + context: . + dockerfile: docker/web/Dockerfile + pull_image_and_stages: false + + +# Reference: https://github.com/marketplace/actions/build-docker-images-using-cache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 930070a3b00..e5f478b73cc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,41 +1,49 @@ # How to contribute -Thank you for contributing to cBioPortal! This document provides a brief set of guidelines for contributing. +Thank you for your interest in contributing to cBioPortal! This document provides a brief set of guidelines for contributing. -# Background +# Who are you? -The cBioPortal currently uses a "fork and pull" model for collaborative software development. 
+We are curious to learn more about you! We would love to help you get started! The contributors in our community all have different backgrounds. For instance some people have: -From the [GitHub Help Page of Using Pull Requests](https://help.github.com/articles/using-pull-requests/): +1. Engineering experience but no to little knowledge of cancer genomics +2. Knowledge about cancer genomics but no to little engineering experience +3. No engineering nor cancer genomics experience but an eagerness to contribute -"The fork & pull model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination." +if you feel like you don't fall into any of these categories, please reach out so you can help us update the above list πŸ™‚! Note that there are many contributions that can be made to an open source commmunity without coding a single line of code. You can reach us through our [public slack channel](https://slack.cbioportal.org). -## Branches within cBioPortal +# Join the Slack! + +Come and chat with us at https://slack.cbioportal.org πŸ‘‹ -The cBioPortal currently maintains three branches: +# Making a code contribution - * **master**: this reflects what is currently running in production on cbioportal.org. Bug fixes and documentation fixes go here. - * **rc**: release candidate branch, incorporating all the latest features. You could see our **rc** branch as a development branch where we only accept high quality contributions. Once ready for testing on cbioportal.org/beta a new branch is formed with the name **release-x.y.z**. 
- * **release-x.y.z**: before each release a new branch is created from **rc** that has a name like **release-x.y.z** .This one is usually deployed to www.cbioportal.org/beta. +The cBioPortal currently uses a "fork and pull" model for collaborative software development. + +From the [GitHub Help Page of Using Pull Requests](https://help.github.com/articles/using-pull-requests/): + +"The fork & pull model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination." ## Getting Started * Make sure you have a [GitHub account](https://github.com/signup/free). * Create an issue in our issues tracker, assuming one does not already exist. * Fork the cbioportal project on GitHub. For general instructions on forking a GitHub project, see [Forking a Repo](https://help.github.com/articles/fork-a-repo/) and [Syncing a fork](https://help.github.com/articles/syncing-a-fork/). - * Familiarize yourself with the [project documentation](https://docs.cbioportal.org), including [backend code organization](Backend-Code-Organization.md) and [backend development guidelines](Backend-Development-Guidelines.md). + * Familiarize yourself with the [project documentation](https://docs.cbioportal.org), including the [Architecture docs](https://docs.cbioportal.org/2.1-deployment/architecture-overview), the [backend code organization](docs/Backend-Code-Organization.md) and [backend development guidelines](docs/Backend-Development-Guidelines.md). ## Contributing Code Changes via a Pull Request Once you have forked the repo, you need to create your code contributions within a new branch of your forked repo. 
For general background on creating and managing branches within GitHub, see: [Git Branching and Merging](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging). * To begin, create a topic branch from where you want to base your work. - * For a new feature, this is usually the **rc branch**. For documentation and bug fixes, this is usually the **master branch**. + * For any change that requires database migrations, this will be the **rc branch**. For all other changes, this will be the **master branch**. For additional details, see [Branches within cBioPortal](#branches-within-cbioportal) below. You usually create a branch like so: -```git checkout master``` -```git checkout -b [name_of_your_new_branch]``` +``` +git checkout master +git checkout -b [name_of_your_new_branch] +``` You then usually commit code changes, and push your branch back to GitHub like so: @@ -50,9 +58,19 @@ A few tips: When you are ready to submit your pull-request: * Push your branch to your GitHub project. -* Open a Pull Request on GitHub to the **rc (release candidate)** branch for a new feature or the **master** branch for a bug fix or documentation fix. +* Open the pull request to the branch you've based your work on + +For more details on submitting a pull-request, please see: [GitHub Guide to Collaborating with issues and pull requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests). + +## Branches within cBioPortal + +To figure out where your first pull request might go, it helps to have an understanding of cBioPortal's branching model. The cBioPortal currently maintains three branches in both the [frontend](https://github.com/cbioportal/cbioportal-frontend) and [backend repo](https://github.com/cbioportal/cbioportal): -For instructions on submitting a pull-request, please see: [Using Pull Requests ](https://help.github.com/articles/using-pull-requests/) and [Sending Pull Requests](http://help.github.com/send-pull-requests/). 
+ * **master**: this reflects what will be released with our next weekly release (https://github.com/cBioPortal/cbioportal/releases). For the [frontend repo](https://github.com/cbioportal/cbioportal-frontend) this branch is automatically deployed to production. On the backend it is deployed at least once a week. New features, bug fixes and documentation updates can go here. Only if the feature requires a database migration it should go to **rc**. + * **rc**: release candidate branch, this branch contains new changes that require a database migration. It is deployed to https://rc.cbioportal.org. Once it's ready for more thorough product review a new branch is created with the name **release-x.y.z**. This way people can still continue to submit new changes to **rc**, while more thorough testing takes place of the **release-x.y.z** branch. + * **release-x.y.z**: this branch contains changes that require a database migration. It will be merged to master after thorough product review on https://beta.cbioportal.org. + +We try to continuously merge new changes from `master` to `release-x.y.z`, and subsequently from the `release-x.y.z` branch to `rc` such that everybody is working on the latest code. Keep in mind though that occasionally there are conflicts that need to be resolved before we can merge. If you're working on e.g. the rc branch, you can check whether all the changes in master are in rc like this: https://github.com/cBioPortal/cbioportal/compare/rc...master. If a particular change you are waiting for is not there, one can help creating a pull request that merges these changes in. Try e.g. (if origin points to the cbioportal repo): `git fetch origin && git checkout origin/rc -b merge-master-to-rc && git merge origin/master`. Then resolve conflicts and push the branch `merge-master-to-rc` to your repo and open the PR. 
### Getting your changes reviewed @@ -65,7 +83,7 @@ If you have an idea who might be able to spot such issues in the parts of the code and functionality affected by your changes, notify them by requesting a review using the **Reviewers** menu to the right of the summary you just wrote -and/or `@`-mentioning them in a comment. +and/or `@`-mentioning them in a comment. Or reaching out them on [slack](https://slack.cbioportal.org). Reviewers may request you to rephrase or adjust things before they allow the changes to be integrated. @@ -77,38 +95,6 @@ until the reviewers approve. Reviewers may request you to squash such amendment commits afterwards, or offer to push rewritten versions of your commits themselves. -## Automated tests on Travis CI -All Pull Requests are automatically tested on [Travis -CI](https://travis-ci.org/cBioPortal/cbioportal/pull_requests). Currently there -is a set of tests for the core module and a visual regression test that makes -some screenshots and compares them to the ones stored in the repository. - -### What to do if the screenshot test fails -When the screenshot test fails, it means that the screenshot taken from your -instance of the portal differs from the screenshot stored in the repo. -Copy+Paste the URL in the Travis CI log to view the image diff online. Further -instructions are outlined on that page. - -If you prefer to compare the images locally, you need to first download the -failing screenshot. The Travis CI log will show you where the image was -uploaded on [clbin.com](https://clbin.com). First, download the image and -replace the screenshot in the repo. For instance run in the root dir of -cBioPortal: - -```bash -curl 'https://clbin.com/[replace-with-clbin-image-from-log].png' > test/end-to-end/screenshots/[replace-with-image-from-repo].png -``` - -Then follow the steps outlined in [this blog post](http://www.akikoskinen.info/image-diffs-with-git/) to compare the -images locally. 
Run `git diff` from your repo to see the ImageMagick diff. - -Once you downloaded the images you do the following for each screenshot: - -- If the change in the screenshot is **undesired**, i.e. there is regression, you - should fix your PR. -- If the change in the screenshot is **desired**, add the screenshot to the - repo, commit it and push it to your PR's branch. - ## Pull Request Reviewers Guide If someone requests your review on a pull request, read the title and description and assign any other collaborators @@ -158,7 +144,7 @@ New features: (Maven) config goes in the appriopriate `pom.xml` (root, `scripts/`, `portal/`, `core/`). Runtime (Spring) goes in `portal.properties`. Default values should be in `GlobalProperties.java`. - Non-stable configuration should be done through war overlays. -- Is the configuration tested as part of Travis CI? It's not a necessity but be +- Is the configuration tested as part of the CI tests? It's not a necessity but be aware that untested configuration will be tough to maintain. - Is there documentation on the proposed changes? @@ -166,4 +152,4 @@ New features: * [cBioPortal Issue Tracker](https://github.com/cBioPortal/cbioportal/issues) * [General GitHub documentation](http://help.github.com/) -* [GitHub pull request documentation](http://help.github.com/send-pull-requests/) +* [GitHub Pull Request documentation](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) diff --git a/README.md b/README.md index cb5b79add0e..3c2876fb51f 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,76 @@ # cBioPortal -The cBioPortal for Cancer Genomics provides visualization, analysis, and download of large-scale cancer genomics data sets. The cBioPortal is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License, version 3, as published by the Free Software Foundation. 
-This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. +The cBioPortal for Cancer Genomics provides visualization, analysis, and download of large-scale cancer genomics data sets. For a short intro on cBioPortal, see [these introductory slides](https://docs.google.com/presentation/d/1hm0G77UklZnpQfFvywBfW2ZIsy8deKi5r1RfJarOPLg/edit?usp=sharing). -A public instance of cBioPortal (https://www.cbioportal.org) is hosted and maintained by Memorial Sloan Kettering Cancer Center. It provides access to data by The Cancer Genome Atlas as well as many carefully curated published data sets. The cBioPortal software can be used to for local instances that provide access to private data. +If you would like to know how to setup a private instance of the portal and/or get set up for developing, see the [documentation](https://docs.cbioportal.org). For details on contributing code changes via pull requests, see our [Contributing document](CONTRIBUTING.md). -If you would like to know how to setup a private instance of the portal and/or get set up for developing, see the [documentation](https://docs.cbioportal.org). For details on contributing code changes via pull requests, see our [Contributing document](CONTRIBUTING.md). Also see this overview of the [backend code organization](docs/Backend-Code-Organization.md). +If you are interested in coordinating the development of new features, please contact cbioportal@cbio.mskcc.org or reach out on https://slack.cbioportal.org. -If you are interested in coordinating the development of new features, please contact cbioportal@cbio.mskcc.org. 
+## πŸ“˜ Documentation +See [https://docs.cbioportal.org](https://docs.cbioportal.org) -## Test status -| | bugfix integration branch | feature integration branch | -| --- | --- | --- | -| Branch name | [`master`](https://github.com/cBioPortal/cbioportal/tree/master) | [`rc`](https://github.com/cBioPortal/cbioportal/tree/rc) | -| Status | [![master build status](https://travis-ci.org/cBioPortal/cbioportal.svg?branch=master)](https://travis-ci.org/cBioPortal/cbioportal) | [![Build Status](https://travis-ci.org/cBioPortal/cbioportal.svg?branch=rc)](https://travis-ci.org/cBioPortal/cbioportal) | +## 🀝 License +See [LICENSE](./LICENSE) -## Code Quality -[![codebeat badge](https://codebeat.co/badges/0738d3c9-5ffe-4b61-80c4-abb5698d8d44)](https://codebeat.co/projects/github-com-cbioportal-cbioportal) +## πŸ’» Run Backend +cBioPortal consists of several components, please read the [Architecture docs](https://docs.cbioportal.org/2.1-deployment/architecture-overview) to figure out what repo would be relevant to edit. If you e.g. only want to make frontend changes, one can directly edit [the frontend repo](https://github.com/cbioportal/cbioportal-frontend) instead. Read the instructions in that repo for more info on how to do frontend development. This repo only contains the backend part. Before editing the backend, it's good to read the [backend code organization](docs/Backend-Code-Organization.md). For development of the backend repo one should first set up a database. Please follow the [Docker deployment documentation](https://docs.cbioportal.org/2.1.1-deploy-with-docker-recommended/docker) to do so. In step 6 instead of using the `latest` image, you can create your own image and use that: -## Documentation -See: [https://docs.cbioportal.org](https://docs.cbioportal.org) +``` +docker build -t cbioportal/cbioportal:my-dev-cbioportal-image -f docker/web-and-data/Dockerfile . 
+``` -## Deployment -[![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy) +Note: internally we have a dev database available with the public data set that one can connect to directly. Please reach out on slack to get the credentials. It is usually best to use a small test dataset, but if a copy of the production database is necessary for e.g. fixing a bug specific to production data that can be useful. -[![Docker Automated build](https://img.shields.io/docker/automated/jrottenberg/ffmpeg.svg)](https://hub.docker.com/r/cbioportal/cbioportal/) [![Docker Pulls](https://img.shields.io/docker/pulls/cbioportal/cbioportal.svg)](https://hub.docker.com/r/cbioportal/cbioportal/) [![Docker Stars](https://img.shields.io/docker/stars/cbioportal/cbioportal.svg)](https://hub.docker.com/r/cbioportal/cbioportal/) +### πŸ•΅οΈβ€β™€οΈ Debugging + +If you want to attach a debugger you can change the command in step 6, by adding to the `JAVA_OPTS` parameter: `-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005` and make sure to expose that port with `-p 5005:5005`. + +You can then use a JAVA IDE to connect to that port. E.g. in [VSCode](https://code.visualstudio.com/), one would add the following configuration to `launch.json` to connect: + +``` +{ + "version": "0.2.0", + "configurations": [ + { + "type": "java", + "name": "Debug (Attach)", + "request": "attach", + "hostName": "localhost", + "port": 5005, + "projectName": "cbioportal" + } + ] +} +``` + +## 🌳 Branch Information +| | main branch | upcoming release branch | later release candidate branch | +| --- | --- | --- | --- | +| Branch name | [`master`](https://github.com/cBioPortal/cbioportal/tree/master) | release-3.3.0 | [`rc`](https://github.com/cBioPortal/cbioportal/tree/rc) | +| Description | All bug fixes and features not requiring database migrations go here. 
This code is either already in production or will be released this week | Next release that requires database migrations. Thorough manual product review often takes place for this branch before release | Later releases with features that require database migrations. This is useful to allow merging in new features without affecting the upcoming release. Could be seen as a development branch, but note that only high quality pull requests are merged. That is the feature should be pretty much ready for release after merge. | +| Live instance | https://www.cbioportal.org / https://master.cbioportal.org | -- | https://rc.cbioportal.org | +| Live instance version | https://www.cbioportal.org/api/info / https://master.cbioportal.org/api/info | -- | https://rc.cbioportal.org/api/info | +| Docker Image | cbioportal/cbioportal:master | -- | cbioportal/cbioportal:rc | +| Kubernetes Config | [production](https://github.com/knowledgesystems/knowledgesystems-k8s-deployment/blob/master/cbioportal/cbioportal_spring_boot.yaml) / [master](https://github.com/knowledgesystems/knowledgesystems-k8s-deployment/blob/master/cbioportal/cbioportal_backend_master.yaml) | -- | [rc](https://github.com/knowledgesystems/knowledgesystems-k8s-deployment/blob/master/cbioportal/cbioportal_backend_rc.yaml) | +| Status | [![master build status](https://travis-ci.org/cBioPortal/cbioportal.svg?branch=master)](https://travis-ci.org/cBioPortal/cbioportal/branches) | -- | [![Build Status](https://travis-ci.org/cBioPortal/cbioportal.svg?branch=rc)](https://travis-ci.org/cBioPortal/cbioportal/branches) | + + +## πŸš€ Releases +Release Notes on GitHub: + +https://github.com/cBioPortal/cbioportal/releases + +See also the cBioPortal News section for user focused release information: + +https://www.cbioportal.org/news + +Docker Images are available for each tag and branch: + +https://hub.docker.com/repository/docker/cbioportal/cbioportal/tags + +## πŸ‘‰ Other Repos +Read the [Architecture 
docs](https://docs.cbioportal.org/2.1-deployment/architecture-overview) to see how these relate: + +- https://github.com/cBioPortal/cbioportal-frontend +- https://github.com/cbioportal/session-service +- https://github.com/cBioPortal/datahub/ diff --git a/app.json b/app.json index fcc72b89438..140e7eabea4 100644 --- a/app.json +++ b/app.json @@ -21,7 +21,7 @@ }, "SPRING_OPTS": { "description":"set spring properties with e.g. -Dshow.civic=true (TODO: not all props work atm)", - "value":"-Dauthenticate=false -Dtomcat.catalina.scope=runtime -Ddb.user=cbio_user -Ddb.password=cbio_pass -Ddb.portal_db_name=cgds_public -Ddb.connection_string=jdbc:mysql://devdb.cbioportal.org:3306/ -Ddb.host=devdb.cbioportal.org -Dshow.civic=true -Dsuppress_schema_version_mismatch_errors=true" + "value":"-Dauthenticate=false -Dtomcat.catalina.scope=runtime -Ddb.user=cbio_user -Ddb.password=cbio_pass -Ddb.portal_db_name=cgds_public -Ddb.connection_string=jdbc:mysql://devdb.cbioportal.org:3306/ -Ddb.host=devdb.cbioportal.org -Dshow.civic=true -Dsuppress_schema_version_mismatch_errors=true -Dsession.service.url=https://cbioportal-session-service.herokuapp.com/session_service/api/sessions/heroku_portal/" }, "WEBAPP_RUNNER_OPTIONS": { "description":"set webapp runner options", diff --git a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java index 003e1793b15..fbf24987e4b 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java +++ b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java @@ -540,24 +540,26 @@ public static void deleteCancerStudyByCascade(int internalCancerStudyId) throws public static void deleteCancerStudy(int internalCancerStudyId) throws DaoException { String[] deleteStudyStatements = { "DELETE FROM sample_cna_event WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", - "DELETE FROM generic_entity_properties WHERE 
GENETIC_ENTITY_ID IN (SELECT GENETIC_ENTITY_ID FROM genetic_alteration WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?))", - "DELETE FROM genetic_entity WHERE ID IN (SELECT GENETIC_ENTITY_ID FROM genetic_alteration WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=? AND GENETIC_ALTERATION_TYPE='GENERIC_ASSAY'))", "DELETE FROM genetic_alteration WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM genetic_profile_samples WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM sample_profile WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM mutation WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM mutation_count_by_keyword WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM clinical_attribute_meta WHERE CANCER_STUDY_ID=?", + "DELETE FROM resource_definition WHERE CANCER_STUDY_ID=?", + "DELETE FROM resource_study WHERE INTERNAL_ID=?", "DELETE FROM clinical_event_data WHERE CLINICAL_EVENT_ID IN (SELECT CLINICAL_EVENT_ID FROM clinical_event WHERE PATIENT_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?))", "DELETE FROM clinical_event WHERE PATIENT_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?)", "DELETE FROM sample_list_list WHERE LIST_ID IN (SELECT LIST_ID FROM sample_list WHERE CANCER_STUDY_ID=?)", "DELETE FROM clinical_sample WHERE INTERNAL_ID IN (SELECT INTERNAL_ID FROM sample WHERE PATIENT_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?))", + "DELETE FROM resource_sample WHERE INTERNAL_ID IN (SELECT INTERNAL_ID FROM sample WHERE PATIENT_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?))", 
"DELETE FROM copy_number_seg WHERE CANCER_STUDY_ID=?", "DELETE FROM copy_number_seg_file WHERE CANCER_STUDY_ID=?", "DELETE FROM protein_array_data WHERE CANCER_STUDY_ID=?", "DELETE FROM protein_array_cancer_study WHERE CANCER_STUDY_ID=?", "DELETE FROM sample WHERE PATIENT_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?)", "DELETE FROM clinical_patient WHERE INTERNAL_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?)", + "DELETE FROM resource_patient WHERE INTERNAL_ID IN (SELECT INTERNAL_ID FROM patient WHERE CANCER_STUDY_ID=?)", "DELETE FROM patient WHERE CANCER_STUDY_ID=?", "DELETE FROM sample_list WHERE CANCER_STUDY_ID=?", "DELETE FROM structural_variant WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", @@ -572,6 +574,10 @@ public static void deleteCancerStudy(int internalCancerStudyId) throws DaoExcept PreparedStatement pstmt = null; ResultSet rs = null; try { + // check whether should delete generic assay meta + if (DaoGenericAssay.geneticEntitiesOnlyExistInSingleStudy(internalCancerStudyId)) { + deleteGenericAssayMeta(internalCancerStudyId); + } con = JdbcUtil.getDbConnection(DaoCancerStudy.class); for (String statementString : deleteStudyStatements) { pstmt = con.prepareStatement(statementString); @@ -618,6 +624,35 @@ public static void purgeUnreferencedRecordsAfterDeletionOfStudy() throws DaoExce } } + /** + * delete generic assay meta records if meta is not shared with other studies + * @throws DaoException + */ + public static void deleteGenericAssayMeta(int internalCancerStudyId) throws DaoException { + String[] deleteGenericAssayStatements = { + "DELETE FROM generic_entity_properties WHERE GENETIC_ENTITY_ID IN (SELECT GENETIC_ENTITY_ID FROM genetic_alteration WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?))", + "DELETE FROM genetic_entity WHERE ID IN (SELECT GENETIC_ENTITY_ID FROM genetic_alteration WHERE 
GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=? AND GENETIC_ALTERATION_TYPE='GENERIC_ASSAY'))" + }; + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoCancerStudy.class); + for (String statementString : deleteGenericAssayStatements) { + pstmt = con.prepareStatement(statementString); + if (statementString.contains("?")) { + pstmt.setInt(1, internalCancerStudyId); + } + pstmt.executeUpdate(); + pstmt.close(); + } + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCancerStudy.class, con, pstmt, rs); + } + } + /** * Extracts Cancer Study JDBC Results. */ diff --git a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 74ee0c3c756..1f58acb7bd9 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -95,8 +95,9 @@ private synchronized void fillCache() { if (gene==null) { ProgressMonitor.logWarning(line+" in config file [resources" + GENE_SYMBOL_DISAMBIGUATION_FILE + "]is not valid. 
You should either update this file or update the `gene` and `gene_alias` tables to fix this."); + } else { + disambiguousGenes.put(parts[0], gene); } - disambiguousGenes.put(parts[0], gene); } in.close(); } catch(IOException e) { diff --git a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGenericAssay.java b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGenericAssay.java index 61355b55986..0a319d9d8cd 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGenericAssay.java +++ b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoGenericAssay.java @@ -64,4 +64,51 @@ public static GenericAssayMeta getGenericAssayMetaByStableId(String stableId) th } return null; } -} \ No newline at end of file + + public static void deleteGenericEntityPropertiesByStableId(String stableId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + + try { + con = JdbcUtil.getDbConnection(DaoGeneticEntity.class); + pstmt = con.prepareStatement("DELETE FROM generic_entity_properties WHERE GENETIC_ENTITY_ID=?"); + GeneticEntity entity = DaoGeneticEntity.getGeneticEntityByStableId(stableId); + if (entity == null) { + return; + } + pstmt.setInt(1, entity.getId()); + pstmt.executeUpdate(); + } catch (SQLException e) { + e.printStackTrace(); + } finally { + JdbcUtil.closeAll(DaoGeneticEntity.class, con, pstmt, rs); + } + } + + public static boolean geneticEntitiesOnlyExistInSingleStudy(int cancerStudyId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + + try { + con = JdbcUtil.getDbConnection(DaoGeneticEntity.class); + pstmt = con.prepareStatement("SELECT DISTINCT CANCER_STUDY_ID FROM genetic_profile WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_alteration WHERE GENETIC_ENTITY_ID IN (SELECT GENETIC_ENTITY_ID FROM genetic_alteration WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)))"); + pstmt.setInt(1, 
cancerStudyId); + rs = pstmt.executeQuery(); + + List studies = new ArrayList(); + while(rs.next()) { + studies.add(rs.getInt("CANCER_STUDY_ID")); + } + // check if entities only exist in single study + return studies.size() == 1; + } catch (SQLException e) { + e.printStackTrace(); + } finally { + JdbcUtil.closeAll(DaoGeneticEntity.class, con, pstmt, rs); + } + // do not update if there is an error + return false; + } +} diff --git a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceData.java b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceData.java new file mode 100644 index 00000000000..1c48220f469 --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceData.java @@ -0,0 +1,116 @@ +package org.mskcc.cbio.portal.dao; + +import org.mskcc.cbio.portal.model.*; +import org.apache.commons.lang.StringUtils; +import java.sql.*; +import java.util.*; + +/** + * Data Access Object for `resource` tables + */ +public final class DaoResourceData { + + public static final String RESOURCE_SAMPLE_TABLE = "resource_sample"; + public static final String RESOURCE_PATIENT_TABLE = "resource_patient"; + public static final String RESOURCE_STUDY_TABLE = "resource_study"; + + private static final String SAMPLE_INSERT = "INSERT INTO " + RESOURCE_SAMPLE_TABLE + + "(`INTERNAL_ID`,`RESOURCE_ID`,`URL` VALUES(?,?,?)"; + private static final String PATIENT_INSERT = "INSERT INTO " + RESOURCE_PATIENT_TABLE + + "(`INTERNAL_ID`,`RESOURCE_ID`,`URL` VALUES(?,?,?)"; + private static final String STUDY_INSERT = "INSERT INTO " + RESOURCE_STUDY_TABLE + + "(`INTERNAL_ID`,`RESOURCE_ID`,`URL` VALUES(?,?,?)"; + + private DaoResourceData() { + } + + public static int addSampleDatum(int internalSampleId, String resourceId, String url) throws DaoException { + return addDatum(SAMPLE_INSERT, RESOURCE_SAMPLE_TABLE, internalSampleId, resourceId, url); + } + + public static int addPatientDatum(int internalPatientId, String resourceId, String url) throws DaoException { + return 
addDatum(PATIENT_INSERT, RESOURCE_PATIENT_TABLE, internalPatientId, resourceId, url); + } + + public static int addStudyDatum(int internalStudyId, String resourceId, String url) throws DaoException { + return addDatum(STUDY_INSERT, RESOURCE_STUDY_TABLE, internalStudyId, resourceId, url); + } + + public static int addDatum(String query, String tableName, int internalId, String resourceId, String url) + throws DaoException { + if (MySQLbulkLoader.isBulkLoad()) { + MySQLbulkLoader.getMySQLbulkLoader(tableName).insertRecord(Integer.toString(internalId), resourceId, url); + return 1; + } + + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoResourceData.class); + + pstmt = con.prepareStatement(query); + pstmt.setInt(1, internalId); + pstmt.setString(2, resourceId); + pstmt.setString(3, url); + + return pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoResourceData.class, con, pstmt, rs); + } + } + + public static List getDataByPatientId(int cancerStudyId, String patientId) throws DaoException + { + List internalIds = new ArrayList(); + internalIds.add(DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudyId, patientId).getInternalId()); + return getDataByInternalIds(cancerStudyId, RESOURCE_PATIENT_TABLE, internalIds); + } + + private static List getDataByInternalIds(int internalCancerStudyId, String table, List internalIds) throws DaoException + { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + + List resources = new ArrayList(); + String sql = ("SELECT * FROM " + table + " WHERE `INTERNAL_ID` IN " + + "(" + generateIdsSql(internalIds) + ")"); + + try { + con = JdbcUtil.getDbConnection(DaoResourceData.class); + pstmt = con.prepareStatement(sql); + rs = pstmt.executeQuery(); + while (rs.next()) { + resources.add(extract(table, internalCancerStudyId, rs)); + } + } + catch (SQLException e) { + 
throw new DaoException(e); + } + finally { + JdbcUtil.closeAll(DaoResourceData.class, con, pstmt, rs); + } + + return resources; + } + + private static String generateIdsSql(Collection ids) { + return "'" + StringUtils.join(ids, "','") + "'"; + } + + private static ResourceBaseData extract(String table, int internalCancerStudyId, ResultSet rs) throws SQLException { + String stableId = getStableIdFromInternalId(table, rs.getInt("INTERNAL_ID")); + return new ResourceBaseData(internalCancerStudyId, stableId, rs.getString("RESOURCE_ID"), rs.getString("URL")); + } + + private static String getStableIdFromInternalId(String table, int internalId) { + if (table.equals(RESOURCE_SAMPLE_TABLE)) { + return DaoSample.getSampleById(internalId).getStableId(); + } else { + return DaoPatient.getPatientById(internalId).getStableId(); + } + } +} \ No newline at end of file diff --git a/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceDefinition.java b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceDefinition.java new file mode 100644 index 00000000000..218839e0a0c --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/dao/DaoResourceDefinition.java @@ -0,0 +1,135 @@ +package org.mskcc.cbio.portal.dao; + +import org.mskcc.cbio.portal.model.*; + +import org.apache.commons.lang.StringUtils; +import org.cbioportal.model.ResourceType; + +import java.sql.*; +import java.util.*; + +/** + * Data Access Object for `resource_definition` table + */ +public class DaoResourceDefinition { + + public static int addDatum(ResourceDefinition resource) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoResourceDefinition.class); + pstmt = con.prepareStatement("INSERT INTO resource_definition(" + "`RESOURCE_ID`," + "`DISPLAY_NAME`," + + "`DESCRIPTION`," + "`RESOURCE_TYPE`," + "`OPEN_BY_DEFAULT`," + "`PRIORITY`," + "`CANCER_STUDY_ID`)" + + " VALUES(?,?,?,?,?,?,?)"); + pstmt.setString(1, 
resource.getResourceId()); + pstmt.setString(2, resource.getDisplayName()); + pstmt.setString(3, resource.getDescription()); + pstmt.setString(4, resource.getResourceType().name()); + pstmt.setBoolean(5, resource.isOpenByDefault()); + pstmt.setInt(6, resource.getPriority()); + pstmt.setInt(7, resource.getCancerStudyId()); + return pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoResourceDefinition.class, con, pstmt, rs); + } + } + + private static ResourceDefinition unpack(ResultSet rs) throws SQLException { + return new ResourceDefinition(rs.getString("RESOURCE_ID"), rs.getString("DISPLAY_NAME"), rs.getString("DESCRIPTION"), + ResourceType.valueOf(rs.getString("RESOURCE_TYPE")), rs.getBoolean("OPEN_BY_DEFAULT"), rs.getInt("PRIORITY"), + rs.getInt("CANCER_STUDY_ID")); + } + + public static ResourceDefinition getDatum(String resourceId, Integer cancerStudyId) throws DaoException { + List resources = getDatum(Arrays.asList(resourceId), cancerStudyId); + if (resources.isEmpty()) { + return null; + } + + return resources.get(0); + } + + public static List getDatum(Collection resourceIds, Integer cancerStudyId) + throws DaoException { + if (resourceIds == null || resourceIds.isEmpty()) { + return Collections.emptyList(); + } + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoResourceDefinition.class); + + pstmt = con.prepareStatement("SELECT * FROM resource_definition WHERE RESOURCE_ID IN ('" + + StringUtils.join(resourceIds, "','") + "') AND CANCER_STUDY_ID=" + String.valueOf(cancerStudyId)); + + rs = pstmt.executeQuery(); + + List list = new ArrayList(); + while (rs.next()) { + list.add(unpack(rs)); + } + + return list; + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoResourceDefinition.class, con, pstmt, rs); + } + } + + public static List getDatum(Collection resourceIds) throws 
DaoException { + if (resourceIds == null || resourceIds.isEmpty()) { + return Collections.emptyList(); + } + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoResourceDefinition.class); + + pstmt = con.prepareStatement("SELECT * FROM resource_definition WHERE RESOURCE_ID IN ('" + + StringUtils.join(resourceIds, "','") + "')"); + + rs = pstmt.executeQuery(); + + List list = new ArrayList(); + while (rs.next()) { + list.add(unpack(rs)); + } + + return list; + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoResourceDefinition.class, con, pstmt, rs); + } + } + + public static List getDatumByStudy(int cancerStudyId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoResourceDefinition.class); + + pstmt = con.prepareStatement("SELECT * FROM resource_definition WHERE CANCER_STUDY_ID=" + String.valueOf(cancerStudyId)); + + rs = pstmt.executeQuery(); + + List list = new ArrayList(); + while (rs.next()) { + list.add(unpack(rs)); + } + + return list; + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoResourceDefinition.class, con, pstmt, rs); + } + } +} \ No newline at end of file diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java b/core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java index 429c107b5e8..0f7cf731c21 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java +++ b/core/src/main/java/org/mskcc/cbio/portal/model/CnaEvent.java @@ -32,9 +32,7 @@ package org.mskcc.cbio.portal.model; -import java.util.HashMap; -import java.util.Map; - +import org.cbioportal.model.CNA; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; /** @@ -46,41 +44,6 @@ public class CnaEvent { private int cnaProfileId; private Event event; - public static enum CNA { - AMP ((short)2, "Amplified"), 
- GAIN ((short)1, "Gained"), - DIPLOID ((short)0, "Diploid"), - HETLOSS ((short)-1, "Heterozygously deleted"), - HOMDEL ((short)-2, "Homozygously deleted"); - - private short code; - private String desc; - - private CNA(short code, String desc) { - this.code = code; - this.desc = desc; - } - - private final static Map cache = new HashMap(); - static { - for (CNA cna : CNA.values()) { - cache.put(cna.code, cna); - } - } - - public static CNA getByCode(short code) { - return cache.get(code); - } - - public short getCode() { - return code; - } - - public String getDescription() { - return desc; - } - } - public static class Event { private long eventId; private CanonicalGene gene; diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/ResourceBaseData.java b/core/src/main/java/org/mskcc/cbio/portal/model/ResourceBaseData.java new file mode 100644 index 00000000000..27f4ebc460d --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/model/ResourceBaseData.java @@ -0,0 +1,70 @@ +package org.mskcc.cbio.portal.model; + +/** + * Encapsulates Resource Base Data. 
+ */ +public class ResourceBaseData { + private int cancerStudyId; + private String stableId; + private String resourceId; + private String url; + + /** + * Constructor + */ + public ResourceBaseData() { + this(-1, "", "", ""); + } + + /** + * Constructor + * + * @param cancerStudyId database id of cancer study + * @param stableId stable id of the patient or sample or study + * @param resourceId resource id + * @param url url of the resource + */ + + public ResourceBaseData(int cancerStudyId, String stableId, String resourceId, String url) { + this.setCancerStudyId(cancerStudyId); + this.setStableId(stableId); + this.setResourceId(resourceId); + this.setUrl(url); + } + + public ResourceBaseData(ResourceBaseData other) { + this(other.getCancerStudyId(), other.getStableId(), other.getResourceId(), other.getUrl()); + } + + public int getCancerStudyId() { + return cancerStudyId; + } + + public void setCancerStudyId(int cancerStudyId) { + this.cancerStudyId = cancerStudyId; + } + + public String getStableId() { + return stableId; + } + + public void setStableId(String stableId) { + this.stableId = stableId; + } + + public String getResourceId() { + return resourceId; + } + + public void setResourceId(String resourceId) { + this.resourceId = resourceId; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } +} diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/ResourceDefinition.java b/core/src/main/java/org/mskcc/cbio/portal/model/ResourceDefinition.java new file mode 100644 index 00000000000..e51ff9dd934 --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/model/ResourceDefinition.java @@ -0,0 +1,79 @@ +package org.mskcc.cbio.portal.model; + +import org.cbioportal.model.ResourceType; + +public class ResourceDefinition { + + private String resourceId; + private String displayName; + private String description; + private ResourceType resourceType; + private boolean openByDefault; + private Integer 
priority; + private Integer cancerStudyId; + + public ResourceDefinition(String resourceId, String displayName, + String description, ResourceType resourceType, boolean openByDefault, Integer priority, Integer cancerStudyId) { + this.resourceId = resourceId; + this.displayName = displayName; + this.description = description; + this.resourceType = resourceType; + this.openByDefault = openByDefault; + this.priority = priority; + this.cancerStudyId = cancerStudyId; + } + + public String getResourceId() { + return resourceId; + } + + public void setResourceId(String resourceId) { + this.resourceId = resourceId; + } + + public String getDisplayName() { + return displayName; + } + + public void setDisplayName(String displayName) { + this.displayName = displayName; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public ResourceType getResourceType() { + return resourceType; + } + + public void setResourceType(ResourceType resourceType) { + this.resourceType = resourceType; + } + + public boolean isOpenByDefault() { + return openByDefault; + } + + public void setOpenByDefault(boolean openByDefault) { + this.openByDefault = openByDefault; + } + + public Integer getPriority() { + return priority; + } + + public void setPriority(Integer priority) { + this.priority = priority; + } + public Integer getCancerStudyId() { + return cancerStudyId; + } + public void setCancerStudyId(Integer cancerStudyId) { + this.cancerStudyId = cancerStudyId; + } +} diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/SampleListCategory.java b/core/src/main/java/org/mskcc/cbio/portal/model/SampleListCategory.java index 38a12904e65..1a09888b87f 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/model/SampleListCategory.java +++ b/core/src/main/java/org/mskcc/cbio/portal/model/SampleListCategory.java @@ -42,6 +42,7 @@ public enum SampleListCategory { 
ALL_CASES_IN_STUDY("all_cases_in_study"), ALL_CASES_WITH_MUTATION_DATA("all_cases_with_mutation_data"), + ALL_CASES_WITH_FUSION_DATA("all_cases_with_fusion_data"), ALL_CASES_WITH_CNA_DATA("all_cases_with_cna_data"), ALL_CASES_WITH_LOG2_CNA_DNA("all_cases_with_log2_cna_data"), ALL_CASES_WITH_METHYLATION_DATA("all_cases_with_methylation_data"), diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 1d17632d701..39ad8b6769e 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -96,7 +96,7 @@ public static void importData(File geneFile, String genomeBuild) throws IOExcept String strAliases = parts[4]; String strXrefs = parts[5]; String cytoband = parts[7]; - String chr = cytoband.split("p|q|;|c")[0]; // various strange characters were found in this column + String chr = cytoband.split("[pq;c \\|]")[0]; // various strange characters were found in this column int referenceGenomeId = DaoReferenceGenome.getReferenceGenomeByBuildName(genomeBuild).getReferenceGenomeId(); String desc = parts[8]; String type = parts[9]; diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 695be092cde..14b12551da9 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -39,7 +39,9 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import org.cbioportal.model.EntityType; import org.cbioportal.model.GeneticEntity; @@ -90,6 +92,10 @@ public void run() { // don't require additional properties OptionSpec additionalProperties = 
parser.accepts("additional-properties", "Additional properties need to be imported") .withOptionalArg().ofType(String.class); + + // don't require updateInfo, default as true + OptionSpec updateInfoArg = parser.accepts("update-info", "Update information for existing entities in the database") + .withOptionalArg().ofType(String.class); OptionSet options = null; try { @@ -115,8 +121,11 @@ public void run() { "'entityType' argument required"); } - // Check options - boolean updateInfo = options.has("update-info"); + // Check options, set default as false + boolean updateInfo = false; + if (options.has("update-info") && (options.valueOf(updateInfoArg).equalsIgnoreCase("true") || options.valueOf(updateInfoArg).equals("1"))) { + updateInfo = true; + } ProgressMonitor.setCurrentMessage("Adding new generic assay to the database\n"); startImport(options, data, entityType, additionalProperties, updateInfo); @@ -138,7 +147,7 @@ public static void startImport(OptionSet options, OptionSpec data, Optio File genericAssayFile = new File(options.valueOf(data)); GeneticAlterationType geneticAlterationTypeArg = GeneticAlterationType.valueOf(options.valueOf(geneticAlterationType)); String additionalPropertiesArg = options.valueOf(additionalProperties); - importData(genericAssayFile, geneticAlterationTypeArg, additionalPropertiesArg); + importData(genericAssayFile, geneticAlterationTypeArg, additionalPropertiesArg, updateInfo); } } @@ -150,7 +159,7 @@ public static void startImport(OptionSet options, OptionSpec data, Optio * @param additionalProperties * @throws Exception */ - public static void importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties) throws Exception { + public static void importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties, boolean updateInfo) throws Exception { ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getCanonicalPath()); @@ -163,6 +172,11 @@ public 
static void importData(File dataFile, GeneticAlterationType geneticAltera // read generic assay data int indexStableIdField = getStableIdIndex(headerNames); + // entities have been overriden + List updatedEntities = new ArrayList<>(); + List notUpdatedEntities = new ArrayList<>(); + List newEntities = new ArrayList<>(); + currentLine = buf.readLine(); while (currentLine != null) { @@ -172,6 +186,7 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera // get stableId and get the meta by the stableId String genericAssayMetaStableId = parts[indexStableIdField]; GenericAssayMeta genericAssayMeta = DaoGenericAssay.getGenericAssayMetaByStableId(genericAssayMetaStableId); + GeneticEntity genericAssayEntity = DaoGeneticEntity.getGeneticEntityByStableId(genericAssayMetaStableId); // generic assay meta are always updated to based on the current import; // also when present in db a new record is created. @@ -191,10 +206,23 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera // log for the existing entities if (genericAssayMeta != null) { - ProgressMonitor.setCurrentMessage("Cannot add new entity, entity exists: " + genericAssayMetaStableId); + if (updateInfo) { + updatedEntities.add(stableId); + DaoGenericAssay.deleteGenericEntityPropertiesByStableId(stableId); + propertiesMap.forEach((k, v) -> { + try { + DaoGenericAssay.setGenericEntityProperty(genericAssayEntity.getId(), k, v); + } catch (DaoException e) { + e.printStackTrace(); + } + }); + } else { + notUpdatedEntities.add(stableId); + } } // create a new generic assay meta and add to the database else { + newEntities.add(stableId); GeneticEntity newGeneticEntity = new GeneticEntity(geneticAlterationType.name(), stableId); GeneticEntity createdGeneticEntity = DaoGeneticEntity.addNewGeneticEntity(newGeneticEntity); propertiesMap.forEach((k, v) -> { @@ -209,6 +237,17 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = 
buf.readLine(); } + // show import result message + if (updatedEntities.size() > 0) { + ProgressMonitor.setCurrentMessage("--> Entities updated: " + updatedEntities.size() + " generic entities existing in the database that were overridden during import."); + } + if (notUpdatedEntities.size() > 0) { + ProgressMonitor.setCurrentMessage("--> Entities not updated: " + notUpdatedEntities.size() + " generic entities existing in the database that were not overridden during import."); + } + if (newEntities.size() > 0) { + ProgressMonitor.setCurrentMessage("--> New Entities: " + newEntities.size() + " generic entities have been imported into database during import."); + } + reader.close(); ProgressMonitor.setCurrentMessage("Finished loading generic assay.\n"); diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index bad39a63b62..6e7ca0820d9 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -56,9 +56,14 @@ public void run() { // Parse arguments // using a real options parser, helps avoid bugs String description = "Import 'profile' files that contain data matrices indexed by gene, case"; - OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + OptionSet options = ConsoleUtil.parseStandardDataAndMetaUpdateOptions(args, description, true); File dataFile = new File((String) options.valueOf("data")); File descriptorFile = new File((String) options.valueOf( "meta" ) ); + // Check options, set default as false + boolean updateInfo = false; + if (options.has("update-info") && (((String) options.valueOf("update-info")).equalsIgnoreCase("true") || options.valueOf("update-info").equals("1"))) { + updateInfo = true; + } SpringUtil.initDataSource(); ProgressMonitor.setCurrentMessage("Reading data from: " + 
dataFile.getAbsolutePath()); // Load genetic profile and gene panel @@ -100,7 +105,7 @@ public void run() { importer.importData(); } else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY) { // add all missing `genetic_entities` for this assay to the database - ImportGenericAssayEntity.importData(dataFile, geneticProfile.getGeneticAlterationType(), geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); + ImportGenericAssayEntity.importData(dataFile, geneticProfile.getGeneticAlterationType(), geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), updateInfo); ImportTabDelimData genericAssayProfileImporter = new ImportTabDelimData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); genericAssayProfileImporter.importData(numLines); diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java new file mode 100644 index 00000000000..3cc13dbd3fe --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java @@ -0,0 +1,506 @@ +package org.mskcc.cbio.portal.scripts; + +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.util.*; + +import java.io.*; +import joptsimple.*; +import java.util.*; +import java.util.regex.*; +import java.util.stream.Collectors; + +import org.apache.commons.collections.map.MultiKeyMap; +import org.cbioportal.model.ResourceType; + +public class ImportResourceData extends ConsoleRunnable { + + public static final String DELIMITER = "\t"; + public static final String METADATA_PREFIX = "#"; + public static final String SAMPLE_ID_COLUMN_NAME = "SAMPLE_ID"; + public static final String PATIENT_ID_COLUMN_NAME = "PATIENT_ID"; + public static final String RESOURCE_ID_COLUMN_NAME = "RESOURCE_ID"; + 
public static final String URL_COLUMN_NAME = "URL"; + public static final String SAMPLE_TYPE_COLUMN_NAME = "SAMPLE_TYPE"; + private int numSampleSpecificResourcesAdded = 0; + private int numPatientSpecificResourcesAdded = 0; + private int numStudySpecificResourcesAdded = 0; + private int numEmptyResourcesSkipped = 0; + private int numSamplesProcessed = 0; + + private static Properties properties; + + private File resourceDataFile; + private CancerStudy cancerStudy; + private ResourceType resourceType; + private boolean relaxed; + private Set patientIds = new HashSet(); + + public void setFile(CancerStudy cancerStudy, File resourceDataFile, String resourceType, boolean relaxed) { + this.cancerStudy = cancerStudy; + this.resourceDataFile = resourceDataFile; + this.resourceType = ResourceType.valueOf(resourceType); + this.relaxed = relaxed; + } + + public void importData() throws Exception { + // if bulkLoading is ever turned off, + // code has to be added to check whether + // a resource data update should be + // perform instead of an insert + MySQLbulkLoader.bulkLoadOn(); + + if (relaxed) { + MySQLbulkLoader.relaxedModeOn(); + } + + FileReader reader = new FileReader(resourceDataFile); + BufferedReader buff = new BufferedReader(reader); + List resources = DaoResourceDefinition.getDatumByStudy(cancerStudy.getInternalId()); + String currentLine = buff.readLine(); + String[] headerNames = currentLine.split("\t"); + Map headerIndexMap = makeHeaderIndexMap(headerNames); + + int patientIdIndex = findPatientIdColumn(headerIndexMap); + int sampleIdIndex = findSampleIdColumn(headerIndexMap); + int resourceIdIndex = findResourceIdColumn(headerIndexMap); + int urlIndex = findURLColumn(headerIndexMap); + + // validate required columns: + if (resourceIdIndex < 0) { + throw new RuntimeException("Aborting owing to failure to find " + RESOURCE_ID_COLUMN_NAME + + " in file. 
Please check your file format and try again."); + } + if (urlIndex < 0) { + throw new RuntimeException("Aborting owing to failure to find " + URL_COLUMN_NAME + + " in file. Please check your file format and try again."); + } + if ((resourceType.equals(ResourceType.SAMPLE) || resourceType.equals(ResourceType.PATIENT)) && patientIdIndex < 0) { + // PATIENT_ID is required in both patient and sample file types: + throw new RuntimeException("Aborting owing to failure to find " + PATIENT_ID_COLUMN_NAME + + " in file. Please check your file format and try again."); + } + if (resourceType.equals(ResourceType.SAMPLE) && sampleIdIndex < 0) { + // SAMPLE_ID is required in SAMPLE file type: + throw new RuntimeException("Aborting owing to failure to find " + SAMPLE_ID_COLUMN_NAME + + " in file. Please check your file format and try again."); + } + importData(buff, resources, headerIndexMap); + buff.close(); + + if (MySQLbulkLoader.isBulkLoad()) { + MySQLbulkLoader.flushAll(); + MySQLbulkLoader.relaxedModeOff(); + } + } + + private void importData(BufferedReader buff, List resources, Map headerIndexMap) throws Exception { + String line; + MultiKeyMap resourceMap = new MultiKeyMap(); + // create resource_id set + Set patientResourceIdSet = resources + .stream() + .filter(resource -> resource.getResourceType().equals(ResourceType.PATIENT)) + .map(resource -> resource.getResourceId()) + .collect(Collectors.toSet()); + Set sampleResourceIdSet = resources + .stream() + .filter(resource -> resource.getResourceType().equals(ResourceType.SAMPLE)) + .map(resource -> resource.getResourceId()) + .collect(Collectors.toSet()); + Set studyResourceIdSet = resources + .stream() + .filter(resource -> resource.getResourceType().equals(ResourceType.STUDY)) + .map(resource -> resource.getResourceId()) + .collect(Collectors.toSet()); + + while ((line = buff.readLine()) != null) { + if (skipLine(line.trim())) { + continue; + } + + String[] fieldValues = getFieldValues(line, headerIndexMap); + 
addDatum(fieldValues, resources, resourceMap, headerIndexMap, patientResourceIdSet, sampleResourceIdSet, studyResourceIdSet); + } + } + + private boolean skipLine(String line) { + return (line.isEmpty() || line.substring(0, 1).equals(METADATA_PREFIX)); + } + + /** + * Takes in the given line and returns the list of field values by splitting the + * line on DELIMITER. + * + * @param line + * @param headerIndexMap + * @return the list of values, one for each column. Value will be "" for empty + * columns. + */ + private String[] getFieldValues(String line, Map headerIndexMap) { + // split on delimiter: + String[] fieldValues = line.split(DELIMITER, -1); + + // validate: if number of fields is incorrect, give exception + if (fieldValues.length != headerIndexMap.size()) { + throw new IllegalArgumentException("Number of columns in line is not as expected. Expected: " + + headerIndexMap.size() + " columns, found: " + fieldValues.length + ", for line: " + line); + } + + // now iterate over lines and trim each value: + for (int i = 0; i < fieldValues.length; i++) { + fieldValues[i] = fieldValues[i].trim(); + } + return fieldValues; + } + + private boolean addDatum(String[] fields, List resources, MultiKeyMap resourceMap, Map headerIndexMap, Set patientResourceIdSet, Set sampleResourceIdSet, Set studyResourceIdSet) + throws Exception { + int sampleIdIndex = findSampleIdColumn(headerIndexMap); + String stableSampleId = (sampleIdIndex >= 0) ? fields[sampleIdIndex] : ""; + stableSampleId = StableIdUtil.getSampleId(stableSampleId); + int patientIdIndex = findPatientIdColumn(headerIndexMap); + String stablePatientId = (patientIdIndex >= 0) ? 
fields[patientIdIndex] : ""; + stablePatientId = StableIdUtil.getPatientId(stablePatientId); + int resourceIdIndex = findResourceIdColumn(headerIndexMap); + int urlIndex = findURLColumn(headerIndexMap); + int internalSampleId = -1; + int internalPatientId = -1; + + // validate patient and sample for patient and sample attibutes + if (resourceType.equals(ResourceType.PATIENT) || resourceType.equals(ResourceType.SAMPLE)) { + Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), stablePatientId); + if (patient != null) { + // patient exists, get internal id: + internalPatientId = patient.getInternalId(); + } else { + // add patient: if patient do not exist and resource type is sample + internalPatientId = (stablePatientId.length() > 0) ? addPatientToDatabase(stablePatientId) : -1; + } + + if (resourceType.equals(ResourceType.SAMPLE)) { + // check if sample is not already added: + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), stableSampleId, false); + if (sample != null) { + // get internal sample id if sample exists + internalSampleId = sample.getInternalId(); + } else { + // sample is new, so attempt to add to DB + internalSampleId = (stableSampleId.length() > 0) ? 
addSampleToDatabase(stableSampleId, fields, headerIndexMap, internalPatientId) + : -1; + } + + // validate and count: + if (internalSampleId != -1) { + // some minimal validation/fail safe for now: only continue if patientId is same + // as patient id in + // existing sample (can occur in case of this.isSupplementalData or in case of + // parsing bug in addSampleToDatabase): + if (internalPatientId != DaoSample.getSampleById(internalSampleId).getInternalPatientId()) { + throw new RuntimeException("Error: Sample " + stableSampleId + + " was previously linked to another patient, and not to " + stablePatientId); + } + numSamplesProcessed++; + } + } + } + + // if the resource id or url matches one of the missing values, skip this resource: + if ((resourceIdIndex != -1 && MissingValues.has(fields[resourceIdIndex])) || (urlIndex != -1 && MissingValues.has(fields[urlIndex]))) { + numEmptyResourcesSkipped++; + } else { + // if patient_id column exists and resource type is patient + if (getResourceType() == ResourceType.PATIENT && internalPatientId != -1) { + validateAddDatum(internalPatientId, stablePatientId, fields[resourceIdIndex], fields[urlIndex], + ResourceType.PATIENT, patientResourceIdSet, resourceMap); + } + // if sample_id column exists and resource type is sample + else if (getResourceType() == ResourceType.SAMPLE && internalSampleId != -1) { + validateAddDatum(internalSampleId, stableSampleId, fields[resourceIdIndex], fields[urlIndex], + ResourceType.SAMPLE, sampleResourceIdSet, resourceMap); + } + // if resource type is study + else if (getResourceType() == ResourceType.STUDY) { + validateAddDatum(cancerStudy.getInternalId(), cancerStudy.getCancerStudyStableId(), fields[resourceIdIndex], fields[urlIndex], + ResourceType.STUDY, studyResourceIdSet, resourceMap); + } + } + return true; + } + + private Map makeHeaderIndexMap(String[] headerNames) { + Map headerIndexMap = new HashMap(); + for (int i= 0; i < headerNames.length; i++) { + 
headerIndexMap.put(headerNames[i], i); + } + return headerIndexMap; + } + + private int findPatientIdColumn(Map headerIndexMap) { + return findColumnIndexInHeaders(PATIENT_ID_COLUMN_NAME, headerIndexMap); + } + + private int findSampleIdColumn(Map headerIndexMap) { + return findColumnIndexInHeaders(SAMPLE_ID_COLUMN_NAME, headerIndexMap); + } + + private int findResourceIdColumn(Map headerIndexMap) { + return findColumnIndexInHeaders(RESOURCE_ID_COLUMN_NAME, headerIndexMap); + } + + private int findURLColumn(Map headerIndexMap) { + return findColumnIndexInHeaders(URL_COLUMN_NAME, headerIndexMap); + } + + private int findSampleTypeColumn(Map headerIndexMap) { + return findColumnIndexInHeaders(SAMPLE_TYPE_COLUMN_NAME, headerIndexMap); + } + + private int findColumnIndexInHeaders(String columnHeader, Map headerIndexMap) { + return headerIndexMap.getOrDefault(columnHeader, -1); + } + + private int addPatientToDatabase(String patientId) throws Exception { + int internalPatientId = -1; + + if (validPatientId(patientId)) { + // in case of PATIENT data import and patient == null : + if (getResourceType() == ResourceType.PATIENT) { + // not finding the patient it unexpected (as SAMPLE data import should always + // precede it), but + // can happen when this patient does not have any samples for example. In any + // case, warn about it: + ProgressMonitor.logWarning("Patient " + patientId + + " being added for the first time. Apparently this patient was not in the samples file, or the samples file is not yet loaded (should be loaded before this one)"); + } + internalPatientId = DaoPatient.addPatient(new Patient(cancerStudy, patientId)); + } + return internalPatientId; + } + + private int addSampleToDatabase(String sampleId, String[] fields, Map headerIndexMap, int internalPatientId) + throws Exception { + int sampleTypeIndex = findSampleTypeColumn(headerIndexMap); + String sampleTypeStr = (sampleTypeIndex != -1) ? 
fields[sampleTypeIndex] : null; + if (sampleTypeStr != null) { + // want to match Sample.Type enum names + sampleTypeStr = sampleTypeStr.trim().toUpperCase().replaceAll(" ", "_"); + } + Sample.Type sampleType = Sample.Type.has(sampleTypeStr) ? Sample.Type.valueOf(sampleTypeStr) : null; + + int internalSampleId = -1; + if (validSampleId(sampleId) && !StableIdUtil.isNormal(sampleId)) { + // want to try and capture normal sample types based on value for SAMPLE_TYPE + // if present in resource data + if (sampleType != null && sampleType.isNormal()) { + return internalSampleId; + } + sampleId = StableIdUtil.getSampleId(sampleId); + if (internalPatientId != -1) { + internalSampleId = DaoSample.addSample( + new Sample(sampleId, internalPatientId, cancerStudy.getTypeOfCancerId(), sampleTypeStr)); + } + } + + return internalSampleId; + } + + private boolean validPatientId(String patientId) { + return (patientId != null && !patientId.isEmpty()); + } + + private boolean validSampleId(String sampleId) { + return (sampleId != null && !sampleId.isEmpty()); + } + + private void validateAddDatum(int internalId, String stableId, String resourceId, String resourceURL, ResourceType resourceType, Set resourceSet, MultiKeyMap resourceMap) throws Exception { + // throw exception if resource definition is not exist in the database + if (!resourceSet.contains(resourceId)) { + throw new RuntimeException("Error: " + resourceType.toString().toLowerCase() + " " + stableId + + " with resource " + resourceId + + " does not have matching resources information in database, please make sure to include resource definition in the resource definition file"); + } + // The resourceMap makes sure a pair of (internalId/resource_id/url) is unique + // added to the DB if there are no duplicates, + if (!resourceMap.containsKey(internalId, resourceId, resourceURL)) { + addDatum(internalId, resourceId, resourceURL,resourceType); + resourceMap.put(internalId, resourceId, resourceURL, resourceURL); + } + // 
handle duplicates + // if the "relaxed" flag was given, and the new record e.g. tries to ignore a duplicated resources and log a warning + else if (!relaxed) { + throw new RuntimeException("Error: Duplicated " + resourceType.toString().toLowerCase() + " resource in file"); + } + // log a warning + else if (!resourceMap.get(internalId, resourceId, resourceURL).equals(resourceURL)) { + ProgressMonitor.logWarning("Error: Duplicated " + resourceType.toString().toLowerCase() + " " + stableId + + " with different values for " + resourceType.toString().toLowerCase() + " resource " + resourceId + + "\n\tValues: " + resourceId + " " + resourceURL); + } + } + + // add datum for patient, sample and study resources + private void addDatum(int internalId, String resourceId, String resourceURL, ResourceType resourceType) throws Exception { + // if bulk loading is ever turned off, we need to check if + // resource value exists and if so, perfom an update + if (resourceType.equals(ResourceType.PATIENT)) { + numPatientSpecificResourcesAdded++; + DaoResourceData.addPatientDatum(internalId, resourceId, resourceURL); + } else if (resourceType.equals(ResourceType.SAMPLE)) { + numSampleSpecificResourcesAdded++; + DaoResourceData.addSampleDatum(internalId, resourceId, resourceURL); + } else { + numStudySpecificResourcesAdded++; + DaoResourceData.addStudyDatum(internalId, resourceId, resourceURL); + } + } + + public int getNumSampleSpecificResourcesAdded() { + return numSampleSpecificResourcesAdded; + } + + public int getNumPatientSpecificResourcesAdded() { + return numPatientSpecificResourcesAdded; + } + + public int getNumStudySpecificResourcesAdded() { + return numStudySpecificResourcesAdded; + } + + public int getNumEmptyResourcesSkipped() { + return numEmptyResourcesSkipped; + } + + public int getNumSamplesProcessed() { + return numSamplesProcessed; + } + + /** + * The type of resource found in the file. Basically the type of import + * running for this instance. 
Can be one of ResourceType. + * + * @return + */ + public ResourceType getResourceType() { + return resourceType; + } + + /** + * Imports resource definition and resource data (from the worksheet) + */ + public void run() { + try { + String progName = "importResourceData"; + String description = "Import resource files."; + // usage: --data --meta --loadMode + // [directLoad|bulkLoad (default)] [--noprogress] + + OptionParser parser = new OptionParser(); + OptionSpec data = parser.accepts("data", "profile data file").withRequiredArg() + .describedAs("data_file.txt").ofType(String.class); + OptionSpec meta = parser.accepts("meta", "meta (description) file").withOptionalArg() + .describedAs("meta_file.txt").ofType(String.class); + OptionSpec study = parser.accepts("study", "cancer study id").withOptionalArg().describedAs("study") + .ofType(String.class); + OptionSpec relaxedFlag = parser.accepts("r", + "(not recommended) Flag for relaxed mode, determining how to handle detected data harmonization problems in the same study") + .withOptionalArg().describedAs("r").ofType(String.class); + parser.accepts("loadMode", "direct (per record) or bulk load of data").withOptionalArg() + .describedAs("[directLoad|bulkLoad (default)]").ofType(String.class); + parser.accepts("noprogress", + "this option can be given to avoid the messages regarding memory usage and % complete"); + + OptionSet options = null; + try { + options = parser.parse(args); + } catch (OptionException e) { + throw new UsageException(progName, description, parser, e.getMessage()); + } + File resourceFile = null; + if (options.has(data)) { + resourceFile = new File(options.valueOf(data)); + } else { + throw new UsageException(progName, description, parser, "'data' argument required."); + } + String resourceType = null; + boolean relaxed = false; + String cancerStudyStableId = null; + if (options.has(study)) { + cancerStudyStableId = options.valueOf(study); + } + if (options.has(meta)) { + properties = new 
TrimmedProperties(); + properties.load(new FileInputStream(options.valueOf(meta))); + resourceType = properties.getProperty("resource_type"); + cancerStudyStableId = properties.getProperty("cancer_study_identifier"); + } + if (options.has(relaxedFlag)) { + relaxed = true; + + } + SpringUtil.initDataSource(); + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); + if (cancerStudy == null) { + throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); + } + ProgressMonitor.setCurrentMessage("Reading data from: " + resourceFile.getAbsolutePath()); + int numLines = FileUtil.getNumLines(resourceFile); + ProgressMonitor.setCurrentMessage(" --> total number of lines: " + numLines); + ProgressMonitor.setMaxValue(numLines); + + setFile(cancerStudy, resourceFile, resourceType, relaxed); + importData(); + + if (getResourceType() == ResourceType.PATIENT) { + ProgressMonitor.setCurrentMessage("Total number of patient specific resources added: " + + getNumPatientSpecificResourcesAdded()); + } + if (getResourceType() == ResourceType.SAMPLE) { + ProgressMonitor.setCurrentMessage("Total number of sample specific resources added: " + + getNumSampleSpecificResourcesAdded()); + ProgressMonitor.setCurrentMessage("Total number of samples processed: " + getNumSamplesProcessed()); + } + ProgressMonitor.setCurrentMessage("Total number of resource values skipped because of empty value: " + + getNumEmptyResourcesSkipped()); + if (getResourceType() == ResourceType.SAMPLE + && (getNumSampleSpecificResourcesAdded() + getNumSamplesProcessed()) == 0) { + // should not occur: + throw new RuntimeException("No sample resources data was added. " + "Please check your file format and try again."); + } + if (getResourceType() == ResourceType.PATIENT + && getNumPatientSpecificResourcesAdded() == 0) { + // could occur if patient resource file is given with only PATIENT_ID column: + throw new RuntimeException("No patient resources data was added. 
" + + "Please check your file format and try again. If you only have sample resources data, then a patients file with only PATIENT_ID column is not required."); + } + if (getResourceType() == ResourceType.STUDY + && getNumStudySpecificResourcesAdded() == 0) { + throw new RuntimeException("No study resource data was added. " + + "Please check your file format and try again."); + } + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Makes an instance to run with the given command line arguments. + * + * @param args the command line arguments to be used + */ + public ImportResourceData(String[] args) { + super(args); + } + + /** + * Runs the command as a script and exits with an appropriate exit code. + * + * @param args the arguments given on the command line + */ + public static void main(String[] args) { + ConsoleRunnable runner = new ImportResourceData(args); + runner.runInConsole(); + } +} diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java new file mode 100644 index 00000000000..e36801fcf67 --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java @@ -0,0 +1,331 @@ +package org.mskcc.cbio.portal.scripts; + +import org.cbioportal.model.ResourceType; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.util.*; + +import java.io.*; +import joptsimple.*; +import java.util.*; + +public class ImportResourceDefinition extends ConsoleRunnable { + + public static final String DELIMITER = "\t"; + public static final String RESOURCE_ID_COLUMN_NAME = "RESOURCE_ID"; + public static final String DISPLAY_NAME_COLUMN_NAME = "DISPLAY_NAME"; + public static final String DESCRIPTION_COLUMN_NAME = "DESCRIPTION"; + public static final String RESOURCE_TYPE_COLUMN_NAME = "RESOURCE_TYPE"; + public static 
final String OPEN_BY_DEFAULT_COLUMN_NAME = "OPEN_BY_DEFAULT"; + public static final String PRIORITY_COLUMN_NAME = "PRIORITY"; + private int numResourceDefinitionsAdded = 0; + + private static Properties properties; + + private File resourceDataFile; + private CancerStudy cancerStudy; + private boolean relaxed; + + public void setFile(CancerStudy cancerStudy, File resourceDataFile, boolean relaxed) { + this.cancerStudy = cancerStudy; + this.resourceDataFile = resourceDataFile; + this.relaxed = relaxed; + } + + public void importData() throws Exception { + // if bulkLoading is ever turned off, + // code has to be added to check whether + // a resource data update should be + // perform instead of an insert + MySQLbulkLoader.bulkLoadOn(); + + if (relaxed) { + MySQLbulkLoader.relaxedModeOn(); + } + + FileReader reader = new FileReader(resourceDataFile); + BufferedReader buff = new BufferedReader(reader); + + String line = buff.readLine(); + String[] headerNames = splitFields(line); + Map headerIndexMap = makeHeaderIndexMap(headerNames); + + // validate columns and get index + int resourceIdIndex = findAndValidateResourceIdColumn(headerIndexMap); + int displayNameIndex = findAndValidateDisplayNameColumn(headerIndexMap); + int descriptionIndex = findAndValidateDescriptionColumn(headerIndexMap); + int resourceTypeIndex = findAndValidateResourceTypeColumn(headerIndexMap); + int openByDefaultIndex = findAndValidateOpenByDefaultColumn(headerIndexMap); + int priorityIndex = findAndValidatePriorityColumn(headerIndexMap); + + while ((line = buff.readLine()) != null) { + if (skipLine(line.trim())) { + continue; + } + + String[] fieldValues = getFieldValues(line, headerIndexMap); + + // set default value + String resourceId = ""; + String displayName = ""; + String description = ""; + ResourceType resourceType = null; + Boolean openByDefault = false; + int priority = 1; + // get resource definitions from columns + // get resourceId + if 
(isValueNotMissing(fieldValues[resourceIdIndex].toUpperCase())) { + resourceId = fieldValues[resourceIdIndex].toUpperCase(); + } else { + throw new RuntimeException( + "Please provide a valid resource id"); + } + // get displayName + if (isValueNotMissing(fieldValues[displayNameIndex])) { + displayName = fieldValues[displayNameIndex]; + } + else { + throw new RuntimeException( + "Please provide a valid display name"); + } + // get description (optional) + if (isValueNotMissing(fieldValues[descriptionIndex])) { + description = fieldValues[descriptionIndex]; + } + // get resourceType (must be value of ResourceTypes) + if (isValidResourceType(fieldValues[resourceTypeIndex])) { + resourceType = ResourceType.valueOf(fieldValues[resourceTypeIndex]); + } + // get openByDefault (optional) + if (isValidOpenByDefault(fieldValues[openByDefaultIndex])) { + openByDefault = Boolean.parseBoolean(fieldValues[openByDefaultIndex]); + } + // get priority (optional) + try { + priority = Integer.parseInt(fieldValues[priorityIndex]); + } catch (NumberFormatException ex) { + throw new DaoException( + "priority cannot be parsed as an integer, all priority should be an integer."); + } + + // add resource definitions into database + ResourceDefinition resource = new ResourceDefinition(resourceId, displayName, + description, resourceType, openByDefault, + priority, cancerStudy.getInternalId()); + + ResourceDefinition resourceInDb = DaoResourceDefinition.getDatum(resource.getResourceId(), + cancerStudy.getInternalId()); + if (resourceInDb != null) { + ProgressMonitor.logWarning("Resource " + resourceInDb.getResourceId() + " found twice in your study!"); + continue; + } + if (DaoResourceDefinition.addDatum(resource) > 0) { + numResourceDefinitionsAdded++; + } + } + buff.close(); + + if (MySQLbulkLoader.isBulkLoad()) { + MySQLbulkLoader.flushAll(); + MySQLbulkLoader.relaxedModeOff(); + } + } + + private String[] splitFields(String line) throws IOException { + return line.split(DELIMITER, -1); + 
} + + private boolean skipLine(String line) { + return line.isEmpty(); + } + + /** + * Takes in the given line and returns the list of field values by splitting the + * line on DELIMITER. + * + * @param line + * @param headerIndexMap + * @return the list of values, one for each column. Value will be "" for empty + * columns. + */ + private String[] getFieldValues(String line, Map headerIndexMap) { + // split on delimiter: + String[] fieldValues = line.split(DELIMITER, -1); + + // validate: if number of fields is incorrect, give exception + if (fieldValues.length != headerIndexMap.size()) { + throw new IllegalArgumentException("Number of columns in line is not as expected. Expected: " + + headerIndexMap.size() + " columns, found: " + fieldValues.length + ", for line: " + line); + } + + // now iterate over lines and trim each value: + for (int i = 0; i < fieldValues.length; i++) { + fieldValues[i] = fieldValues[i].trim(); + } + return fieldValues; + } + + private Boolean isValueNotMissing(String value) { + return !MissingValues.has(value); + } + + private Boolean isValidResourceType(String value) { + try { + ResourceType.valueOf(value); + } catch (IllegalArgumentException ex) { + throw new IllegalArgumentException("Resource_Type should be one of the following : 'SAMPLE', 'PATIENT' or 'STUDY'." 
+ + "found: " + value); + } + return true; + } + + private Boolean isValidOpenByDefault(String value) { + if (value.equalsIgnoreCase("true") || value.equalsIgnoreCase("false")) { + return true; + } + // open by default value not valid, will set as false by default + ProgressMonitor.logWarning("OpenByDefault value is not true or false, set to false by default."); + return false; + } + + private Map makeHeaderIndexMap(String[] headerNames) { + Map headerIndexMap = new HashMap(); + for (int i= 0; i < headerNames.length; i++) { + headerIndexMap.put(headerNames[i], i); + } + return headerIndexMap; + } + + private int findAndValidateResourceIdColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(RESOURCE_ID_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidateDisplayNameColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(DISPLAY_NAME_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidateDescriptionColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(DESCRIPTION_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidateResourceTypeColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(RESOURCE_TYPE_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidateOpenByDefaultColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(OPEN_BY_DEFAULT_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidatePriorityColumn(Map headerIndexMap) { + return findAndValidateColumnIndexInHeaders(PRIORITY_COLUMN_NAME, headerIndexMap); + } + + private int findAndValidateColumnIndexInHeaders(String columnHeader, Map headerIndexMap) { + if (headerIndexMap.containsKey(columnHeader)) { + return headerIndexMap.get(columnHeader); + } + throw new RuntimeException("Aborting owing to failure to find " + columnHeader + + " in file. 
Please check your file format and try again."); + } + + public int getNumResourceDefinitionsAdded() { + return numResourceDefinitionsAdded; + } + + /** + * Imports resource definition data (from the worksheet) + */ + public void run() { + try { + String progName = "importResourceDefinition"; + String description = "Import resource definition file"; + // usage: --data --meta --loadMode + // [directLoad|bulkLoad (default)] [--noprogress] + + OptionParser parser = new OptionParser(); + OptionSpec data = parser.accepts("data", "profile data file").withRequiredArg() + .describedAs("data_file.txt").ofType(String.class); + OptionSpec meta = parser.accepts("meta", "meta (description) file").withOptionalArg() + .describedAs("meta_file.txt").ofType(String.class); + OptionSpec study = parser.accepts("study", "cancer study id").withOptionalArg().describedAs("study") + .ofType(String.class); + OptionSpec relaxedFlag = parser.accepts("r", + "(not recommended) Flag for relaxed mode, determining how to handle detected data harmonization problems in the same study") + .withOptionalArg().describedAs("r").ofType(String.class); + parser.accepts("loadMode", "direct (per record) or bulk load of data").withOptionalArg() + .describedAs("[directLoad|bulkLoad (default)]").ofType(String.class); + parser.accepts("noprogress", + "this option can be given to avoid the messages regarding memory usage and % complete"); + + OptionSet options = null; + try { + options = parser.parse(args); + } catch (OptionException e) { + throw new UsageException(progName, description, parser, e.getMessage()); + } + File resourceFile = null; + if (options.has(data)) { + resourceFile = new File(options.valueOf(data)); + } else { + throw new UsageException(progName, description, parser, "'data' argument required."); + } + boolean relaxed = false; + String cancerStudyStableId = null; + if (options.has(study)) { + cancerStudyStableId = options.valueOf(study); + } + if (options.has(meta)) { + properties = new 
TrimmedProperties(); + properties.load(new FileInputStream(options.valueOf(meta))); + cancerStudyStableId = properties.getProperty("cancer_study_identifier"); + } + if (options.has(relaxedFlag)) { + relaxed = true; + + } + SpringUtil.initDataSource(); + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); + if (cancerStudy == null) { + throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); + } + ProgressMonitor.setCurrentMessage("Reading data from: " + resourceFile.getAbsolutePath()); + int numLines = FileUtil.getNumLines(resourceFile); + ProgressMonitor.setCurrentMessage(" --> total number of lines: " + numLines); + ProgressMonitor.setMaxValue(numLines); + + setFile(cancerStudy, resourceFile, relaxed); + importData(); + + // log import information + ProgressMonitor.setCurrentMessage( + "Total number of resource definitions added: " + getNumResourceDefinitionsAdded()); + if (getNumResourceDefinitionsAdded() == 0) { + throw new RuntimeException( + "No resource definition was added. " + "Please check your file format and try again."); + } + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Makes an instance to run with the given command line arguments. + * + * @param args the command line arguments to be used + */ + public ImportResourceDefinition(String[] args) { + super(args); + } + + /** + * Runs the command as a script and exits with an appropriate exit code. 
+ * + * @param args the arguments given on the command line + */ + public static void main(String[] args) { + ConsoleRunnable runner = new ImportResourceDefinition(args); + runner.runInConsole(); + } +} diff --git a/core/src/main/java/org/mskcc/cbio/portal/servlet/CnaJSON.java b/core/src/main/java/org/mskcc/cbio/portal/servlet/CnaJSON.java index d48d53a7873..85d4075b108 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/servlet/CnaJSON.java +++ b/core/src/main/java/org/mskcc/cbio/portal/servlet/CnaJSON.java @@ -36,6 +36,7 @@ import org.mskcc.cbio.portal.dao.*; import org.mskcc.cbio.portal.util.*; import org.apache.log4j.Logger; +import org.cbioportal.model.CNA; import org.codehaus.jackson.map.ObjectMapper; import java.io.*; @@ -296,8 +297,8 @@ private Map> getDrugs(List cnaEvents, boolean fdaOnl for (CnaEvent cnaEvent : cnaEvents) { long gene = cnaEvent.getEntrezGeneId(); - if (cnaEvent.getAlteration()==CnaEvent.CNA.AMP - ||cnaEvent.getAlteration()==CnaEvent.CNA.GAIN) { // since drugs are usually intibiting + if (cnaEvent.getAlteration()==CNA.AMP + ||cnaEvent.getAlteration()==CNA.GAIN) { // since drugs are usually intibiting genes.add(gene); } @@ -471,15 +472,15 @@ private void exportCopyNumberSegment(List list, CopyNumberSegment seg) list.add(row); } - private static final Map>> gisticMap // map from cancer study id - = new HashMap>>(); // to map from gene to a list of params + private static final Map>> gisticMap // map from cancer study id + = new HashMap>>(); // to map from gene to a list of params - private static List getGistic(int cancerStudyId, String gene, CnaEvent.CNA cna) throws DaoException { - Map> mapGeneGistic; + private static List getGistic(int cancerStudyId, String gene, CNA cna) throws DaoException { + Map> mapGeneGistic; synchronized(gisticMap) { mapGeneGistic = gisticMap.get(cancerStudyId); if (mapGeneGistic == null) { - mapGeneGistic = new HashMap>(); + mapGeneGistic = new HashMap>(); gisticMap.put(cancerStudyId, mapGeneGistic); List 
gistics = DaoGistic.getAllGisticByCancerStudyId(cancerStudyId); for (Gistic g : gistics) { @@ -491,9 +492,9 @@ private static List getGistic(int cancerStudyId, String gene, CnaEvent.CNA cna) l.add(g.getqValue()); l.add(genes.size()); for (String hugo : genes) { - Map mapCC = mapGeneGistic.get(hugo); + Map mapCC = mapGeneGistic.get(hugo); if (mapCC==null) { - mapCC = new EnumMap(CnaEvent.CNA.class); + mapCC = new EnumMap(CNA.class); mapGeneGistic.put(hugo, mapCC); } mapCC.put(cna,l); @@ -502,7 +503,7 @@ private static List getGistic(int cancerStudyId, String gene, CnaEvent.CNA cna) } } - Map m = mapGeneGistic.get(gene); + Map m = mapGeneGistic.get(gene); return m==null ? null : m.get(cna); } diff --git a/core/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/core/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index a4fccccd0c4..b5b362270ea 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/core/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -226,6 +226,74 @@ public static OptionSet parseStandardDataAndStudyOptions(String[] args, String d "Error: 'study' argument required."); } + return options; + } + + /** + * Default method to be used when Importer class main method expects only 'data' and 'meta' as mandatory options + * and an optional 'loadMode' parameter and an optional 'update-info' parameter + * + * @param args: the same args given to main() method of the tool + * @param description: short description of the tool (to display in the usage line if necessary) + * @param hasLoadMode: set to true to let this method validate whether the command line argument loadMode was given + * + * @return the parsed options + */ + public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, String description, boolean hasLoadMode) { + // using a real options parser, helps avoid bugs + OptionParser parser = new OptionParser(); + parser.accepts("noprogress", "this option can be given to avoid the 
messages regarding memory usage and % complete"); + OptionSpec help = parser.accepts( "help", "print this help info" ); + parser.accepts( "data", "profile data file" ).withRequiredArg().describedAs( "data_file.txt" ).ofType( String.class ); + parser.accepts( "update-info", "Update information for existing entities in the database").withOptionalArg().ofType(String.class); + parser.accepts( "meta", "meta (description) file" ).withRequiredArg().describedAs( "meta_file.txt" ).ofType( String.class ); + if (hasLoadMode) { + parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) + .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); + } + String progName = "importScript"; + + OptionSet options = null; + try { + options = parser.parse( args ); + } catch (OptionException e) { + throw new UsageException(progName, description, parser, + e.getMessage()); + } + + if( options.has( help ) ){ + throw new UsageException(progName, description, parser); + } + + //these extra checks are needed, since withRequiredArg above only indicated that the option + //has a mandatory argument but does not make the option itself mandatory. 
+ if(!options.has("data")) { + throw new UsageException(progName, description, parser, + "Error: 'data' argument required."); + } + + if(!options.has("meta")) { + throw new UsageException(progName, description, parser, + "Error: 'meta' argument required."); + } + + if (hasLoadMode) { + if( options.has( "loadMode" ) ){ + String actionArg = (String) options.valueOf( "loadMode" ); + if (actionArg.equalsIgnoreCase("directLoad")) { + MySQLbulkLoader.bulkLoadOff(); + } else if (actionArg.equalsIgnoreCase( "bulkLoad" )) { + MySQLbulkLoader.bulkLoadOn(); + } else { + throw new UsageException(progName, description, parser, + "Error: unknown loadMode action: " + actionArg); + } + } + else { + throw new UsageException(progName, description, parser, + "Error: 'loadMode' argument required."); + } + } return options; } } diff --git a/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index aa62eb6f27c..8d5be0c93de 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -82,6 +82,19 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D throw new RuntimeException("Error: genetic_profile record found with same Stable ID as the one used in your data: " + existingGeneticProfile.getStableId() + ". 
Remove the existing genetic_profile record first."); } else if (geneticProfile.getDatatype().equals("FUSION")) { + String svStableId = existingGeneticProfile.getStableId().replace("mutations", "fusion"); + // check if structural variant genetic profile already exists for fusions + // if an auto-generated _fusion genetic profile exists, do not attempt to add it again + // otherwise, exception is thrown causing import to exit + // populate the structural variant genetic profile for fusions + GeneticProfile existingSVGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(svStableId); + if (existingSVGeneticProfile == null ) { + GeneticProfile gp = new GeneticProfile(geneticProfile); + gp.setGeneticAlterationType(GeneticAlterationType.STRUCTURAL_VARIANT); + gp.setStableId(svStableId); + DaoGeneticProfile.addGeneticProfile(gp); + } + // return existing mutation genetic profile for fusions that are currently stored in the mutation_event table geneticProfile.setGeneticProfileId(existingGeneticProfile.getGeneticProfileId()); return geneticProfile; } else { diff --git a/core/src/main/java/org/mskcc/cbio/portal/util/GlobalProperties.java b/core/src/main/java/org/mskcc/cbio/portal/util/GlobalProperties.java index cfe5730212e..470df705168 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/util/GlobalProperties.java +++ b/core/src/main/java/org/mskcc/cbio/portal/util/GlobalProperties.java @@ -39,6 +39,7 @@ import org.springframework.security.core.context.SecurityContextHolder; import org.springframework.stereotype.Component; import org.springframework.util.ResourceUtils; +import org.springframework.util.StringUtils; import java.io.*; import java.nio.file.Files; @@ -142,8 +143,8 @@ public class GlobalProperties { public static final String STUDY_VIEW_MDACC_HEATMAP_URL = "mdacc.heatmap.study.url"; public static final String STUDY_VIEW_MDACC_HEATMAP_META_URL = "mdacc.heatmap.study.meta.url"; - public static final String ONCOKB_PUBLIC_API_URL = "oncokb.public_api.url";
public static final String SHOW_ONCOKB = "show.oncokb"; + public static final String ONCOKB_TOKEN = "oncokb.token"; private static String sessionServiceURL; @Value("${session.service.url:}") // default is empty string @@ -270,6 +271,10 @@ public class GlobalProperties { @Value("${show.genomenexus:true}") // default is true public void setShowGenomeNexus(String property) { showGenomeNexus = Boolean.parseBoolean(property); } + private static boolean showMutationMapperToolGrch38; + @Value("${show.mutation_mappert_tool.grch38:true}") // default is true + public void setShowMutationMapperToolGrch38(String property) { showMutationMapperToolGrch38 = Boolean.parseBoolean(property); } + private static boolean datRevokeOtherTokens; @Value("${dat.uuid.revoke_other_tokens:true}") // default is true public void setDatRevokeOtherTokens(String property) { datRevokeOtherTokens = Boolean.parseBoolean(property);} @@ -313,6 +318,10 @@ public static String parseUrl(String url) @Value("${genomenexus.url:v1.genomenexus.org}") // default public void setGenomeNexusApiUrl(String property) { genomeNexusApiUrl = parseUrl(property); } + private static String genomeNexusGrch38ApiUrl; + @Value("${genomenexus.url.grch38:grch38.genomenexus.org}") // default + public void setGenomeNexusGrch38ApiUrl(String property) { genomeNexusGrch38ApiUrl = parseUrl(property); } + private static String frontendUrl; @Value("${frontend.url:}") // default is empty string public void setFrontendUrl(String property) { frontendUrl = parseUrl(property); } @@ -858,28 +867,6 @@ public static String getSessionServicePassword() return sessionServicePassword; } - public static String getOncoKBPublicApiUrl() - { - String oncokbApiUrl = portalProperties.getProperty(ONCOKB_PUBLIC_API_URL); - String showOncokb = portalProperties.getProperty(SHOW_ONCOKB); - - if(showOncokb == null || showOncokb.isEmpty()) { - showOncokb = "true"; - } - - // Empty string should be used if you want to disable the OncoKB annotation. 
- if(oncokbApiUrl == null || oncokbApiUrl.isEmpty()) { - oncokbApiUrl = "oncokb.org/api/v1"; - } - - if(showOncokb.equals("true")) { - return oncokbApiUrl; - } else { - return ""; - } - - } - public static String getCivicUrl() { return civicUrl; } @@ -888,6 +875,10 @@ public static String getGenomeNexusApiUrl() { return genomeNexusApiUrl; } + public static String getGenomeNexusGrch38ApiUrl() { + return genomeNexusGrch38ApiUrl; + } + public static boolean showOncoKB() { String showOncokb = portalProperties.getProperty(SHOW_ONCOKB); if (showOncokb==null || showOncokb.isEmpty()) { @@ -922,6 +913,10 @@ public static boolean showGenomeNexus() { return showGenomeNexus; } + public static boolean showMutationMapperToolGrch38() { + return showMutationMapperToolGrch38; + } + public static String getFrontendUrl() { if (frontendUrlRuntime.length() > 0) { try { @@ -1189,4 +1184,8 @@ public static String getDatMethod() { public static String getReferenceGenomeName() { return portalProperties.getProperty(UCSC_BUILD, DEFAULT_UCSC_BUILD); } + + public static String getOncoKbToken() { + return portalProperties.getProperty(ONCOKB_TOKEN, null); + } } \ No newline at end of file diff --git a/core/src/main/java/org/mskcc/cbio/portal/util/MissingValues.java b/core/src/main/java/org/mskcc/cbio/portal/util/MissingValues.java new file mode 100644 index 00000000000..d52dad8ef4a --- /dev/null +++ b/core/src/main/java/org/mskcc/cbio/portal/util/MissingValues.java @@ -0,0 +1,39 @@ +package org.mskcc.cbio.portal.util; + +public enum MissingValues { + NOT_APPLICABLE("Not Applicable"), + NOT_AVAILABLE("Not Available"), + PENDING("Pending"), + DISCREPANCY("Discrepancy"), + COMPLETED("Completed"), + NULL("null"), MISSING(""), + NA("NA"); + + private String propertyName; + + MissingValues(String propertyName) { + this.propertyName = propertyName; + } + + public String toString() { + return propertyName; + } + + static public boolean has(String value) { + if (value == null) + return false; + if 
(value.trim().equals("")) + return true; + try { + value = value.replaceAll("[\\[|\\]]", ""); + value = value.replaceAll(" ", "_"); + return valueOf(value.toUpperCase()) != null; + } catch (IllegalArgumentException x) { + return false; + } + } + + static public String getNotAvailable() { + return "[" + NOT_AVAILABLE.toString() + "]"; + } +} diff --git a/core/src/main/scripts/importer/allowed_data_types.txt b/core/src/main/scripts/importer/allowed_data_types.txt index 39d6bfbf28f..d2d276c6e22 100644 --- a/core/src/main/scripts/importer/allowed_data_types.txt +++ b/core/src/main/scripts/importer/allowed_data_types.txt @@ -35,6 +35,16 @@ MRNA_EXPRESSION CONTINUOUS mrna_seq_fpkm_capture MRNA_EXPRESSION Z-SCORE mrna_seq_fpkm_capture_Zscores MRNA_EXPRESSION CONTINUOUS mrna_seq_fpkm_polya MRNA_EXPRESSION Z-SCORE mrna_seq_fpkm_polya_Zscores +MRNA_EXPRESSION Z-SCORE mrna_U133_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE rna_seq_mrna_median_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_median_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE rna_seq_v2_mrna_median_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_seq_cpm_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_seq_tpm_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE rna_seq_mrna_capture_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_seq_fpkm_capture_all_sample_Zscores +MRNA_EXPRESSION Z-SCORE mrna_seq_fpkm_polya_all_sample_Zscores METHYLATION CONTINUOUS methylation_hm27 METHYLATION CONTINUOUS methylation_hm450 FUSION FUSION fusion diff --git a/core/src/main/scripts/importer/cbioportalImporter.py b/core/src/main/scripts/importer/cbioportalImporter.py index 5fbbb18c5a7..a5f04ce50c6 100755 --- a/core/src/main/scripts/importer/cbioportalImporter.py +++ b/core/src/main/scripts/importer/cbioportalImporter.py @@ -81,7 +81,7 @@ def update_study_status(jvm_args, study_id): args.append("--noprogress") # don't report memory usage and % progress run_java(*args) -def 
remove_study(jvm_args, meta_filename): +def remove_study_meta(jvm_args, meta_filename): args = jvm_args.split(' ') args.append(REMOVE_STUDY_CLASS) meta_dictionary = cbioportal_common.parse_metadata_file( @@ -94,8 +94,15 @@ def remove_study(jvm_args, meta_filename): args.append("--noprogress") # don't report memory usage and % progress run_java(*args) +def remove_study_id(jvm_args, study_id): + args = jvm_args.split(' ') + args.append(REMOVE_STUDY_CLASS) + args.append(study_id) + args.append("--noprogress") # don't report memory usage and % progress + run_java(*args) -def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictionary = None): + +def import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_file_dictionary = None): args = jvm_args.split(' ') # In case the meta file is already parsed in a previous function, it is not @@ -107,6 +114,11 @@ def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictiona # Retrieve meta file type meta_file_type = meta_file_dictionary['meta_file_type'] + # Do not update entities by default + shouldUpdateGenericAssayEntities = False + if update_generic_assay_entity != None and update_generic_assay_entity.casefold() == "True".casefold(): + shouldUpdateGenericAssayEntities = True + # invalid file, skip if meta_file_type is None: print(("Unrecognized meta file type '%s', skipping file" @@ -126,6 +138,12 @@ def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictiona args.append(meta_filename) args.append("--loadMode") args.append("bulkload") + if importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and shouldUpdateGenericAssayEntities: + args.append("--update-info") + args.append("True") + elif importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and not shouldUpdateGenericAssayEntities: + args.append("--update-info") + args.append("False") if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", 
"org.mskcc.cbio.portal.scripts.ImportGisticData"): args.append("--data") args.append(data_filename) @@ -179,19 +197,26 @@ def process_case_lists(jvm_args, case_list_dir): if not (case_list.startswith('.') or case_list.endswith('~')): import_case_list(jvm_args, os.path.join(case_list_dir, case_list)) -def process_command(jvm_args, command, meta_filename, data_filename): +def process_command(jvm_args, command, meta_filename, data_filename, study_ids, update_generic_assay_entity = None): if command == IMPORT_CANCER_TYPE: import_cancer_type(jvm_args, data_filename) elif command == IMPORT_STUDY: import_study(jvm_args, meta_filename) elif command == REMOVE_STUDY: - remove_study(jvm_args, meta_filename) + if study_ids == None: + remove_study_meta(jvm_args, meta_filename) + elif meta_filename == None: + study_ids = study_ids.split(",") + for study_id in study_ids: + remove_study_id(jvm_args, study_id) + else: + raise RuntimeError('Your command uses both -id and -meta. Please, use only one of the two parameters.') elif command == IMPORT_STUDY_DATA: - import_study_data(jvm_args, meta_filename, data_filename) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity) elif command == IMPORT_CASE_LIST: import_case_list(jvm_args, meta_filename) -def process_directory(jvm_args, study_directory): +def process_directory(jvm_args, study_directory, update_generic_assay_entity = None): """ Import an entire study directory based on meta files found. 
@@ -205,6 +230,8 @@ def process_directory(jvm_args, study_directory): study_meta_dictionary = {} cancer_type_filepairs = [] sample_attr_filepair = None + sample_resource_filepair = None + resource_definition_filepair = None regular_filepairs = [] gene_panel_matrix_filepair = None zscore_filepairs = [] @@ -257,6 +284,14 @@ def process_directory(jvm_args, study_directory): # Determine the study meta filename study_meta_filename = meta_filename study_meta_dictionary[study_meta_filename] = meta_dictionary + # Check for resource definitions + elif meta_file_type == MetaFileTypes.RESOURCES_DEFINITION: + if resource_definition_filepair is not None: + raise RuntimeError( + 'Multiple resource definition files found: {} and {}'.format( + resource_definition_filepair[0], meta_filename)) # pylint: disable=unsubscriptable-object + resource_definition_filepair = ( + meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])) # Check for sample attributes elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: @@ -265,6 +300,13 @@ def process_directory(jvm_args, study_directory): sample_attr_filepair[0], meta_filename)) # pylint: disable=unsubscriptable-object sample_attr_filepair = ( meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])) + elif meta_file_type == MetaFileTypes.SAMPLE_RESOURCES: + if sample_resource_filepair is not None: + raise RuntimeError( + 'Multiple sample resource files found: {} and {}'.format( + sample_resource_filepair[0], meta_filename)) # pylint: disable=unsubscriptable-object + sample_resource_filepair = ( + meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])) # Check for gene panel matrix elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX: gene_panel_matrix_filepair = ( @@ -299,7 +341,7 @@ def process_directory(jvm_args, study_directory): raise RuntimeError('No meta_study file found') else: # First remove study if exists - 
remove_study(jvm_args, study_meta_filename) + remove_study_meta(jvm_args, study_meta_filename) import_study(jvm_args, study_meta_filename) # Next, we need to import sample definitions @@ -307,37 +349,47 @@ def process_directory(jvm_args, study_directory): raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + + # Next, we need to import resource definitions for resource data + if resource_definition_filepair is not None: + meta_filename, data_filename = resource_definition_filepair + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + + # Next, we need to import sample definitions for resource data + if sample_resource_filepair is not None: + meta_filename, data_filename = sample_resource_filepair + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, import everything else except gene panel, fusion data, GSVA and # z-score expression. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. 
for meta_filename, data_filename in regular_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import fusion data (after mutation) if fusion_filepair is not None: meta_filename, data_filename = fusion_filepair - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import expression z-score (after expression) for meta_filename, data_filename in zscore_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import GSVA genetic profiles (after expression and z-scores) if gsva_score_filepair is not None: # First import the GSVA score data meta_filename, data_filename = gsva_score_filepair - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Second import the GSVA p-value data meta_filename, data_filename = gsva_pvalue_filepair - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) if gene_panel_matrix_filepair is not None: meta_filename, data_filename = gene_panel_matrix_filepair - import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) + import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) 
# Import the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') @@ -357,6 +409,7 @@ def usage(): '--command [%s] --study_directory ' '--meta_filename ' '--data_filename ' + '--study_ids ' '--properties-filename ' % (COMMANDS)), file=OUTPUT_FILE) def check_args(command): @@ -379,12 +432,7 @@ def check_dir(study_directory): print('Study cannot be found: ' + study_directory, file=ERROR_FILE) sys.exit(2) -def interface(): - parser = argparse.ArgumentParser(description='cBioPortal meta Importer') - parser.add_argument('-c', '--command', type=str, required=False, - help='Command for import. Allowed commands: import-cancer-type, ' - 'import-study, import-study-data, import-case-list or ' - 'remove-study') +def add_parser_args(parser): parser.add_argument('-s', '--study_directory', type=str, required=False, help='Path to Study Directory') parser.add_argument('-jar', '--jar_path', type=str, required=False, @@ -393,10 +441,43 @@ def interface(): help='Path to meta file') parser.add_argument('-data', '--data_filename', type=str, required=False, help='Path to Data file') + +def interface(): + parent_parser = argparse.ArgumentParser(description='cBioPortal meta Importer') + add_parser_args(parent_parser) + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(title='subcommands', dest='subcommand', + help='Command for import. 
Allowed commands: import-cancer-type, ' + 'import-study, import-study-data, import-case-list or ' + 'remove-study') + import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False) + import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False) + import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False) + import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False) + remove_study = subparsers.add_parser('remove-study', parents=[parent_parser], add_help=False) + + remove_study.add_argument('-id', '--study_ids', type=str, required=False, + help='Cancer Study IDs for `remove-study` command, comma separated') + parser.add_argument('-c', '--command', type=str, required=False, + help='This argument is outdated. Please use the listed subcommands, without the -c flag. ' + 'Command for import. Allowed commands: import-cancer-type, ' + 'import-study, import-study-data, import-case-list or ' + 'remove-study') + add_parser_args(parser) + parser.add_argument('-id', '--study_ids', type=str, required=False, + help='Cancer Study IDs for `remove-study` command, comma separated') + + parser.add_argument('-update', '--update_generic_assay_entity', type=str, required=False, + help='Set as True to update the existing generic assay entities, set as False to keep the existing generic assay entities for generic assay') # TODO - add same argument to metaimporter # TODO - harmonize on - and _ parser = parser.parse_args() + if parser.command is not None and parser.subcommand is not None: + print('Cannot call multiple commands') + sys.exit(2) + elif parser.subcommand is not None: + parser.command = parser.subcommand return parser @@ -449,11 +530,11 @@ def main(args): if study_directory != None: check_dir(study_directory) - process_directory(jvm_args, study_directory) + process_directory(jvm_args, study_directory, 
args.update_generic_assay_entity) else: check_args(args.command) check_files(args.meta_filename, args.data_filename) - process_command(jvm_args, args.command, args.meta_filename, args.data_filename) + process_command(jvm_args, args.command, args.meta_filename, args.data_filename, args.study_ids, args.update_generic_assay_entity) # ------------------------------------------------------------------------------ # ready to roll diff --git a/core/src/main/scripts/importer/cbioportal_common.py b/core/src/main/scripts/importer/cbioportal_common.py index d5fa8d55924..d17ba4a5859 100644 --- a/core/src/main/scripts/importer/cbioportal_common.py +++ b/core/src/main/scripts/importer/cbioportal_common.py @@ -62,6 +62,10 @@ class MetaFileTypes(object): GSVA_PVALUES = 'meta_gsva_pvalues' GENERIC_ASSAY = 'meta_generic_assay' STRUCTURAL_VARIANT = 'meta_structural_variants' + SAMPLE_RESOURCES = 'meta_resource_sample' + PATIENT_RESOURCES = 'meta_resource_patient' + STUDY_RESOURCES = 'meta_resource_study' + RESOURCES_DEFINITION = 'meta_resource_definition' # fields allowed in each meta file type, maps to True if required @@ -276,7 +280,27 @@ class MetaFileTypes(object): 'profile_description': True, 'data_filename': True, 'gene_panel': False, - } + }, + MetaFileTypes.SAMPLE_RESOURCES: { + 'cancer_study_identifier': True, + 'resource_type': True, + 'data_filename': True + }, + MetaFileTypes.PATIENT_RESOURCES: { + 'cancer_study_identifier': True, + 'resource_type': True, + 'data_filename': True + }, + MetaFileTypes.STUDY_RESOURCES: { + 'cancer_study_identifier': True, + 'resource_type': True, + 'data_filename': True + }, + MetaFileTypes.RESOURCES_DEFINITION: { + 'cancer_study_identifier': True, + 'resource_type': True, + 'data_filename': True + }, } IMPORTER_CLASSNAME_BY_META_TYPE = { @@ -301,7 +325,11 @@ class MetaFileTypes(object): MetaFileTypes.GSVA_SCORES: "org.mskcc.cbio.portal.scripts.ImportProfileData", MetaFileTypes.GSVA_PVALUES: "org.mskcc.cbio.portal.scripts.ImportProfileData", 
MetaFileTypes.GENERIC_ASSAY: "org.mskcc.cbio.portal.scripts.ImportProfileData", - MetaFileTypes.STRUCTURAL_VARIANT: "org.mskcc.cbio.portal.scripts.ImportProfileData" + MetaFileTypes.STRUCTURAL_VARIANT: "org.mskcc.cbio.portal.scripts.ImportProfileData", + MetaFileTypes.SAMPLE_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData", + MetaFileTypes.PATIENT_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData", + MetaFileTypes.STUDY_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData", + MetaFileTypes.RESOURCES_DEFINITION: "org.mskcc.cbio.portal.scripts.ImportResourceDefinition", } IMPORTER_REQUIRES_METADATA = { @@ -311,7 +339,9 @@ class MetaFileTypes(object): "org.mskcc.cbio.portal.scripts.ImportMutSigData" : False, "org.mskcc.cbio.portal.scripts.ImportProfileData" : True, "org.mskcc.cbio.portal.scripts.ImportTimelineData" : True, - "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap" : False + "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap" : False, + "org.mskcc.cbio.portal.scripts.ImportResourceData" : True, + "org.mskcc.cbio.portal.scripts.ImportResourceDefinition" : True } # ------------------------------------------------------------------------------ @@ -583,6 +613,15 @@ def get_meta_file_type(meta_dictionary, logger, filename): result = MetaFileTypes.STUDY elif 'type_of_cancer' in meta_dictionary: result = MetaFileTypes.CANCER_TYPE + elif 'cancer_study_identifier' in meta_dictionary and 'resource_type' in meta_dictionary: + if meta_dictionary['resource_type'] == 'PATIENT': + result = MetaFileTypes.PATIENT_RESOURCES + elif meta_dictionary['resource_type'] == 'SAMPLE': + result = MetaFileTypes.SAMPLE_RESOURCES + elif meta_dictionary['resource_type'] == 'STUDY': + result = MetaFileTypes.STUDY_RESOURCES + elif meta_dictionary['resource_type'] == 'DEFINITION': + result = MetaFileTypes.RESOURCES_DEFINITION else: logger.error('Could not determine the file type. Did not find expected meta file fields. 
Please check your meta files for correct configuration.', extra={'filename_': filename}) diff --git a/core/src/main/scripts/importer/metaImport.py b/core/src/main/scripts/importer/metaImport.py index afdf5c0bcef..798c74b9ffa 100755 --- a/core/src/main/scripts/importer/metaImport.py +++ b/core/src/main/scripts/importer/metaImport.py @@ -105,6 +105,8 @@ def interface(): 'report. For example, set this to a high number to ' 'report all genes that could not be loaded, instead ' 'of reporting "(GeneA, GeneB, GeneC, 213 more)".') + parser.add_argument('-update', '--update_generic_assay_entity', type=str, required=False, default="False", + help='Set as True to update the existing generic assay entities, set as False to keep the existing generic assay entities for generic assay') parser = parser.parse_args() return parser diff --git a/core/src/main/scripts/importer/validateData.py b/core/src/main/scripts/importer/validateData.py index e5521d6610a..ce2e3ed5386 100755 --- a/core/src/main/scripts/importer/validateData.py +++ b/core/src/main/scripts/importer/validateData.py @@ -46,6 +46,7 @@ from base64 import urlsafe_b64encode import math from abc import ABCMeta, abstractmethod +from urllib.parse import urlparse # configure relative imports if running as a script; see PEP 366 # it might passed as empty string by certain tooling to mark a top level module @@ -74,6 +75,10 @@ mutation_file_sample_ids = set() fusion_file_sample_ids = set() +# resource globals +RESOURCE_DEFINITION_DICTIONARY = {} +RESOURCE_PATIENTS_WITH_SAMPLES = None + # globals required for gene set scoring validation prior_validated_sample_ids = None prior_validated_geneset_ids = None @@ -103,7 +108,11 @@ cbioportal_common.MetaFileTypes.GSVA_SCORES:'GsvaScoreValidator', cbioportal_common.MetaFileTypes.GSVA_PVALUES:'GsvaPvalueValidator', cbioportal_common.MetaFileTypes.GENERIC_ASSAY:'GenericAssayValidator', - cbioportal_common.MetaFileTypes.STRUCTURAL_VARIANT:'StructuralVariantValidator' + 
cbioportal_common.MetaFileTypes.STRUCTURAL_VARIANT:'StructuralVariantValidator', + cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES:'SampleResourceValidator', + cbioportal_common.MetaFileTypes.PATIENT_RESOURCES:'PatientResourceValidator', + cbioportal_common.MetaFileTypes.STUDY_RESOURCES:'StudyResourceValidator', + cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION:'ResourceDefinitionValidator', } @@ -2461,7 +2470,10 @@ def checkLine(self, data): pass elif data_type == 'NUMBER': if not self.checkFloat(value): - self.logger.error( + if (value[0] in ('>','<')) and value[1:].isdigit(): + pass + else: + self.logger.error( 'Value of numeric attribute is not a real number', extra={'line_number': self.line_number, 'column_number': col_index + 1, @@ -3487,6 +3499,354 @@ def checkLine(self, data): finally: self.logger.logger.removeHandler(tracking_handler) +class ResourceDefinitionValidator(Validator): + # 'RESOURCE_ID', 'RESOURCE_TYPE', 'DISPLAY_NAME' are required + REQUIRE_COLUMN_ORDER = False + REQUIRED_HEADERS = ['RESOURCE_ID', 'RESOURCE_TYPE', 'DISPLAY_NAME'] + NULL_VALUES = ["[not applicable]", "[not available]", "[pending]", "[discrepancy]", "[completed]", "[null]", "", "na"] + RESOURCE_TYPES = ["SAMPLE", "PATIENT", "STUDY"] + ALLOW_BLANKS = True + + def __init__(self, *args, **kwargs): + """Initialize a ResourceDefinitionValidator with the given parameters.""" + super(ResourceDefinitionValidator, self).__init__(*args, **kwargs) + self.resource_definition_dictionary = {} + + def checkLine(self, data): + """Check the values in a line of data.""" + super(ResourceDefinitionValidator, self).checkLine(data) + resource_id = '' + resource_type = '' + for col_index, col_name in enumerate(self.cols): + # treat cells beyond the end of the line as blanks, + # super().checkLine() has already logged an error + value = '' + if col_index < len(data): + value = data[col_index].strip() + + # make sure that RESOURCE_ID is present + if col_name == 'RESOURCE_ID': + if 
value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing RESOURCE_ID', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + else: + resource_id = value + + # make sure that RESOURCE_TYPE is present and correct + if col_name == 'RESOURCE_TYPE': + if value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing RESOURCE_TYPE', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + elif value.strip() not in self.RESOURCE_TYPES: + self.logger.error( + 'RESOURCE_TYPE is not one of the following : SAMPLE, PATIENT or STUDY', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + else: + resource_type = value + + # make sure that DISPLAY_NAME is present + if col_name == 'DISPLAY_NAME': + if value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing DISPLAY_NAME', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + + # pass for other null columns + if value.strip().lower() in self.NULL_VALUES: + pass + + # validate if OPEN_BY_DEFAULT and priority are correct + if col_name == 'OPEN_BY_DEFAULT': + if not (value.strip().lower() == 'true' or value.strip().lower() == 'false'): + self.logger.error( + 'wrong value of OPEN_BY_DEFAULT, should be true or false', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + + if col_name == 'PRIORITY': + try: + int(value.strip()) + except ValueError: + self.logger.error( + 'wrong value of PRIORITY, the value should be integer', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + # add resource_id into dictionary + self.resource_definition_dictionary.setdefault(resource_id, []).append(resource_type) + +class ResourceValidator(Validator): + + """Abstract Validator class for resource data files. 
+ + Subclasses define the columns that must be present in REQUIRED_HEADERS. + """ + + REQUIRE_COLUMN_ORDER = False + NULL_VALUES = ["[not applicable]", "[not available]", "[pending]", "[discrepancy]", "[completed]", "[null]", "", "na"] + ALLOW_BLANKS = True + + INVALID_ID_CHARACTERS = r'[^A-Za-z0-9._-]' + + def __init__(self, *args, **kwargs): + """Initialize the instance attributes of the data file validator.""" + super(ResourceValidator, self).__init__(*args, **kwargs) + + def url_validator(self, url): + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except: + return False + + def checkLine(self, data): + """Check the values in a line of data.""" + super(ResourceValidator, self).checkLine(data) + for col_index, col_name in enumerate(self.cols): + # treat cells beyond the end of the line as blanks, + # super().checkLine() has already logged an error + value = '' + if col_index < len(data): + value = data[col_index].strip() + + # make sure that RESOURCE_ID is present + if col_name == 'RESOURCE_ID': + if value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing RESOURCE_ID', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + # make sure that RESOURCE_ID is defined in the resource definition file + if value not in RESOURCE_DEFINITION_DICTIONARY: + self.logger.error( + 'RESOURCE_ID is not defined in resource definition file', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + # if not blank, check if values match the datatype + if value.strip().lower() in self.NULL_VALUES: + pass + + if col_name == 'URL': + # value should be a url + # Characters that affect netloc parsing under NFKC normalization will raise ValueError + if self.url_validator(value.strip()): + pass + else: + self.logger.error( + 'Value of resource is not an url, url should start with http or https', + extra={'line_number': self.line_number, + 'column_number': 
col_index + 1, + 'column_name': col_name, + 'cause': value}) + # make sure that PATIENT_ID is present + if col_name == 'PATIENT_ID': + if value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing PATIENT_ID', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + + if col_name == 'PATIENT_ID' or col_name == 'SAMPLE_ID': + if re.findall(self.INVALID_ID_CHARACTERS, value): + self.logger.error( + 'PATIENT_ID and SAMPLE_ID can only contain letters, ' + 'numbers, points, underscores and/or hyphens', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + +class SampleResourceValidator(ResourceValidator): + """Validator for files defining and setting sample-level attributes.""" + + REQUIRED_HEADERS = ['SAMPLE_ID', 'PATIENT_ID', 'RESOURCE_ID', 'URL'] + + def __init__(self, *args, **kwargs): + """Initialize a SampleResourceValidator with the given parameters.""" + super(SampleResourceValidator, self).__init__(*args, **kwargs) + self.sample_id_lines = {} + self.sampleIds = self.sample_id_lines.keys() + self.patient_ids = set() + self.defined_resources = {} + + def checkLine(self, data): + """Check the values in a line of data.""" + super(SampleResourceValidator, self).checkLine(data) + resource_id = '' + sample_id = '' + resource_url = '' + for col_index, col_name in enumerate(self.cols): + # treat cells beyond the end of the line as blanks, + # super().checkLine() has already logged an error + value = '' + if col_index < len(data): + value = data[col_index].strip() + # make sure RESOURCE_ID is defined correctly + if col_name == 'RESOURCE_ID': + if value not in RESOURCE_DEFINITION_DICTIONARY or 'SAMPLE' not in RESOURCE_DEFINITION_DICTIONARY[value]: + self.logger.error( + 'RESOURCE_ID for sample resource is not defined correctly in resource definition file', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + if value in 
RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + self.logger.warning( + 'RESOURCE_ID for sample resource has been used by more than one RESOURCE_TYPE', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': RESOURCE_DEFINITION_DICTIONARY[value]}) + resource_id = value + if col_name == 'SAMPLE_ID': + if value.strip().lower() in self.NULL_VALUES: + self.logger.error( + 'Missing SAMPLE_ID', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + continue + if value not in self.sample_id_lines: + self.sample_id_lines[value] = self.line_number + sample_id = value + elif col_name == 'PATIENT_ID': + self.patient_ids.add(value) + if col_name == 'URL': + resource_url = value + # check duplicate + if (resource_id, sample_id, resource_url) in self.defined_resources: + self.logger.error( + 'Duplicated resources found', + extra={'line_number': self.line_number, + 'duplicated_line_number': self.defined_resources[(resource_id, sample_id, resource_url)]}) + else: + self.defined_resources[(resource_id, sample_id, resource_url)] = self.line_number + +class PatientResourceValidator(ResourceValidator): + + REQUIRED_HEADERS = ['PATIENT_ID', 'RESOURCE_ID', 'URL'] + + def __init__(self, *args, **kwargs): + """Initialize a PatientResourceValidator with the given parameters.""" + super(PatientResourceValidator, self).__init__(*args, **kwargs) + self.patient_id_lines = {} + self.defined_resources = {} + + def checkLine(self, data): + """Check the values in a line of data.""" + super(PatientResourceValidator, self).checkLine(data) + resource_id = '' + patient_id = '' + resource_url = '' + for col_index, col_name in enumerate(self.cols): + # treat cells beyond the end of the line as blanks, + # super().checkLine() has already logged an error + value = '' + if col_index < len(data): + value = data[col_index].strip() + # make sure RESOURCE_ID is defined correctly + if col_name == 
'RESOURCE_ID': + if value not in RESOURCE_DEFINITION_DICTIONARY or 'PATIENT' not in RESOURCE_DEFINITION_DICTIONARY[value]: + self.logger.error( + 'RESOURCE_ID for patient resource is not defined correctly in resource definition file', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + self.logger.warning( + 'RESOURCE_ID for patient resource has been used by more than one RESOURCE_TYPE', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': RESOURCE_DEFINITION_DICTIONARY[value]}) + resource_id = value + if col_name == 'PATIENT_ID': + if value not in RESOURCE_PATIENTS_WITH_SAMPLES: + self.logger.warning( + 'Resource data defined for a patient with ' + 'no samples', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + if value not in self.patient_id_lines: + self.patient_id_lines[value] = self.line_number + patient_id = value + if col_name == 'URL': + resource_url = value + # check duplicate + if (resource_id, patient_id, resource_url) in self.defined_resources: + self.logger.error( + 'Duplicated resources found', + extra={'line_number': self.line_number, + 'duplicated_line_number': self.defined_resources[(resource_id, patient_id, resource_url)]}) + else: + self.defined_resources[(resource_id, patient_id, resource_url)] = self.line_number + + def onComplete(self): + """Perform final validations based on the data parsed.""" + for patient_id in RESOURCE_PATIENTS_WITH_SAMPLES: + if patient_id not in self.patient_id_lines: + self.logger.warning( + 'Missing resource data for a patient associated with ' + 'samples', + extra={'cause': patient_id}) + super(PatientResourceValidator, self).onComplete() + +class StudyResourceValidator(ResourceValidator): + + REQUIRED_HEADERS = ['RESOURCE_ID', 'URL'] + + def __init__(self, *args, **kwargs): + """Initialize a 
StudyResourceValidator with the given parameters.""" + super(StudyResourceValidator, self).__init__(*args, **kwargs) + self.defined_resources = {} + + def checkLine(self, data): + """Check the values in a line of data.""" + super(StudyResourceValidator, self).checkLine(data) + resource_id = '' + resource_url = '' + for col_index, col_name in enumerate(self.cols): + # treat cells beyond the end of the line as blanks, + # super().checkLine() has already logged an error + value = '' + if col_index < len(data): + value = data[col_index].strip() + # make sure RESOURCE_ID is defined correctly + if col_name == 'RESOURCE_ID': + if value not in RESOURCE_DEFINITION_DICTIONARY or 'STUDY' not in RESOURCE_DEFINITION_DICTIONARY[value]: + self.logger.error( + 'RESOURCE_ID for study resource is not defined correctly in resource definition file', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': value}) + if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + self.logger.warning( + 'RESOURCE_ID for study resource has been used by more than one RESOURCE_TYPE', + extra={'line_number': self.line_number, + 'column_number': col_index + 1, + 'cause': RESOURCE_DEFINITION_DICTIONARY[value]}) + resource_id = value + if col_name == 'URL': + resource_url = value + # check duplicate + if (resource_id, resource_url) in self.defined_resources: + self.logger.error( + 'Duplicated resources found', + extra={'line_number': self.line_number, + 'duplicated_line_number': self.defined_resources[(resource_id, resource_url)]}) + else: + self.defined_resources[(resource_id, resource_url)] = self.line_number class GisticGenesValidator(Validator): @@ -3810,7 +4170,7 @@ def parseFeatureColumns(self, nonsample_col_vals): """Check the feature id column.""" - ALLOWED_CHARACTERS = r'[^A-Za-z0-9_-]' + ALLOWED_CHARACTERS = r'[^A-Za-z0-9_.-]' feature_id = nonsample_col_vals[0].strip() @@ -3821,7 +4181,7 @@ def parseFeatureColumns(self, 
nonsample_col_vals): elif re.search(ALLOWED_CHARACTERS, feature_id) is not None: self.logger.error('Feature id contains one or more illegal characters', extra={'line_number': self.line_number, - 'cause': 'id was`'+feature_id+'` and only alpha-numeric, _ and - are allowed.'}) + 'cause': 'id was`'+feature_id+'` and only alpha-numeric, _, . and - are allowed.'}) else: # Check if this is the second data file if self.get_prior_validated_feature_ids() is not None: @@ -4709,6 +5069,8 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ global DEFINED_SAMPLE_IDS global DEFINED_SAMPLE_ATTRIBUTES global PATIENTS_WITH_SAMPLES + global RESOURCE_DEFINITION_DICTIONARY + global RESOURCE_PATIENTS_WITH_SAMPLES if portal_instance.cancer_type_dict is None: logger.warning('Skipping validations relating to cancer types ' @@ -4804,6 +5166,61 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ validators_by_meta_type[ cbioportal_common.MetaFileTypes.PATIENT_ATTRIBUTES])}) + # validate resources definition before validate the other resources data + if cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION in validators_by_meta_type: + if len(validators_by_meta_type[ + cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION]) > 1: + logger.error( + 'Multiple resource definition files detected', + extra={'cause': ', '.join( + validator.filenameShort for validator in + validators_by_meta_type[ + cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION])}) + for resources_definition_validator in validators_by_meta_type[ + cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION]: + resources_definition_validator.validate() + RESOURCE_DEFINITION_DICTIONARY = resources_definition_validator.resource_definition_dictionary + + # then validate the resource data if exist + if cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES in validators_by_meta_type: + if len(validators_by_meta_type[ + cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES]) > 1: + 
logger.error( + 'Multiple sample resources files detected', + extra={'cause': ', '.join( + validator.filenameShort for validator in + validators_by_meta_type[ + cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES])}) + + # parse the data file(s) that define sample IDs valid for this study + defined_resource_sample_ids = None + for sample_validator in validators_by_meta_type[ + cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES]: + sample_validator.validate() + if sample_validator.fileCouldBeParsed: + if defined_resource_sample_ids is None: + defined_resource_sample_ids = set() + # include parsed sample IDs in the set (union) + defined_resource_sample_ids |= sample_validator.sampleIds + # this will be set if a file was successfully parsed + if defined_resource_sample_ids is None: + logger.error("Sample file could not be parsed. Please fix " + "the problems found there first before continuing.") + if not relaxed_mode: + return + RESOURCE_PATIENTS_WITH_SAMPLES = sample_validator.patient_ids + + if cbioportal_common.MetaFileTypes.PATIENT_RESOURCES in validators_by_meta_type: + if len(validators_by_meta_type.get( + cbioportal_common.MetaFileTypes.PATIENT_RESOURCES, + [])) > 1: + logger.error( + 'Multiple patient resources files detected', + extra={'cause': ', '.join( + validator.filenameShort for validator in + validators_by_meta_type[ + cbioportal_common.MetaFileTypes.PATIENT_RESOURCES])}) + # then validate the study tags YAML file if it exists if tags_file_path is not None: validateStudyTags(tags_file_path, logger=logger) @@ -4812,6 +5229,8 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ # skip cancer type and clinical files, they have already been validated if meta_file_type in (cbioportal_common.MetaFileTypes.CANCER_TYPE, cbioportal_common.MetaFileTypes.SAMPLE_ATTRIBUTES, + cbioportal_common.MetaFileTypes.RESOURCES_DEFINITION, + cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES, cbioportal_common.MetaFileTypes.GENE_PANEL_MATRIX): 
continue for validator in sorted( diff --git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportGenericAssayData.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportGenericAssayData.java index 4d578fe198c..090019118b9 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportGenericAssayData.java +++ b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportGenericAssayData.java @@ -69,7 +69,7 @@ public void testImportTreatmentData() throws Exception { File file = new File("src/test/resources/treatments/data_treatment_ic50.txt"); // import data and test all treatments were added - ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "NAME,DESCRIPTION,URL"); + ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "NAME,DESCRIPTION,URL", true); assertEquals(10, getNumRecordsForGenericAssay()); // test wether a record can be retrieved via stable id @@ -80,6 +80,12 @@ public void testImportTreatmentData() throws Exception { assertEquals("Name of Irinotecan", treatment1.getGenericEntityMetaProperties().get("NAME")); assertEquals("Desc of Irinotecan", treatment1.getGenericEntityMetaProperties().get("DESCRIPTION")); assertEquals("Url of Irinotecan", treatment1.getGenericEntityMetaProperties().get("URL")); + + // test fields are updated after loading new treatment file + File fileNewDesc = new File("src/test/resources/treatments/data_treatment_ic50_newdesc.txt"); + ImportGenericAssayEntity.importData(fileNewDesc, GeneticAlterationType.GENERIC_ASSAY, "NAME,DESCRIPTION,URL", true); + GenericAssayMeta treatment2 = DaoGenericAssay.getGenericAssayMetaByStableId("Irinotecan"); + assertEquals("New desc of Irinotecan", treatment2.getGenericEntityMetaProperties().get("DESCRIPTION")); } @Test @@ -91,7 +97,7 @@ public void testImportGenericAssayData() throws Exception { File file = new File("src/test/resources/data_mutational_signature.txt"); // import data and test all mutational signatures 
were added - ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description"); + ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false); assertEquals(61, getNumRecordsForGenericAssay()); // test wether a record can be retrieved via stable id @@ -101,6 +107,18 @@ public void testImportGenericAssayData() throws Exception { // Test whether fields were populated correctly assertEquals("mean_1", genericAssayMeta1.getGenericEntityMetaProperties().get("name")); assertEquals("mean_1", genericAssayMeta1.getGenericEntityMetaProperties().get("description")); + + // // test fields should not be updated after loading new generic assay meta file + File fileNewDesc = new File("src/test/resources/data_mutational_signature_new.txt"); + ImportGenericAssayEntity.importData(fileNewDesc, GeneticAlterationType.GENERIC_ASSAY, "name,description", false); + GenericAssayMeta genericAssayMeta2 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1"); + assertEquals("mean_1", genericAssayMeta2.getGenericEntityMetaProperties().get("description")); + + // // test fields should be updated after loading new generic assay meta file + ImportGenericAssayEntity.importData(fileNewDesc, GeneticAlterationType.GENERIC_ASSAY, "name,description", true); + GenericAssayMeta genericAssayMeta3 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1"); + assertEquals("new mean_1", genericAssayMeta3.getGenericEntityMetaProperties().get("description")); + } private int getNumRecordsForGenericAssay() { diff --git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java index a4e3e68027f..a688257c3af 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java +++ b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java @@ -57,6 +57,7 @@ import org.mskcc.cbio.portal.model.CnaEvent; 
import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.GeneticAlterationType; import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; @@ -218,6 +219,41 @@ public void testImportSplitMutationsFile() throws Exception { assertEquals("1", clinicalData.get(1).getAttrVal()); } + @Test + public void testImportSplitFusionsFile() throws Exception { + /* + * Check case where study has multiple fusions file. + * i.e somatic and germline fusions are in seperate files + * Check that an SV genetic profile is created. + * Check that the second fusion file does not insert duplicate genetic profile. + */ + String svStudyStableId = "study_tcga_pub_fusion"; + GeneticProfile svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(svStudyStableId); + assertNull(svGeneticProfile); + + String[] args = { + "--data","src/test/resources/splitFusionsData/data_fusions.txt", + "--meta","src/test/resources/splitFusionsData/meta_fusions.txt", + "--loadMode", "bulkLoad" + }; + ImportProfileData runner = new ImportProfileData(args); + runner.run(); + svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(svStudyStableId); + assertNotNull(svGeneticProfile); + + // load a second fusions file - new genetic profile not created + String[] secondArgs = { + "--data","src/test/resources/splitFusionsData/data_fusions_gml.txt", + "--meta","src/test/resources/splitFusionsData/meta_fusions_gml.txt", + "--loadMode", "bulkLoad" + }; + ImportProfileData secondRunner = new ImportProfileData(secondArgs); + secondRunner.run(); + svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(svStudyStableId); + assertNotNull(svGeneticProfile); + assertEquals(GeneticAlterationType.STRUCTURAL_VARIANT, svGeneticProfile.getGeneticAlterationType()); + } + @Test public void testImportGermlineOnlyFile() throws Exception { /* Mutations file split over two files with same stable id */ diff 
--git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestIntegrationTest.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestIntegrationTest.java index 31d98233d73..3bc4a5e8838 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestIntegrationTest.java +++ b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestIntegrationTest.java @@ -178,13 +178,13 @@ public void testLoadStudyEs0() throws Throwable { } assertEquals(countFusions, 5); - // Is there a separate fusion profile? -> false + // Is there a separate fusion profile? -> true GeneticProfileMapperLegacy geneticProfileMapperLegacy = applicationContext.getBean(GeneticProfileMapperLegacy.class); geneticProfileStableIds = new ArrayList(); geneticProfileStableIds.add("study_es_0_fusion"); List geneticProfiles = geneticProfileMapperLegacy .getGeneticProfiles(geneticProfileStableIds); - assertEquals(geneticProfiles.size(), 0); + assertEquals(geneticProfiles.size(), 1); //===== Check STRUCTURAL VARIANT data ======== // 45 structural variant events are imported, using 31 unique genes, using 39 samples diff --git a/core/src/test/resources/splitFusionsData/data_fusions.txt b/core/src/test/resources/splitFusionsData/data_fusions.txt new file mode 100644 index 00000000000..584de8d64a3 --- /dev/null +++ b/core/src/test/resources/splitFusionsData/data_fusions.txt @@ -0,0 +1,3 @@ +Hugo_Symbol Entrez_Gene_Id Center Tumor_Sample_Barcode Fusion DNA_support RNA_support Method Frame Comments +CD79B 0 MEDICAL-PLACE TCGA-AA-3664-01 TTLL7-CD79B fusion yes unknown unknown There are some comments about the fusion event. +TTLL7 0 MEDICAL-PLACE TCGA-AA-3664-01 TTLL7-CD79B fusion yes unknown unknown More comments about this sample. 
diff --git a/core/src/test/resources/splitFusionsData/data_fusions_gml.txt b/core/src/test/resources/splitFusionsData/data_fusions_gml.txt new file mode 100644 index 00000000000..9d678b873cc --- /dev/null +++ b/core/src/test/resources/splitFusionsData/data_fusions_gml.txt @@ -0,0 +1,3 @@ +Hugo_Symbol Entrez_Gene_Id Center Tumor_Sample_Barcode Fusion DNA_support RNA_support Method Frame Comments Fusion_Status +PMS2 0 MEDICAL-PLACE TCGA-AA-3664-01 PMS2-intragenic loss yes unknown unknown This person has some data. GERMLINE +RB1 0 MEDICAL-PLACE TCGA-AA-3664-01 RB1-intragenic loss yes unknown unknown Some fake sentence to simulate a comment. GERMLINE diff --git a/core/src/test/resources/splitFusionsData/meta_fusions.txt b/core/src/test/resources/splitFusionsData/meta_fusions.txt new file mode 100644 index 00000000000..7241057edd4 --- /dev/null +++ b/core/src/test/resources/splitFusionsData/meta_fusions.txt @@ -0,0 +1,7 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_mutations +datatype: FUSION +genetic_alteration_type: FUSION +show_profile_in_analysis_tab: true +profile_description: Fusions. +profile_name: Fusions diff --git a/core/src/test/resources/splitFusionsData/meta_fusions_gml.txt b/core/src/test/resources/splitFusionsData/meta_fusions_gml.txt new file mode 100644 index 00000000000..ffa574b5935 --- /dev/null +++ b/core/src/test/resources/splitFusionsData/meta_fusions_gml.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: FUSION +datatype: FUSION +stable_id: study_tcga_pub_mutations +profile_name: Fusions (GML) +profile_description: Fusion data derived from mutation file. 
(GML) +show_profile_in_analysis_tab: false +data_filename: data_fusions_gml.txt diff --git a/core/src/test/scripts/test_data/data_resource_definition_missing_resourceId.txt b/core/src/test/scripts/test_data/data_resource_definition_missing_resourceId.txt new file mode 100644 index 00000000000..6907dcb3541 --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_definition_missing_resourceId.txt @@ -0,0 +1,4 @@ +RESOURCE_ID DISPLAY_NAME DESCRIPTION RESOURCE_TYPE OPEN_BY_DEFAULT PRIORITY +PATHOLOGY_SLIDE Pathology Slide The pathology slide for the sample SAMPLE TRUE 1 + Patient Notes Notes about the patient PATIENT FALSE 2 +STUDY_SPONSORS Study Sponsors Sponsors of this study STUDY TRUE 3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_is_not_url.txt b/core/src/test/scripts/test_data/data_resource_is_not_url.txt new file mode 100644 index 00000000000..182b15ef310 --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_is_not_url.txt @@ -0,0 +1,4 @@ +PATIENT_ID SAMPLE_ID RESOURCE_ID URL +TCGA-A2-A04P TCGA-A2-A04P-01 PATHOLOGY_SLIDE not_a_url +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 +TCGA-A2-A0CM TCGA-A2-A0CM-01 PATHOLOGY_SLIDE http://url-to-slide-sample3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_patient_duplicate.txt b/core/src/test/scripts/test_data/data_resource_patient_duplicate.txt new file mode 100644 index 00000000000..cd805fac4f6 --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_patient_duplicate.txt @@ -0,0 +1,4 @@ +PATIENT_ID RESOURCE_ID URL +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_patient_valid.txt b/core/src/test/scripts/test_data/data_resource_patient_valid.txt new file 
mode 100644 index 00000000000..ad9b93a8229 --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_patient_valid.txt @@ -0,0 +1,4 @@ +PATIENT_ID RESOURCE_ID URL +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 +TCGA-A2-A0CM PATIENT_NOTES http://url-to-patient-notes-patient3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_sample_duplicate.txt b/core/src/test/scripts/test_data/data_resource_sample_duplicate.txt new file mode 100644 index 00000000000..6dd97c15c43 --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_sample_duplicate.txt @@ -0,0 +1,4 @@ +PATIENT_ID SAMPLE_ID RESOURCE_ID URL +TCGA-A2-A04P TCGA-A2-A04P-01 PATHOLOGY_SLIDE http://url-to-slide-sample1 +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_sample_valid.txt b/core/src/test/scripts/test_data/data_resource_sample_valid.txt new file mode 100644 index 00000000000..241b0330d9e --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_sample_valid.txt @@ -0,0 +1,4 @@ +PATIENT_ID SAMPLE_ID RESOURCE_ID URL +TCGA-A2-A04P TCGA-A2-A04P-01 PATHOLOGY_SLIDE http://url-to-slide-sample1 +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 +TCGA-A2-A0CM TCGA-A2-A0CM-01 PATHOLOGY_SLIDE http://url-to-slide-sample3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/data_resource_study_duplicate.txt b/core/src/test/scripts/test_data/data_resource_study_duplicate.txt new file mode 100644 index 00000000000..8d0b6a3474e --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_study_duplicate.txt @@ -0,0 +1,3 @@ +RESOURCE_ID URL +STUDY_SPONSORS http://url-to-study-sponsors1 +STUDY_SPONSORS http://url-to-study-sponsors1 \ No newline at end of 
file diff --git a/core/src/test/scripts/test_data/data_resource_study_valid.txt b/core/src/test/scripts/test_data/data_resource_study_valid.txt new file mode 100644 index 00000000000..9502811d11a --- /dev/null +++ b/core/src/test/scripts/test_data/data_resource_study_valid.txt @@ -0,0 +1,3 @@ +RESOURCE_ID URL +STUDY_SPONSORS http://url-to-study-sponsors1 +STUDY_SPONSORS http://url-to-study-sponsors2 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/data_resource_definition.txt b/core/src/test/scripts/test_data/study_es_0/data_resource_definition.txt new file mode 100644 index 00000000000..7ead4f0a88e --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/data_resource_definition.txt @@ -0,0 +1,4 @@ +RESOURCE_ID DISPLAY_NAME DESCRIPTION RESOURCE_TYPE OPEN_BY_DEFAULT PRIORITY +PATHOLOGY_SLIDE Pathology Slide The pathology slide for the sample SAMPLE TRUE 1 +PATIENT_NOTES Patient Notes Notes about the patient PATIENT FALSE 2 +STUDY_SPONSORS Study Sponsors Sponsors of this study STUDY TRUE 3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/data_resource_patient.txt b/core/src/test/scripts/test_data/study_es_0/data_resource_patient.txt new file mode 100644 index 00000000000..ad9b93a8229 --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/data_resource_patient.txt @@ -0,0 +1,4 @@ +PATIENT_ID RESOURCE_ID URL +TCGA-A2-A04P PATIENT_NOTES http://url-to-patient-notes-patient1 +TCGA-A1-A0SK PATIENT_NOTES http://url-to-patient-notes-patient2 +TCGA-A2-A0CM PATIENT_NOTES http://url-to-patient-notes-patient3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/data_resource_sample.txt b/core/src/test/scripts/test_data/study_es_0/data_resource_sample.txt new file mode 100644 index 00000000000..241b0330d9e --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/data_resource_sample.txt @@ -0,0 +1,4 @@ +PATIENT_ID SAMPLE_ID RESOURCE_ID URL +TCGA-A2-A04P 
TCGA-A2-A04P-01 PATHOLOGY_SLIDE http://url-to-slide-sample1 +TCGA-A1-A0SK TCGA-A1-A0SK-01 PATHOLOGY_SLIDE http://url-to-slide-sample2 +TCGA-A2-A0CM TCGA-A2-A0CM-01 PATHOLOGY_SLIDE http://url-to-slide-sample3 \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/data_resource_study.txt b/core/src/test/scripts/test_data/study_es_0/data_resource_study.txt new file mode 100644 index 00000000000..4f81bc201ec --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/data_resource_study.txt @@ -0,0 +1,2 @@ +RESOURCE_ID URL +STUDY_SPONSORS http://url-to-study-sponsors \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/meta_resource_definition.txt b/core/src/test/scripts/test_data/study_es_0/meta_resource_definition.txt new file mode 100644 index 00000000000..9016a0ffaf8 --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/meta_resource_definition.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_es_0 +resource_type: DEFINITION +data_filename: data_resource_definition.txt \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/meta_resource_patient.txt b/core/src/test/scripts/test_data/study_es_0/meta_resource_patient.txt new file mode 100644 index 00000000000..2dc33f724d2 --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/meta_resource_patient.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_es_0 +resource_type: PATIENT +data_filename: data_resource_patient.txt \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/meta_resource_sample.txt b/core/src/test/scripts/test_data/study_es_0/meta_resource_sample.txt new file mode 100644 index 00000000000..a255170402f --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/meta_resource_sample.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_es_0 +resource_type: SAMPLE +data_filename: data_resource_sample.txt \ No newline at end of file diff --git 
a/core/src/test/scripts/test_data/study_es_0/meta_resource_study.txt b/core/src/test/scripts/test_data/study_es_0/meta_resource_study.txt new file mode 100644 index 00000000000..8a2e2fd2957 --- /dev/null +++ b/core/src/test/scripts/test_data/study_es_0/meta_resource_study.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_es_0 +resource_type: STUDY +data_filename: data_resource_study.txt \ No newline at end of file diff --git a/core/src/test/scripts/test_data/study_es_0/result_report.html b/core/src/test/scripts/test_data/study_es_0/result_report.html index 2f5c367b65a..f71c510450d 100644 --- a/core/src/test/scripts/test_data/study_es_0/result_report.html +++ b/core/src/test/scripts/test_data/study_es_0/result_report.html @@ -1279,6 +1279,218 @@

data_mutations_extended.maf

+
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of file
InfoValidation of file complete
InfoRead 4 lines. Lines with warning: 0. Lines with error: 0
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of file
InfoValidation of file complete
InfoRead 4 lines. Lines with warning: 0. Lines with error: 0
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of file
InfoValidation of file complete
InfoRead 4 lines. Lines with warning: 0. Lines with error: 0
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of file
InfoValidation of file complete
InfoRead 2 lines. Lines with warning: 0. Lines with error: 0
+
+
+
+
+
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of meta file
InfoValidation of meta file complete
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of meta file
InfoValidation of meta file complete
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of meta file
InfoValidation of meta file complete
+
+
+
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Line NumberColumn NumberMessageValue Encountered
DebugStarting validation of meta file
InfoValidation of meta file complete
+
+
+
+