diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8dd399a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3225814 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Enforce Linux-style line endings for all text files +* text=auto eol=lf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..6f00ac0 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,92 @@ +name: CI + +on: + push: + pull_request: + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install lint dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[dev] + - name: Lint + run: flake8 src tests tools + - name: Validate packaged registry + run: python tools/check_registry.py + - name: Validate notebooks + run: python tools/check_notebooks.py + - name: Check notebook exports + run: python tools/export_notebooks.py --check + - name: Check README sync + run: python tools/gen_readme.py --check + + docs-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install docs extras + run: | + python -m pip install --upgrade pip + python -m pip install .[docs] + - name: Export notebooks and README + run: | + python tools/export_notebooks.py --check + python tools/gen_readme.py --check + - name: Build docs + run: mkdocs build --strict + + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: | + python -m pip install --upgrade pip + python -m pip install .[test] + - name: Test + run: pytest + + 
build-dist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build twine + - name: Build distributions + run: python -m build + - name: Validate metadata + run: python -m twine check dist/* + - name: Check packaged files + run: python tools/check_dist.py dist + - name: Install built wheel and smoke-test it + run: | + python -m pip install --force-reinstall --no-deps dist/*.whl + python - <<'PY' + import atomref as ar + + assert ar.get_covalent_radius('C') == 0.76 + assert ar.get_vdw_radius('C') == 1.77 + assert 'atomic_radius' in ar.list_quantities() + assert 'rahm2016' in ar.list_dataset_ids('atomic_radius', usage_role='support') + PY diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..418ce0d --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,25 @@ +name: Docs + +on: + push: + branches: [main, master] + workflow_dispatch: + +jobs: + build-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install docs extras + run: | + python -m pip install --upgrade pip + python -m pip install .[docs] + - name: Check generated files + run: | + python tools/export_notebooks.py --check + python tools/gen_readme.py --check + - name: Build docs + run: mkdocs build --strict diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..18d2c3a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,127 @@ +# Changelog + +## 0.1.4 - 2026-03-15 + +### Added + +- `LookupResult.transfer_depth`, which records how many transfer steps were + involved in the returned numeric value. 
+- Explicit nested-policy safeguards for `LinearTransfer` via: + - `fit_sources` + - `fit_max_depth` + - `prediction_sources` + - `prediction_max_depth` +- Regression tests covering generic-policy cycles, wrapper-policy cycles, + conservative nested-fit defaults, and explicit opt-in for deeper nested + linear workflows. + +### Changed + +- Nested policy-backed linear transfers are now guarded in two phases: + conservative defaults are used for fit training, while one additional nested + completion step remains allowed at prediction time. +- Linear-transfer fitting now distinguishes direct predictor values from nested + policy-derived predictor values. +- Cycle detection now tracks both generic policies and wrapper policies using a + context-local activation stack, so recursion through freshly materialized + wrapper policies is detected reliably and safely. +- Radii and X–H convenience helpers now resolve through wrapper-aware cycle + tracking rather than materializing a fresh generic policy for each public + lookup call. + +### Documentation + +- Expanded the transfer and policy docs to explain nested-policy safeguards, + `transfer_depth`, and cycle detection. +- Added guidance on when chained correlations are scientifically reasonable and + how to opt in deliberately when broader fit training is desired. + +## 0.1.3 - 2026-03-15 + +### Added + +- Support for using generic policies and wrapper policies as transfer sources in + `SubstitutionTransfer` and `LinearTransfer`. +- Public `atomref.xh` module docs and examples for policy-backed predictor + workflows. + +### Changed + +- `LinearTransfer` now treats predictors as **sources** rather than only raw + datasets, while still keeping the current runtime to one predictor at a time. +- Generic policy resolution now supports blocked element keys, which is used by + the X–H helper to prevent invalid `H` parent-element lookups. 
+- Transfer results now preserve nested-policy provenance through + `resolved_from` and explanatory notes when a policy source is involved. + +## 0.1.2 - 2026-03-15 + +### Added + +- New `xh_bond_length` quantity family. +- Packaged provisional X–H dataset `csd_legacy_xh_cno` with ConQuest/CSD + hydrogen-normalisation targets for `C`, `N`, and `O`. +- New `atomref.xh` convenience layer with `XHPolicy`, `DEFAULT_XH_POLICY`, set + listing helpers, and X–H lookup helpers. + +### Documentation + +- Added X–H dataset and API pages. +- Documented the provisional scope of X–H support in `0.1.x` and the planned + broader follow-up in `0.2.x`. + +## 0.1.1 - 2026-03-15 + +### Added + +- Public generic lookup helpers `lookup_value(...)` and `get_value(...)`. +- Tests for alias normalization, immutable metadata, non-finite-value rejection, + collision detection, and explicit placeholder notes. + +### Changed + +- Registry metadata returned by `get_dataset_info(...)` is now frozen so callers + cannot mutate the cached registry state. +- Dataset-alias resolution now normalizes Unicode and dash variants more + robustly. +- Custom-set construction and policy configuration now reject normalized-key + collisions and non-finite numeric values. +- Radii-specific wrappers now reject negative override and fallback values. +- Base and substitution lookups now emit explicit placeholder notes when the + returned numeric value is a dataset placeholder. +- `LinearTransfer` now validates empty-predictor and invalid-`min_points` + configurations eagerly. +- The docs now explain the distinction between quantity, domain, dataset, and + policy, and clarify that the current runtime supports only the `element` + domain. + +## 0.1.0 - 2026-03-15 + +First public release. + +### Added + +- Packaged element metadata and curated radii tables. +- Quantity-aware registry metadata that separates operational lookup quantity + from scientific classification and dataset usage role. 
+- Provenance-aware radii policies with deterministic resolution order. +- Substitution and linear-transfer support for restoring missing values from + curated support datasets. +- Public helpers for inspecting quantities, dataset metadata, and packaged + built-in sets. +- Runnable notebooks together with generated Markdown notebook pages in the + documentation. +- Validation and maintenance tools for registry checks, notebook export, README + generation, and distribution-artifact inspection. + +### Documentation + +- Expanded dataset guides with citations and selection-oriented descriptions. +- Added module-level API pages and notebook walkthroughs. +- Added developer-facing curation and tooling notes. + +### Packaging + +- Built and validated wheel and source-distribution artifacts. +- Added CI coverage for linting, tests, docs builds, notebook sync, and + distribution checks. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/DEV_PLAN.md b/DEV_PLAN.md new file mode 100644 index 0000000..94cdaac --- /dev/null +++ b/DEV_PLAN.md @@ -0,0 +1,33 @@ +# Development plan + +## Current status (implemented in the `0.1.x` line) + +- stable element metadata +- curated covalent, van der Waals, and atomic-radius support datasets +- explicit provenance and coverage metadata +- generic value-policy core plus radii and X–H convenience wrappers +- substitution and linear transfer +- custom element-indexed scalar sets +- policy-backed transfer sources +- nested-policy safeguards, transfer-depth tracking, and cycle detection +- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and + `DEFAULT_XH_POLICY` + +## Planned for `0.2.x` + +- broader X–H datasets and policies +- experimental plus computational support sets +- pairwise helper logic such as reference sums and normalization schemes +- restoration of incomplete experimental data from broader-support predictors + +## Longer-term design ideas + +- radial atomic reference functions +- simple proto-density support based on spherically averaged atomic data + +## Possible future directions + +- more radii sets +- uncertainty and confidence flags +- ion-specific or atom-type-specific domains +- density-derived radii and related reference transforms diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a04128 --- /dev/null +++ b/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. 
+ + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. 
+ + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. 
+ + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. 
+ + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. 
diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 0000000..01f1cf1 --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,12 @@ +# atomref + +atomref is a Python library for curated atomic reference data and transfer +policies for geometry and structure-analysis algorithms. + +Copyright (c) 2026 Ivan Chernyshov +License: LGPL-3.0-or-later (see LICENSE and COPYING) + +## Third-party material + +The initial scaffold reuses and adapts data tables and design ideas from the +Delone Commons `molcryst` repository, also authored by Ivan Chernyshov. diff --git a/README.md b/README.md new file mode 100644 index 0000000..869aace --- /dev/null +++ b/README.md @@ -0,0 +1,173 @@ +# atomref + +[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml) +[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml) +[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/) +[![Python Versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/) +[![License](https://img.shields.io/pypi/l/atomref.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE) + +`atomref` is a small pure-Python package for **curated atomic reference data** +and **provenance-aware lookup policies** used by geometry and +structure-analysis algorithms. + +It is not meant to be yet another periodic-table encyclopedia. The package is +for code that needs stable atomic reference values with explicit provenance, +clear fallback behavior, and honest handling of incomplete preferred datasets. 
+ +What you get in the current release line: + +- stable element metadata, +- curated named radii sets, +- provisional X–H bond-length support for hydrogen-normalisation workflows, +- dataset provenance and coverage metadata, +- deterministic lookup policies, +- substitution and linear transfer from support datasets or policies into target datasets, +- guarded nested policy-backed transfers with explicit transfer depth, + conservative fit/prediction controls, and cycle detection, +- user-defined custom element-indexed scalar sets. + +## Core terms + +`atomref` uses a small vocabulary on purpose. + +- **quantity** — the operational property family being requested, such as + `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or + `xh_bond_length`. +- **domain** — the key space used to index that quantity. In the current + runtime, the supported domain is `element`, meaning lookups are keyed by an + element symbol. +- **dataset** — one curated named table inside a quantity, such as + `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`. +- **policy** — the ordered rule set that decides what value to return when the + preferred dataset is incomplete. + +The metadata layer already records `domain` explicitly because the package is +built for later extension, but the current runtime intentionally keeps the +implementation narrow and stable: **the current runtime resolves only +element-domain scalar values**. + +## Why this exists + +Scientific software often wants a complete lookup table, but the best dataset +for the job is rarely complete. `atomref` makes that situation explicit. +Instead of hiding ad hoc defaults inside algorithm code, you choose a target +set, describe how missing values may be restored, and keep provenance on what +was actually returned. 
+ +The built-in default behavior is intentionally simple and practical: + +- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target + set, with missing values substituted from the **legacy CSD covalent radii** + (`csd_legacy_cov`). +- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target + set, with missing values restored from the **Rahm isodensity atomic radii** + (`rahm2016`) through a fitted linear transfer. +- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a + provisional sparse X–H target set for `C`, `N`, and `O`, with other parent + elements inferred from **Cordero covalent radii** through a fitted linear + transfer. + +Nested policy predictors are supported too. `LinearTransfer` separates +**fit-time** use of nested predictor values from **prediction-time** use. By default, the fit may use only direct nested +values, while the final requested element may still use one additional +nested completion step. That is a useful compromise for workflows such as +provisional X–H inference from a chosen covalent-radii policy. + +## Quick example + +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> ar.get_xh_bond_length("N") +1.015 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.transfer_depth +1 +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) +``` + +`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also +records where the value came from, whether a transfer model or policy source was +involved, and how many transfer steps were needed (`transfer_depth`). 
+ +You can inspect the packaged quantity and dataset catalog directly: + +```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') +>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] +``` + +You can also load a packaged set directly: + +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 +``` + +## Notebook walkthroughs + +The repository ships example notebooks for the main workflows. In the +documentation they are also available as rendered Markdown pages, so users can +read them without opening Jupyter first. + +- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/) +- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/) +- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/) +- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/) + +## Relationship to Delone Commons + +`atomref` is designed as a standalone package, but within Delone Commons it is +primarily intended to support chemistry-aware packages such as: + +- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows, +- future `chemvoro`, for chemistry-aware contact and hydrogen workflows. 
+ +By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical +packages and are not direct consumers of `atomref`. + +## Data curation and developer tools + +The repository also ships small maintenance tools. The most important ones are: + +- `python tools/check_registry.py` — validate curated registry metadata against + packaged CSV tables, +- `python tools/check_notebooks.py` — execute notebook code cells, +- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for + the docs, +- `python tools/gen_readme.py` — regenerate `README.md` from this page, +- `python tools/release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. + +See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) +for a short description of each script. + +--- + +This README is generated from `docs/index.md`. + +To regenerate it: + +```bash +python tools/gen_readme.py +``` + +Edit the documentation sources instead of editing `README.md` directly. diff --git a/docs/api/atomref.md b/docs/api/atomref.md new file mode 100644 index 0000000..3536e34 --- /dev/null +++ b/docs/api/atomref.md @@ -0,0 +1,6 @@ +# atomref + +The top-level package re-exports the main user-facing API so that most code can +simply do `import atomref as ar`. + +::: atomref diff --git a/docs/api/elements.md b/docs/api/elements.md new file mode 100644 index 0000000..2f066c7 --- /dev/null +++ b/docs/api/elements.md @@ -0,0 +1,7 @@ +# atomref.elements + +Element identity is intentionally minimal in the current implementation: +atomic number, symbol, and name. The module also contains the canonicalization helpers used throughout the +package. + +::: atomref.elements diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..f56eb7c --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,33 @@ +# API + +The public API is small on purpose. 
+ +Most users will spend most of their time in the top-level package namespace and +in the quantity-specific convenience helpers. The lower-level modules are still +documented because they expose the actual data model behind the package. + +## Common tasks + +- get a single value: use `get_covalent_radius(...)`, `get_vdw_radius(...)`, or + `get_xh_bond_length(...)` +- inspect provenance: use `lookup_covalent_radius(...)`, + `lookup_vdw_radius(...)`, `lookup_xh_bond_length(...)`, or the generic + `lookup_value(...)` +- browse packaged datasets: use `list_quantities()`, `get_quantity_info(...)`, + `list_dataset_infos(...)`, `list_radii_set_infos(...)`, or + `list_xh_set_infos(...)` +- load a packaged set directly: use `get_builtin_set(...)`, `get_radii_set(...)`, + or `get_xh_set(...)` +- define a custom set: use `ElementScalarSet.from_mapping(...)` +- define transfer-backed lookup behavior: use `ValuePolicy`, `RadiiPolicy`, + `XHPolicy`, `SubstitutionTransfer`, and `LinearTransfer` + +## Module reference + +- [Top-level package](atomref.md) +- [Elements](elements.md) +- [Registry and packaged datasets](registry.md) +- [Transfer models](transfer.md) +- [Generic policy core](policy.md) +- [Radii API](radii.md) +- [X–H API](xh.md) diff --git a/docs/api/policy.md b/docs/api/policy.md new file mode 100644 index 0000000..29b4142 --- /dev/null +++ b/docs/api/policy.md @@ -0,0 +1,26 @@ +# atomref.policy + +This module contains the generic resolver that sits below the radii-specific and +X–H-specific convenience APIs. + +Use it when you want to work directly with the shared value-selection engine: + +- `ValuePolicy` — generic element-domain policy configuration, +- `lookup_value(...)` — resolve one value together with provenance, +- `get_value(...)` — resolve only the numeric value, +- `LookupResult` — the structured result object returned by the resolver. + +A few practical notes: + +- The current runtime supports **element-domain** scalar policies. 
+- `ValuePolicy` normalizes element-symbol overrides eagerly. +- Transfer sources may be packaged datasets, custom sets, generic policies, or + wrapper policies that expose `as_value_policy()`. +- `LookupResult.is_placeholder` refers to the returned numeric value itself, not + to whether any transfer happened. +- `LookupResult.transfer_depth` counts how many transfer steps were involved in + the returned numeric value. +- Nested lookup is cycle-checked across both generic `ValuePolicy` objects and + wrapper policies such as `RadiiPolicy` and `XHPolicy`. + +::: atomref.policy diff --git a/docs/api/radii.md b/docs/api/radii.md new file mode 100644 index 0000000..ff5e214 --- /dev/null +++ b/docs/api/radii.md @@ -0,0 +1,8 @@ +# atomref.radii + +This is the main user-facing module for radii workflows. + +It provides radii policies, packaged radii-set discovery, lookup helpers, and +policy-assessment reports. + +::: atomref.radii diff --git a/docs/api/registry.md b/docs/api/registry.md new file mode 100644 index 0000000..9c41653 --- /dev/null +++ b/docs/api/registry.md @@ -0,0 +1,19 @@ +# atomref.registry + +This module contains the packaged data model. + +If you want to understand how `atomref` classifies datasets, how aliases are +resolved, or how built-in CSV tables are turned into typed in-memory objects, +this is the key module to read. + +The most important registry ideas are: + +- **quantity** — the operational property family, +- **domain** — the key space used to index that quantity, +- **dataset** — one curated named table inside the quantity. + +In the current runtime, the implemented lookup domain is `element`. +The registry still stores `domain` explicitly because the metadata design is +meant to stay reusable as the package grows. 
+ +::: atomref.registry diff --git a/docs/api/transfer.md b/docs/api/transfer.md new file mode 100644 index 0000000..17e07ad --- /dev/null +++ b/docs/api/transfer.md @@ -0,0 +1,39 @@ +# atomref.transfer + +Transfer models describe how missing target values may be restored from other +sources. + +In the current runtime the built-in models are: + +- direct substitution (`SubstitutionTransfer`), +- one-predictor linear transfer (`LinearTransfer`). + +A transfer source may be: + +- a packaged dataset reference, +- a custom `ElementScalarSet`, +- a generic `ValuePolicy`, +- a wrapper policy that exposes `as_value_policy()`. + +`LinearTransfer` currently accepts exactly one predictor source at runtime, even +though the public API stores predictors as a tuple for forward compatibility. + +For policy-backed linear predictors, `LinearTransfer` separates two questions: + +- which nested predictor values may be used to **fit** the linear model + (`fit_sources`, `fit_max_depth`), and +- which nested predictor values may be used to **predict** the final requested + element (`prediction_sources`, `prediction_max_depth`). + +The defaults are intentionally conservative: + +- fit only on nested predictor values that came directly from `base` or + `override`, +- but allow one additional nested transfer step when evaluating the predictor + for the requested element. + +That default is meant for workflows such as a sparse X–H target set correlated +against a partial covalent-radii policy that is itself completed from a broader +support set. + +::: atomref.transfer diff --git a/docs/api/xh.md b/docs/api/xh.md new file mode 100644 index 0000000..f96db27 --- /dev/null +++ b/docs/api/xh.md @@ -0,0 +1,24 @@ +# atomref.xh + +This module provides the provisional X–H bond-length helpers available in the +current release line. 
+ +It is intentionally narrow: + +- one packaged sparse target dataset, `csd_legacy_xh_cno`, +- one wrapper policy, `XHPolicy`, +- convenience helpers for listing packaged X–H sets and resolving X–H values. + +The built-in quantity is keyed by the **parent element `X`** in `X–H` and is +currently aimed at hydrogen-position normalisation or related geometry +workflows. + +In the default policy: + +- `C`, `N`, and `O` use curated ConQuest/CSD defaults, +- other parent elements may be inferred from `cordero2008`, +- policy-backed predictors are supported as well, with conservative nested-fit + defaults and one additional nested prediction step allowed by default, +- fuller X–H literature support is planned for `0.2.x`. + +::: atomref.xh diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md new file mode 100644 index 0000000..2852b3e --- /dev/null +++ b/docs/datasets/atomic_radius.md @@ -0,0 +1,22 @@ +# Atomic radius + +The `atomic_radius` quantity exists to hold support datasets that are +scientifically useful but should not be presented as direct condensed-phase vdW +radii. + +## Rahm isodensity atomic radii (`rahm2016`) + +This is currently the only built-in atomic-radius dataset. + +- **What it is:** radii for isolated neutral atoms defined by the + ρ = 0.001 e/bohr³ electron-density isosurface. +- **Source idea:** a consistent theory-based atomic size measure derived from + computed electron densities. +- **Coverage:** broad, but not complete for the full periodic table. +- **Why it matters here:** it correlates well with structural vdW radii and is a + useful support baseline when a condensed-phase target set is incomplete. +- **How `atomref` uses it:** support-only dataset for linear transfer into + target vdW values such as `alvarez2013`. + +This is an important example of the package philosophy: a dataset can be very +useful algorithmically without being mislabeled as something it is not. 
diff --git a/docs/datasets/covalent_radius.md b/docs/datasets/covalent_radius.md new file mode 100644 index 0000000..5e022fd --- /dev/null +++ b/docs/datasets/covalent_radius.md @@ -0,0 +1,37 @@ +# Covalent radius + +The covalent-radius quantity is aimed at bond-detection and related geometry +workflows. It currently ships one preferred target dataset and one +legacy support dataset. + +## Cordero covalent radii (`cordero2008`) + +This is the main covalent-radius target set in the current release line. + +- **What it is:** a broad covalent-radius compilation based mainly on + crystallographic bond distances. +- **Why it matters:** it is a modern, widely used reference set for element-wise + covalent radii. +- **Coverage:** broad coverage across the periodic table, but not complete for + every element. +- **How `atomref` uses it:** direct target dataset for covalent-radius lookup. + +If you want one covalent set to start with, this is usually the right first +choice. + +## Legacy CSD covalent radii (`csd_legacy_cov`) + +This set reflects the older covalent radii historically used in CSD software for +bond perception. + +- **What it is:** a practical, legacy-oriented bond-assignment table. +- **Why it matters:** it has long been used in chemistry software and contains + placeholder conventions that are still relevant for compatibility work. +- **Coverage:** broad practical coverage, with explicit placeholder values for + elements not covered by the historical table. +- **How `atomref` uses it:** support dataset for substitution when the preferred + Cordero target set is missing a value. + +Because it contains legacy placeholders, it is not the preferred scientific +starting point. It is mainly useful as a support layer and for compatibility +with older workflows. 
diff --git a/docs/datasets/index.md b/docs/datasets/index.md new file mode 100644 index 0000000..d3b2951 --- /dev/null +++ b/docs/datasets/index.md @@ -0,0 +1,38 @@ +# Datasets + +`atomref` does not treat all datasets as interchangeable lookup tables. +Instead, the package records several layers of classification: + +- **quantity** — the operational property being requested, +- **domain** — the key space used to index that quantity, +- **semantic class** — what the dataset scientifically represents, +- **origin class** — how the values were obtained, +- **phase context** — what physical context they describe, +- **usage role** — whether the package treats the dataset as a direct target set + or as support data for transfer. + +This is what allows a dataset such as **Rahm isodensity atomic radii** +(`rahm2016`) to be useful in van der Waals workflows without pretending that it +is itself a condensed-phase structural vdW-radius set. + +## Programmatic inspection + +The most useful catalog helpers are: + +- `atomref.list_quantities()` +- `atomref.get_quantity_info(...)` +- `atomref.list_dataset_infos(...)` +- `atomref.list_radii_set_infos(...)` +- `atomref.list_xh_set_infos(...)` + +If you only need dataset ids, use `list_dataset_ids(...)`, `list_radii_sets(...)`, +or `list_xh_sets(...)`. +If you want the packaged values themselves, use `get_builtin_set(...)`, +`get_radii_set(...)`, or `get_xh_set(...)`. + +## Built-in quantity families + +- [Covalent radius](covalent_radius.md) +- [van der Waals radius](van_der_waals_radius.md) +- [Atomic radius](atomic_radius.md) +- [X–H bond length](xh_bond_length.md) diff --git a/docs/datasets/van_der_waals_radius.md b/docs/datasets/van_der_waals_radius.md new file mode 100644 index 0000000..3013d57 --- /dev/null +++ b/docs/datasets/van_der_waals_radius.md @@ -0,0 +1,57 @@ +# van der Waals radius + +The van der Waals quantity intentionally includes several target sets with +different scientific backgrounds. 
This lets users choose between a classic +historical compilation, structural contact-derived sets, and compatibility-only +legacy tables. + +## Bondi van der Waals radii (`bondi1964`) + +A classic historical reference set compiled from mixed experimental sources. + +- **What it is:** the traditional Bondi vdW table used throughout chemistry. +- **Coverage:** limited, especially for transition metals and heavier elements. +- **Why you might use it:** historical consistency or comparison with older + literature and software defaults. + +## Rowland & Taylor nonbonded-contact radii (`rowland_taylor1996`) + +A small but influential structural set derived from organic-crystal nonbonded +contacts. + +- **What it is:** a condensed-phase structural vdW set focused on common organic + elements. +- **Coverage:** intentionally narrow. +- **Why you might use it:** organic-crystal contact analysis and comparisons to + classic contact-distance literature. + +## Alvarez van der Waals radii (`alvarez2013`) + +This is the main van der Waals target set in the current release line. + +- **What it is:** a broad structural vdW set derived from statistical analysis + of many interatomic distances in the Cambridge Structural Database. +- **Coverage:** broad, but still incomplete for some elements. +- **Why you might use it:** it is a strong default for general condensed-phase + geometry and contact work. +- **How `atomref` uses it:** direct target set for vdW lookup, with missing + values restored from support data when requested by policy. + +## Chernyshov line-of-sight vdW radii (`chernyshov2020`) + +A reduced element-wise view of a more atom-type-aware structural analysis. + +- **What it is:** vdW radii inferred from line-of-sight contact classification. +- **Coverage:** focused on elements common in molecular crystals. +- **Why you might use it:** you want a contact-derived set informed by the LoS + idea while still using a simple element-wise API. 
+ +## Legacy CSD van der Waals radii (`csd_legacy_vdw`) + +A compatibility-oriented table used historically in CSD tools. + +- **What it is:** an older practical vdW table with placeholder conventions. +- **Coverage:** broad practical coverage, but not a modern scientific target + set. +- **How `atomref` uses it:** support-only data for legacy compatibility and + future migration work. diff --git a/docs/datasets/xh_bond_length.md b/docs/datasets/xh_bond_length.md new file mode 100644 index 0000000..28364c5 --- /dev/null +++ b/docs/datasets/xh_bond_length.md @@ -0,0 +1,39 @@ +# X–H bond length + +The `xh_bond_length` quantity is a small provisional addition in the current +release line. + +Its purpose is not to claim a complete literature survey of X–H bond lengths. +Instead, it provides a stable, provenance-aware starting point for +hydrogen-normalisation workflows and related geometry code. + +## Packaged target dataset + +### CSD legacy X–H neutron-normalisation targets (`csd_legacy_xh_cno`) + +- **What it is:** the fixed `C–H`, `N–H`, and `O–H` target lengths used by + ConQuest for terminal-hydrogen normalisation. +- **Coverage:** only parent elements `C`, `N`, and `O`. +- **Values:** `C–H = 1.089 Å`, `N–H = 1.015 Å`, `O–H = 0.993 Å`. +- **Primary provenance:** the ConQuest user guide section *Hydrogen Atom + Location in Crystal Structure Analyses*. +- **Secondary provenance:** Allen & Bruno (2010), which the ConQuest guide cites + for these defaults. + +## How `atomref` uses it + +The built-in `DEFAULT_XH_POLICY` treats `csd_legacy_xh_cno` as a sparse target +set and restores missing parent elements through a fitted linear transfer from +`cordero2008` covalent radii. + +That means the package draws a sharp line between: + +- **curated dataset values** — currently only `C`, `N`, and `O`, and +- **policy-generated values** — inferred for other parent elements when the + predictor policy can supply a covalent radius. 
+ +## Scope note + +This is intentionally a small addendum rather than full X–H support. +Broader X–H datasets, richer policies, and more complete literature treatment +are planned for `0.2.x`. diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md new file mode 100644 index 0000000..680b755 --- /dev/null +++ b/docs/dev/architecture.md @@ -0,0 +1,109 @@ +# Architecture + +Publicly, `atomref` is still radii-first, with a small provisional X–H helper. + +Internally, the package is built around four layers: + +1. **elements** — stable element metadata and symbol canonicalization, +2. **registry** — curated quantity and dataset metadata plus packaged data + loading, +3. **policy core** — generic value selection with overrides, transfers, + fallbacks, blocked keys, and provenance, +4. **quantity wrappers** — convenience APIs such as `atomref.radii` and + `atomref.xh`. + +## Core terminology + +A few terms are deliberately separated in the design: + +- **quantity** — the operational property family being requested, +- **domain** — the key space used to index that quantity, +- **dataset** — one curated source table inside the quantity, +- **policy** — the ordered rule set used to select a final value. + +This separation is what allows the package to say, for example, that +`rahm2016` belongs to the `atomic_radius` quantity but can still act as support +data in a van der Waals policy. + +## Domain support in the current runtime + +The registry schema is domain-aware, but the current resolver intentionally +implements only one domain: + +- `element` + +That means: + +- packaged built-in sets are currently element-indexed scalar tables, +- `ValuePolicy` resolves element symbols, +- transfer fitting is performed over element-wise overlap. + +The metadata keeps `domain` explicit now so later versions can extend the data +model without having to reinterpret existing registry entries. 
+ +## Policy resolution and transfer sources + +The generic resolver works in a fixed order: + +1. blocked keys, +2. overrides, +3. base dataset, +4. transfer models, +5. fallback, +6. missing. + +Transfer sources can be: + +- packaged datasets, +- custom `ElementScalarSet` objects, +- generic `ValuePolicy` objects, +- wrapper policies exposing `as_value_policy()`. + +That last point is important. It means higher-level code can express +"infer values from my chosen covalent-radii policy" instead of being forced to +refer to one hard-coded predictor dataset. + +## Nested-policy safeguards and cycle detection + +Policy-backed transfer sources are materialized with more than just raw numeric +values. The resolver also tracks, per element: + +- whether the value came from `base`, `override`, substitution, linear transfer, + or fallback, +- the nested transfer depth that was required to produce it, +- placeholder status. + +`LinearTransfer` uses that information twice: + +- once when fitting the linear relation (`fit_sources` / `fit_max_depth`), +- again when deciding whether the predictor value for the requested element is + admissible (`prediction_sources` / `prediction_max_depth`). + +The default policy is intentionally conservative: fit only on direct nested +predictor values, but allow one additional nested completion step when +predicting the final requested element. This keeps the common two-stage use case +possible without silently training on arbitrarily long inference chains. + +Cycle detection is handled with a context-local activation stack. Both generic +`ValuePolicy` objects and wrapper policies are tracked, so recursion through a +freshly materialized wrapper policy is still detected reliably and safely. + +## Placeholder handling + +Placeholder semantics stay attached to the value that was actually returned. 
+This means `LookupResult.is_placeholder` can be true for: + +- a base lookup, +- a substitution transfer, +- a nested policy used as a transfer source. + +A linear transfer normally returns a computed value and therefore does not carry +placeholder status itself. Instead, its provenance is carried by +`resolved_from`, explanatory notes, and `transfer_depth`. + +## Why the design stays small + +The package deliberately avoids a large object graph or a chemistry-specific DSL. +A quantity wrapper is usually only a thin adapter over the generic policy core. +That keeps the internals easy to test and lets other scientific packages reuse +`atomref` without bringing in the rest of the Delone Commons stack. diff --git a/docs/dev/data_curation.md b/docs/dev/data_curation.md new file mode 100644 index 0000000..689ae24 --- /dev/null +++ b/docs/dev/data_curation.md @@ -0,0 +1,26 @@ +# Data curation + +Packaged tables are stored as CSV files indexed by atomic number. Dataset +metadata and provenance live in `src/atomref/data/registry.json`. + +Placeholder values are modeled as dataset metadata, not as hard-coded Python +constants. + +The registry distinguishes several orthogonal concerns: + +- `quantity` — the operational lookup target, such as `covalent_radius` or + `van_der_waals_radius` +- `semantic_class` — what the dataset scientifically represents +- `usage_role` — whether the dataset is intended as a direct target set or as + support data for transfer +- `phase_context` — the physical context of the underlying values + +This matters for support-only datasets such as `atomic_radius:rahm2016`, which +is packaged as atomic support data and then used by the default van der Waals +policy through linear transfer. 
+ +To check that metadata and packaged tables stay synchronized, run: + +```bash +python tools/check_registry.py +``` diff --git a/docs/dev/dev_plan.md b/docs/dev/dev_plan.md new file mode 100644 index 0000000..94cdaac --- /dev/null +++ b/docs/dev/dev_plan.md @@ -0,0 +1,33 @@ +# Development plan + +## Current status (implemented in the `0.1.x` line) + +- stable element metadata +- curated covalent, van der Waals, and atomic-radius support datasets +- explicit provenance and coverage metadata +- generic value-policy core plus radii and X–H convenience wrappers +- substitution and linear transfer +- custom element-indexed scalar sets +- policy-backed transfer sources +- nested-policy safeguards, transfer-depth tracking, and cycle detection +- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and + `DEFAULT_XH_POLICY` + +## Planned for `0.2.x` + +- broader X–H datasets and policies +- experimental plus computational support sets +- pairwise helper logic such as reference sums and normalization schemes +- restoration of incomplete experimental data from broader-support predictors + +## Longer-term design ideas + +- radial atomic reference functions +- simple proto-density support based on spherically averaged atomic data + +## Possible future directions + +- more radii sets +- uncertainty and confidence flags +- ion-specific or atom-type-specific domains +- density-derived radii and related reference transforms diff --git a/docs/guide/custom_sets.md b/docs/guide/custom_sets.md new file mode 100644 index 0000000..71306bb --- /dev/null +++ b/docs/guide/custom_sets.md @@ -0,0 +1,31 @@ +# Custom sets + +`atomref` is not limited to the packaged tables. You can build a small +user-defined element-indexed scalar dataset and use it as a base dataset or as a +support dataset inside a transfer-backed policy. + +The simplest entry point is `ElementScalarSet.from_mapping(...)`. 
+ +```python +from atomref import DatasetRef, ElementScalarSet, RadiiPolicy + +custom = ElementScalarSet.from_mapping( + ref=DatasetRef("covalent_radius", "my_cov"), + values={"C": 0.75, "H": 0.31}, + name="My custom covalent radii", + units="angstrom", +) + +policy = RadiiPolicy(kind="covalent", base_set=custom) +``` + +This is useful when you want to: + +- test an alternative reference table, +- pin a small project-specific dataset without creating a full package fork, +- combine a user dataset with built-in support data through substitution or + linear transfer. + +In the current implementation custom sets are element-domain scalar datasets, +which keeps the data model small and stable. Later versions may add more specialized domains, but +custom element-wise sets are already enough for many geometry workflows. diff --git a/docs/guide/install.md b/docs/guide/install.md new file mode 100644 index 0000000..e7e0697 --- /dev/null +++ b/docs/guide/install.md @@ -0,0 +1,30 @@ +# Install + +For normal use, install the runtime package: + +```bash +pip install atomref +``` + +`atomref` is pure Python and has no required runtime dependencies outside the +standard library. + +For local development, documentation work, and tests, install the editable +package together with the main extras: + +```bash +pip install -e ".[test,docs,dev]" +``` + +Those extras currently cover: + +- `test` — pytest and test-only compatibility helpers, +- `docs` — MkDocs and API documentation tooling, +- `dev` — flake8, build, and release metadata checks. + + +For a full local pre-release validation pass after installing those extras, run: + +```bash +python tools/release_check.py +``` diff --git a/docs/guide/non_goals.md b/docs/guide/non_goals.md new file mode 100644 index 0000000..b38aa68 --- /dev/null +++ b/docs/guide/non_goals.md @@ -0,0 +1,23 @@ +# Non-goals + +`atomref` is intentionally narrow. 
+ +It is **not** trying to be: + +- a general periodic-table encyclopedia, +- a home for arbitrary atomic or chemical properties, +- a structure parser, +- a crystallographic symmetry package, +- a structure-inference engine, +- a Voronoi / tessellation library, +- an environment-specific chemistry model, +- a machine-learning framework for extrapolating unseen chemistry. + +The package is about **curated reference data and explicit lookup policies**. +That includes provenance, transfer from broader support datasets, and stable API +surfaces that higher-level scientific code can rely on. + +Future versions may widen the range of supported *reference-data families* — for +example X–H distances or radial atomic reference functions — but the package +should still remain a small reference-data layer rather than a full chemistry +platform. diff --git a/docs/guide/notebooks.md b/docs/guide/notebooks.md new file mode 100644 index 0000000..2ad0045 --- /dev/null +++ b/docs/guide/notebooks.md @@ -0,0 +1,25 @@ +# Notebook gallery + +`atomref` ships example Jupyter notebooks that cover the main workflows. +Each notebook is available in two forms: + +- the original `.ipynb` file in the repository, +- a rendered Markdown copy included in these docs. + +That way users can either run the notebooks locally or read them directly on the +documentation site. + +## Available notebooks + +- [Quickstart notebook](../notebooks/01-quickstart.md) — basic imports, + `get_*` vs `lookup_*`, quantity discovery, and packaged-set access. +- [Policies and assessment notebook](../notebooks/02-policies-and-assessment.md) + — overrides, transfer-backed policies, and policy summaries. +- [Custom sets and discovery notebook](../notebooks/03-custom-sets-and-discovery.md) + — user-defined sets, catalog inspection, and metadata exploration. 
+ +The original notebook files are also in the repository: + +- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) diff --git a/docs/guide/policies.md b/docs/guide/policies.md new file mode 100644 index 0000000..b9e3b7a --- /dev/null +++ b/docs/guide/policies.md @@ -0,0 +1,222 @@ +# Policies + +A policy tells `atomref` how to answer the question “what value should I use for +this element?” + +That may sound simple, but in practice scientific datasets are often +incomplete. A policy makes the decision process explicit instead of hiding it in +algorithm code. + +## Terms used in the policy layer + +A few terms appear repeatedly in the API and docs: + +- **quantity** — the operational property family being requested. +- **domain** — the lookup key space. In the current runtime that means + `element`, so lookups are keyed by element symbol. +- **dataset** — a curated named table inside one quantity. +- **policy** — the ordered rule set used to resolve missing values. + +The quantity and dataset live in the curated registry. The policy is the +selection logic that sits on top of them. + +## Resolution order + +In the current implementation every lookup follows the same ordered path: + +1. **Blocked key** (optional) +2. **Override** +3. **Base dataset** +4. **Transfer models**, in the order you listed them +5. **Fallback** +6. **Missing** + +Each step has a specific meaning. + +### Blocked key + +Some quantity wrappers need to declare that certain domain keys should never be +resolved, even if a transfer model could otherwise invent a number. 
The current +X–H helper uses this for `H`, because `xh_bond_length` is keyed by the parent +atom `X` in `X–H`, not by hydrogen itself. + +### Override + +An override is a value you provide directly for a specific element. It wins over +everything else and is useful when you want to pin one or two elements without +changing the whole dataset. + +### Base dataset + +The base dataset is the preferred source. For example, the default covalent +policy starts from the **Cordero covalent radii** (`cordero2008`), and the +default vdW policy starts from the **Alvarez van der Waals radii** +(`alvarez2013`). + +### Transfer + +A transfer model is used only when the base dataset has no value for the +requested element. + +Built-in transfer models are: + +- `SubstitutionTransfer` — take a value directly from another dataset or policy, +- `LinearTransfer` — infer a target-equivalent value from another dataset or + policy through a fitted linear model. + +`LinearTransfer` already accepts a tuple of predictors in the API, but the +current runtime intentionally supports exactly one predictor source. That keeps +the implementation simple now while leaving room for later multi-predictor +linear models. + +Transfer sources can be: + +- a packaged dataset reference (`DatasetRef`), +- a custom `ElementScalarSet`, +- a generic `ValuePolicy`, +- a wrapper policy such as `RadiiPolicy` or `XHPolicy`. + +When a transfer source is itself a policy, `atomref` uses the values selected by +that policy. This lets higher-level workflows express things like “infer X–H +lengths from my chosen covalent-radii policy” instead of hard-coding a specific +support dataset. + +#### Nested policy safeguards for `LinearTransfer` + +When a predictor source is itself a policy, two different questions matter: + +1. Which nested predictor values are trustworthy enough to train the linear fit? +2. Which nested predictor value is acceptable for the final requested element? 
+ +`atomref` keeps those two decisions separate. By default: + +- `fit_sources=("base", "override")` and `fit_max_depth=0`, +- `prediction_sources=("base", "override", "transfer_substitution", "transfer_linear")` + and `prediction_max_depth=1`. + +That means the fitted relationship is trained only on direct predictor values by +default, while one additional nested completion step is still allowed at +prediction time. + +This is a good default for workflows such as: + +- sparse target X–H data from `csd_legacy_xh_cno`, +- a partial covalent-radii predictor policy with direct `s,p` values, +- one inner transfer from a broader support set such as `cordero2008` to make + the predictor usable for `d` or `f` elements. + +In that setup, the outer X–H fit still uses direct predictor anchors, while the +final requested element may use one nested predictor transfer. If you really do +want fit training to use nested predictor values as well, you can opt in +explicitly by widening `fit_sources` and/or increasing `fit_max_depth`. + +### Fallback + +A fallback is a constant last-resort value. It is useful when an algorithm must +receive *some* number even if both the base dataset and transfer sources are +missing a value. + +### Missing + +If nothing above can produce a value and no fallback was configured, the result +is simply missing. In that case `get_*` returns `None`, while `lookup_*` +returns a `LookupResult` with `source="missing"` and explanatory notes. + +## Placeholder values and `is_placeholder` + +Some support datasets use placeholder numbers to stand in for “unknown but keep +this legacy table dense enough for downstream heuristics”. + +`LookupResult.is_placeholder` answers one narrow question: + +> Is the **returned numeric value itself** marked as a placeholder by the source +> that supplied it? + +It does **not** mean “a transfer happened”. 
Examples: + +- a base lookup can have `is_placeholder=True` if the base dataset contains a + placeholder value, +- a substitution transfer can also have `is_placeholder=True` if it copied a + placeholder from the transfer source, +- a linear transfer is computed, not copied, so `is_placeholder` is normally + `False`. + +## Transfer depth and cycle detection + +`LookupResult.transfer_depth` counts how many transfer steps were needed to +produce the returned value: + +- direct base and override values have depth `0`, +- one substitution or linear restoration has depth `1`, +- nested transfer chains increase the depth further. + +This makes nested-policy behavior inspectable without trying to infer it from +notes alone. + +Because policies may now depend on other policies, the resolver also performs +cycle detection. A cyclic reference such as policy A depending on policy B while +policy B depends back on policy A raises `PolicyError` instead of recurring +indefinitely. The same protection applies when recursion goes through wrapper +policies such as `RadiiPolicy` or `XHPolicy`. + +## Target datasets and support datasets + +`atomref` separates **what a dataset is used for** from **what it scientifically +represents**. + +That is why the package stores: + +- the operational **quantity**, +- the lookup **domain**, +- the scientific **semantic class**, +- the package-level **usage role**. + +This distinction matters for datasets such as **Rahm isodensity atomic radii** +(`rahm2016`). They are useful support data for restoring missing van der Waals +radii, but they are not the same thing as a condensed-phase structural vdW +radius set. In `atomref`, that difference is recorded in the metadata instead of +being hidden. 
+ +## Examples + +A standard dataset-backed transfer: + +```python +import atomref as ar + +policy = ar.RadiiPolicy( + kind="van_der_waals", + base_set="alvarez2013", + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef("atomic_radius", "rahm2016"),), + ), + ), + overrides={"Xe": 2.10}, +) +``` + +A policy-backed transfer source: + +```python +import atomref as ar + +xh_policy = ar.XHPolicy( + base_set="csd_legacy_xh_cno", + transfers=( + ar.LinearTransfer( + predictors=(ar.DEFAULT_COVALENT_POLICY,), + min_points=3, + ), + ), +) +``` + +With that X–H policy: + +- `C`, `N`, and `O` use the curated ConQuest defaults, +- missing parent elements may be inferred from the **selected covalent-radii + policy**, not just from one hard-coded support dataset, +- if the predictor policy itself needed a transfer to produce a covalent radius, + the resulting `LookupResult` still records that provenance in `resolved_from`, + `notes`, and `transfer_depth`. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md new file mode 100644 index 0000000..72e6858 --- /dev/null +++ b/docs/guide/quickstart.md @@ -0,0 +1,61 @@ +# Quickstart + +The two most important user-facing ideas in `atomref` are: + +- `get_*` returns only the selected number, +- `lookup_*` returns the number **and** provenance metadata. + +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> ar.get_xh_bond_length("N") +1.015 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) +``` + +Use `get_*` when you only need the value. Use `lookup_*` when you want to know +whether the result came from the preferred dataset, a support dataset, a policy +override, or a fallback. 
+ +You can inspect the packaged quantity layer directly: + +```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') +>>> [info.ref.set_id for info in ar.list_radii_set_infos("van_der_waals", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] +``` + +And you can load a packaged set object directly: + +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +>>> raw.get("Pm") +2.83 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 +``` + +For longer, runnable examples see: + +- the [notebook overview](notebooks.md), +- the [quickstart notebook page](../notebooks/01-quickstart.md), +- the [policies notebook page](../notebooks/02-policies-and-assessment.md), +- the [custom sets notebook page](../notebooks/03-custom-sets-and-discovery.md). 
diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..198fa6a --- /dev/null +++ b/docs/index.md @@ -0,0 +1,161 @@ +# atomref + +[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml) +[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml) +[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/) +[![Python Versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/) +[![License](https://img.shields.io/pypi/l/atomref.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE) + +`atomref` is a small pure-Python package for **curated atomic reference data** +and **provenance-aware lookup policies** used by geometry and +structure-analysis algorithms. + +It is not meant to be yet another periodic-table encyclopedia. The package is +for code that needs stable atomic reference values with explicit provenance, +clear fallback behavior, and honest handling of incomplete preferred datasets. + +What you get in the current release line: + +- stable element metadata, +- curated named radii sets, +- provisional X–H bond-length support for hydrogen-normalisation workflows, +- dataset provenance and coverage metadata, +- deterministic lookup policies, +- substitution and linear transfer from support datasets or policies into target datasets, +- guarded nested policy-backed transfers with explicit transfer depth, + conservative fit/prediction controls, and cycle detection, +- user-defined custom element-indexed scalar sets. + +## Core terms + +`atomref` uses a small vocabulary on purpose. + +- **quantity** — the operational property family being requested, such as + `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or + `xh_bond_length`. 
+- **domain** — the key space used to index that quantity. In the current
+  runtime, the supported domain is `element`, meaning lookups are keyed by an
+  element symbol.
+- **dataset** — one curated named table inside a quantity, such as
+  `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`.
+- **policy** — the ordered rule set that decides what value to return when the
+  preferred dataset is incomplete.
+
+The metadata layer already records `domain` explicitly because the package is
+built for later extension, but the current runtime intentionally keeps the
+implementation narrow and stable: **it resolves only element-domain scalar
+values**.
+
+## Why this exists
+
+Scientific software often wants a complete lookup table, but the best dataset
+for the job is rarely complete. `atomref` makes that situation explicit.
+Instead of hiding ad hoc defaults inside algorithm code, you choose a target
+set, describe how missing values may be restored, and keep provenance on what
+was actually returned.
+
+The built-in default behavior is intentionally simple and practical:
+
+- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target
+  set, with missing values substituted from the **legacy CSD covalent radii**
+  (`csd_legacy_cov`).
+- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target
+  set, with missing values restored from the **Rahm isodensity atomic radii**
+  (`rahm2016`) through a fitted linear transfer.
+- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a
+  provisional sparse X–H target set for `C`, `N`, and `O`, with other parent
+  elements inferred from **Cordero covalent radii** through a fitted linear
+  transfer.
+
+Nested policy predictors are supported too. `LinearTransfer` separates
+**fit-time** use of nested predictor values from **prediction-time** use. 
By default, the fit may use only direct nested +values, while the final requested element may still use one additional +nested completion step. That is a useful compromise for workflows such as +provisional X–H inference from a chosen covalent-radii policy. + +## Quick example + +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> ar.get_xh_bond_length("N") +1.015 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.transfer_depth +1 +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) +``` + +`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also +records where the value came from, whether a transfer model or policy source was +involved, and how many transfer steps were needed (`transfer_depth`). + +You can inspect the packaged quantity and dataset catalog directly: + +```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') +>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] +``` + +You can also load a packaged set directly: + +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 +``` + +## Notebook walkthroughs + +The repository ships example notebooks for the main workflows. 
In the +documentation they are also available as rendered Markdown pages, so users can +read them without opening Jupyter first. + +- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/) +- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/) +- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/) +- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/) + +## Relationship to Delone Commons + +`atomref` is designed as a standalone package, but within Delone Commons it is +primarily intended to support chemistry-aware packages such as: + +- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows, +- future `chemvoro`, for chemistry-aware contact and hydrogen workflows. + +By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical +packages and are not direct consumers of `atomref`. + +## Data curation and developer tools + +The repository also ships small maintenance tools. The most important ones are: + +- `python tools/check_registry.py` — validate curated registry metadata against + packaged CSV tables, +- `python tools/check_notebooks.py` — execute notebook code cells, +- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for + the docs, +- `python tools/gen_readme.py` — regenerate `README.md` from this page, +- `python tools/release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. + +See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) +for a short description of each script. 
diff --git a/docs/notebooks/01-quickstart.md b/docs/notebooks/01-quickstart.md new file mode 100644 index 0000000..12e8813 --- /dev/null +++ b/docs/notebooks/01-quickstart.md @@ -0,0 +1,72 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +# atomref quickstart + +This notebook covers the main public API: element helpers, direct +`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset +discovery. +```python +import atomref as ar + +print(ar.get_element('Cl')) +print(ar.list_quantities()) +``` +**Output** +```text +Element(z=17, symbol='Cl', name='Chlorine') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +``` +```python +r_c = ar.get_covalent_radius('C') +r_vdw = ar.get_vdw_radius('O') +print(r_c) +print(r_vdw) +assert r_c == 0.76 +assert r_vdw == 1.50 +``` +**Output** +```text +0.76 +1.5 +``` +```python +lookup = ar.lookup_vdw_radius('Pm') +print(f"{lookup.value:.12f}") +print(lookup.source) +print(lookup.resolved_from) +assert lookup.source == 'transfer_linear' +``` +**Output** +```text +2.897226539515 +transfer_linear +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) +``` +```python +quantity = ar.get_quantity_info('atomic_radius') +print(quantity.quantity, quantity.domain, quantity.units) + +for info in ar.list_dataset_infos('van_der_waals_radius', usage_role='target'): + print(info.ref.set_id, info.name, info.usage_role) +``` +**Output** +```text +atomic_radius element angstrom +bondi1964 Bondi van der Waals radii target +rowland_taylor1996 Rowland & Taylor nonbonded contact radii target +alvarez2013 Alvarez van der Waals radii target +chernyshov2020 Chernyshov LoS van der Waals radii target +``` +```python +vdw = ar.get_radii_set('van_der_waals', 'alvarez2013') +print(vdw.get('O')) + +support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016')) +print(support.get('Pm')) +``` +**Output** +```text +1.5 +2.83 +``` 
diff --git a/docs/notebooks/02-policies-and-assessment.md b/docs/notebooks/02-policies-and-assessment.md new file mode 100644 index 0000000..4f6baf6 --- /dev/null +++ b/docs/notebooks/02-policies-and-assessment.md @@ -0,0 +1,73 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +# Policies and assessment + +This notebook shows how `atomref` resolves missing values through ordered +policy steps and how to inspect policy-level behavior. +```python +import atomref as ar +``` +```python +covalent_policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + transfers=( + ar.SubstitutionTransfer( + source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov') + ), + ), +) +lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy) +print(lookup.source) +print(f"{lookup.value:.12f}") +print(lookup.resolved_from) +``` +**Output** +```text +transfer_substitution +1.540000000000 +(DatasetRef(quantity='covalent_radius', set_id='csd_legacy_cov'),) +``` +```python +vdw_policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),) + ), + ), +) +lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy) +print(f"{lookup.value:.12f}") +print(lookup.source) +print( + f"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}" +) +``` +**Output** +```text +2.897226539515 +transfer_linear +slope=1.135336645553 intercept=-0.315776167399 n=90 +``` +```python +assessment = ar.assess_radii_policy( + ['C', 'Xe', 'Pm', 'Bk'], + policy=vdw_policy, + detail=True, +) +print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing) +for row in assessment.per_element: + value = 'None' if row.lookup.value is None else f"{row.lookup.value:.12f}" + print(row.symbol, row.lookup.source, value) +``` +**Output** +```text +3 1 0 +C base 
1.770000000000 +Xe base 2.060000000000 +Pm transfer_linear 2.897226539515 +Bk base 3.400000000000 +``` diff --git a/docs/notebooks/03-custom-sets-and-discovery.md b/docs/notebooks/03-custom-sets-and-discovery.md new file mode 100644 index 0000000..47138bf --- /dev/null +++ b/docs/notebooks/03-custom-sets-and-discovery.md @@ -0,0 +1,56 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) +# Custom sets and dataset discovery + +This notebook shows how to define a small user-provided set, plug it into a +policy, and inspect the packaged dataset catalog. +```python +import atomref as ar +``` +```python +custom_cov = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef("covalent_radius", "demo_user_cov"), + values={"C": 0.77, "O": 0.67}, + name="Demo user covalent set", + units="angstrom", + description="Example custom set for notebook usage.", + notes=("Notebook example",), +) + +policy = ar.RadiiPolicy( + kind="covalent", + base_set=custom_cov, + transfers=( + ar.SubstitutionTransfer( + source=ar.DatasetRef("covalent_radius", "cordero2008") + ), + ), +) + +for symbol in ("C", "O", "N"): + print(symbol, ar.lookup_covalent_radius(symbol, policy=policy)) +``` +**Output** +```text +C LookupResult(value=0.77, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0) +O LookupResult(value=0.67, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0) +N LookupResult(value=0.71, source='transfer_substitution', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', 
set_id='cordero2008'),), is_placeholder=False, fit=None, notes=('missing in base set; substituted from transfer source',), transfer_depth=1) +``` +```python +for info in ar.list_radii_set_infos("van_der_waals", usage_role="target"): + print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context) + +rahm = ar.get_dataset_info(ar.DatasetRef("atomic_radius", "rahm2016")) +print(rahm.name) +print(rahm.semantic_class, rahm.phase_context, rahm.usage_role) +``` +**Output** +```text +bondi1964 vdw_compiled compiled_experimental mixed_or_legacy +rowland_taylor1996 vdw_structural structural condensed_phase +alvarez2013 vdw_structural structural condensed_phase +chernyshov2020 vdw_structural_typed_reduced structural condensed_phase +Rahm isodensity atomic radii (ρ=0.001 e/bohr³) +atomic_isodensity isolated_atom support +``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e0952f2 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,50 @@ +site_name: atomref +site_url: https://delonecommons.github.io/atomref/ +repo_url: https://github.com/DeloneCommons/atomref +repo_name: DeloneCommons/atomref + +theme: + name: material + +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: [src] + options: + show_root_heading: true + show_source: false + +nav: + - Home: index.md + - Guide: + - Install: guide/install.md + - Quickstart: guide/quickstart.md + - Policies: guide/policies.md + - Custom sets: guide/custom_sets.md + - Non-goals: guide/non_goals.md + - Datasets: + - Overview: datasets/index.md + - Covalent radius: datasets/covalent_radius.md + - van der Waals radius: datasets/van_der_waals_radius.md + - Atomic radius: datasets/atomic_radius.md + - X–H bond length: datasets/xh_bond_length.md + - Notebooks: + - Overview: guide/notebooks.md + - Quickstart notebook: notebooks/01-quickstart.md + - Policies and assessment notebook: notebooks/02-policies-and-assessment.md + - Custom sets and discovery notebook: 
notebooks/03-custom-sets-and-discovery.md + - Development: + - Architecture: dev/architecture.md + - Data curation: dev/data_curation.md + - Development plan: dev/dev_plan.md + - API: + - Overview: api/index.md + - atomref: api/atomref.md + - atomref.elements: api/elements.md + - atomref.registry: api/registry.md + - atomref.transfer: api/transfer.md + - atomref.policy: api/policy.md + - atomref.radii: api/radii.md + - atomref.xh: api/xh.md diff --git a/notebooks/01-quickstart.ipynb b/notebooks/01-quickstart.ipynb new file mode 100644 index 0000000..47b58d1 --- /dev/null +++ b/notebooks/01-quickstart.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# atomref quickstart\n\nThis notebook covers the main public API: element helpers, direct\n`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset\ndiscovery.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n", + "\n", + "print(ar.get_element('Cl'))\n", + "print(ar.list_quantities())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r_c = ar.get_covalent_radius('C')\n", + "r_vdw = ar.get_vdw_radius('O')\n", + "print(r_c)\n", + "print(r_vdw)\n", + "assert r_c == 0.76\n", + "assert r_vdw == 1.50\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lookup = ar.lookup_vdw_radius('Pm')\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.source)\n", + "print(lookup.resolved_from)\n", + "assert lookup.source == 'transfer_linear'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quantity = ar.get_quantity_info('atomic_radius')\n", + "print(quantity.quantity, quantity.domain, quantity.units)\n", + "\n", + "for info in ar.list_dataset_infos('van_der_waals_radius', 
usage_role='target'):\n", + " print(info.ref.set_id, info.name, info.usage_role)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vdw = ar.get_radii_set('van_der_waals', 'alvarez2013')\n", + "print(vdw.get('O'))\n", + "\n", + "support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016'))\n", + "print(support.get('Pm'))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/02-policies-and-assessment.ipynb b/notebooks/02-policies-and-assessment.ipynb new file mode 100644 index 0000000..dfe2678 --- /dev/null +++ b/notebooks/02-policies-and-assessment.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Policies and assessment\n", + "\n", + "This notebook shows how `atomref` resolves missing values through ordered\n", + "policy steps and how to inspect policy-level behavior.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "covalent_policy = ar.RadiiPolicy(\n", + " kind='covalent',\n", + " base_set='cordero2008',\n", + " transfers=(\n", + " ar.SubstitutionTransfer(\n", + " source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov')\n", + " ),\n", + " ),\n", + ")\n", + "lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy)\n", + "print(lookup.source)\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.resolved_from)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vdw_policy = ar.RadiiPolicy(\n", + " kind='van_der_waals',\n", + " 
base_set='alvarez2013',\n", + " transfers=(\n", + " ar.LinearTransfer(\n", + " predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),)\n", + " ),\n", + " ),\n", + ")\n", + "lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy)\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.source)\n", + "print(\n", + " f\"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assessment = ar.assess_radii_policy(\n", + " ['C', 'Xe', 'Pm', 'Bk'],\n", + " policy=vdw_policy,\n", + " detail=True,\n", + ")\n", + "print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing)\n", + "for row in assessment.per_element:\n", + " value = 'None' if row.lookup.value is None else f\"{row.lookup.value:.12f}\"\n", + " print(row.symbol, row.lookup.source, value)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/03-custom-sets-and-discovery.ipynb b/notebooks/03-custom-sets-and-discovery.ipynb new file mode 100644 index 0000000..58f9d92 --- /dev/null +++ b/notebooks/03-custom-sets-and-discovery.ipynb @@ -0,0 +1,79 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom sets and dataset discovery\n", + "\n", + "This notebook shows how to define a small user-provided set, plug it into a\n", + "policy, and inspect the packaged dataset catalog.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_cov = ar.ElementScalarSet.from_mapping(\n", + " 
ref=ar.DatasetRef(\"covalent_radius\", \"demo_user_cov\"),\n", + " values={\"C\": 0.77, \"O\": 0.67},\n", + " name=\"Demo user covalent set\",\n", + " units=\"angstrom\",\n", + " description=\"Example custom set for notebook usage.\",\n", + " notes=(\"Notebook example\",),\n", + ")\n", + "\n", + "policy = ar.RadiiPolicy(\n", + " kind=\"covalent\",\n", + " base_set=custom_cov,\n", + " transfers=(\n", + " ar.SubstitutionTransfer(\n", + " source=ar.DatasetRef(\"covalent_radius\", \"cordero2008\")\n", + " ),\n", + " ),\n", + ")\n", + "\n", + "for symbol in (\"C\", \"O\", \"N\"):\n", + " print(symbol, ar.lookup_covalent_radius(symbol, policy=policy))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for info in ar.list_radii_set_infos(\"van_der_waals\", usage_role=\"target\"):\n", + " print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context)\n", + "\n", + "rahm = ar.get_dataset_info(ar.DatasetRef(\"atomic_radius\", \"rahm2016\"))\n", + "print(rahm.name)\n", + "print(rahm.semantic_class, rahm.phase_context, rahm.usage_role)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b712101 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,97 @@ +[build-system] +requires = ["hatchling>=1.24"] +build-backend = "hatchling.build" + +[project] +name = "atomref" +dynamic = ["version"] +description = "Curated atomic reference data and transfer policies for geometry and structure-analysis algorithms." +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } +authors = [ + { name = "Ivan Yu. 
Chernyshov", email = "ivan.chernyshoff@gmail.com" } +] +keywords = ["chemistry", "materials", "crystallography", "reference data", "atomic radii"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Software Development :: Libraries", + "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: OS Independent", + "Typing :: Typed", +] +dependencies = [] + +[project.urls] +Homepage = "https://delonecommons.github.io/atomref/" +Documentation = "https://delonecommons.github.io/atomref/" +Repository = "https://github.com/DeloneCommons/atomref" +Issues = "https://github.com/DeloneCommons/atomref/issues" +Changelog = "https://github.com/DeloneCommons/atomref/blob/main/CHANGELOG.md" + +[project.optional-dependencies] +test = [ + "pytest>=7", + "tomli>=2; python_version < '3.11'", +] +docs = [ + "mkdocs>=1.6,<2", + "mkdocs-material>=9.5", + "mkdocstrings[python]>=0.25", + "mkdocs-include-markdown-plugin>=6.2", + "pymdown-extensions>=10.0", + "tomli>=2; python_version < '3.11'", +] +dev = [ + "build>=1.2", + "twine>=5", + "flake8>=7", +] + +[tool.hatch.version] +path = "src/atomref/__about__.py" + +[tool.hatch.build.targets.wheel] +packages = ["src/atomref"] +include = [ + "src/atomref/data/*.csv", + "src/atomref/data/*.json", +] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", + "/docs", + "/tools", + "/notebooks", + "/mkdocs.yml", + "/README.md", + "/CHANGELOG.md", + "/DEV_PLAN.md", + "/NOTICE.md", + "/LICENSE", + "/COPYING", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-ra --ignore=build --ignore=dist" 
+norecursedirs = [ + ".git", + ".pytest_cache", + "__pycache__", + ".venv", + ".tox", + "dist", + ".eggs", + "*.egg-info", +] diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py new file mode 100644 index 0000000..bbab024 --- /dev/null +++ b/src/atomref/__about__.py @@ -0,0 +1 @@ +__version__ = "0.1.4" diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py new file mode 100644 index 0000000..fb569b3 --- /dev/null +++ b/src/atomref/__init__.py @@ -0,0 +1,102 @@ +"""Public package exports for :mod:`atomref`.""" + +from .__about__ import __version__ +from .elements import ( + Element, + canonicalize_element_symbol, + get_element, + iter_elements, + is_valid_element_symbol, +) +from .policy import LookupResult, ValuePolicy, get_value, lookup_value +from .radii import ( + DEFAULT_COVALENT_POLICY, + DEFAULT_VDW_POLICY, + RadiiElementAssessment, + RadiiPolicy, + RadiiPolicyAssessment, + assess_radii_policy, + get_covalent_radius, + get_radii_set, + get_radii_set_info, + get_vdw_radius, + list_radii_set_infos, + list_radii_sets, + lookup_covalent_radius, + lookup_vdw_radius, +) +from .xh import ( + DEFAULT_XH_POLICY, + XHPolicy, + get_xh_bond_length, + get_xh_set, + get_xh_set_info, + list_xh_set_infos, + list_xh_sets, + lookup_xh_bond_length, +) +from .registry import ( + CoverageInfo, + DatasetInfo, + DatasetRef, + ElementScalarSet, + QuantityInfo, + Reference, + get_builtin_set, + get_dataset_info, + get_quantity_info, + list_dataset_ids, + list_dataset_infos, + list_quantities, +) +from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer + +__all__ = [ + "__version__", + "Element", + "canonicalize_element_symbol", + "get_element", + "iter_elements", + "is_valid_element_symbol", + "CoverageInfo", + "DatasetInfo", + "DatasetRef", + "ElementScalarSet", + "QuantityInfo", + "Reference", + "get_builtin_set", + "get_dataset_info", + "get_quantity_info", + "list_dataset_ids", + "list_dataset_infos", + "list_quantities", + "LinearFit", + 
"LinearTransfer", + "SubstitutionTransfer", + "LookupResult", + "ValuePolicy", + "lookup_value", + "get_value", + "RadiiPolicy", + "RadiiElementAssessment", + "RadiiPolicyAssessment", + "DEFAULT_COVALENT_POLICY", + "DEFAULT_VDW_POLICY", + "list_radii_sets", + "list_radii_set_infos", + "get_radii_set", + "get_radii_set_info", + "lookup_covalent_radius", + "get_covalent_radius", + "lookup_vdw_radius", + "get_vdw_radius", + "assess_radii_policy", + "XHPolicy", + "DEFAULT_XH_POLICY", + "list_xh_sets", + "list_xh_set_infos", + "get_xh_set", + "get_xh_set_info", + "lookup_xh_bond_length", + "get_xh_bond_length", +] diff --git a/src/atomref/data/__init__.py b/src/atomref/data/__init__.py new file mode 100644 index 0000000..835d4e0 --- /dev/null +++ b/src/atomref/data/__init__.py @@ -0,0 +1 @@ +"""Packaged data files for atomref.""" diff --git a/src/atomref/data/covalent.csv b/src/atomref/data/covalent.csv new file mode 100644 index 0000000..053a71a --- /dev/null +++ b/src/atomref/data/covalent.csv @@ -0,0 +1,119 @@ +z,cordero2008,csd_legacy_cov +1,0.31,0.23 +2,0.28,1.5 +3,1.28,1.28 +4,0.96,0.96 +5,0.84,0.83 +6,0.76,0.68 +7,0.71,0.68 +8,0.66,0.68 +9,0.57,0.64 +10,0.58,1.5 +11,1.66,1.66 +12,1.41,1.41 +13,1.21,1.21 +14,1.11,1.2 +15,1.07,1.05 +16,1.05,1.02 +17,1.02,0.99 +18,1.06,1.51 +19,2.03,2.03 +20,1.76,1.76 +21,1.7,1.7 +22,1.6,1.6 +23,1.53,1.53 +24,1.39,1.39 +25,1.61,1.61 +26,1.52,1.52 +27,1.5,1.26 +28,1.24,1.24 +29,1.32,1.32 +30,1.22,1.22 +31,1.22,1.22 +32,1.2,1.17 +33,1.19,1.21 +34,1.2,1.22 +35,1.2,1.21 +36,1.16,1.5 +37,2.2,2.2 +38,1.95,1.95 +39,1.9,1.9 +40,1.75,1.75 +41,1.64,1.64 +42,1.54,1.54 +43,1.47,1.47 +44,1.46,1.46 +45,1.42,1.42 +46,1.39,1.39 +47,1.45,1.45 +48,1.44,1.54 +49,1.42,1.42 +50,1.39,1.39 +51,1.39,1.39 +52,1.38,1.47 +53,1.39,1.4 +54,1.4,1.5 +55,2.44,2.44 +56,2.15,2.15 +57,2.07,2.07 +58,2.04,2.04 +59,2.03,2.03 +60,2.01,2.01 +61,1.99,1.99 +62,1.98,1.98 +63,1.98,1.98 +64,1.96,1.96 +65,1.94,1.94 +66,1.92,1.92 +67,1.92,1.92 +68,1.89,1.89 +69,1.9,1.9 
+70,1.87,1.87 +71,1.87,1.87 +72,1.75,1.75 +73,1.7,1.7 +74,1.62,1.62 +75,1.51,1.51 +76,1.44,1.44 +77,1.41,1.41 +78,1.36,1.36 +79,1.36,1.36 +80,1.32,1.32 +81,1.45,1.45 +82,1.46,1.46 +83,1.48,1.48 +84,1.4,1.4 +85,1.5,1.21 +86,1.5,1.5 +87,2.6,2.6 +88,2.21,2.21 +89,2.15,2.15 +90,2.06,2.06 +91,2,2 +92,1.96,1.96 +93,1.9,1.9 +94,1.87,1.87 +95,1.8,1.8 +96,1.69,1.69 +97,,1.54 +98,,1.83 +99,,1.5 +100,,1.5 +101,,1.5 +102,,1.5 +103,,1.5 +104,,1.5 +105,,1.5 +106,,1.5 +107,,1.5 +108,,1.5 +109,,1.5 +110,,1.5 +111,, +112,, +113,, +114,, +115,, +116,, +117,, +118,, diff --git a/src/atomref/data/periodic_table.csv b/src/atomref/data/periodic_table.csv new file mode 100644 index 0000000..744b4aa --- /dev/null +++ b/src/atomref/data/periodic_table.csv @@ -0,0 +1,119 @@ +z,symbol,name +1,H,Hydrogen +2,He,Helium +3,Li,Lithium +4,Be,Beryllium +5,B,Boron +6,C,Carbon +7,N,Nitrogen +8,O,Oxygen +9,F,Fluorine +10,Ne,Neon +11,Na,Sodium +12,Mg,Magnesium +13,Al,Aluminium +14,Si,Silicon +15,P,Phosphorus +16,S,Sulphur +17,Cl,Chlorine +18,Ar,Argon +19,K,Potassium +20,Ca,Calcium +21,Sc,Scandium +22,Ti,Titanium +23,V,Vanadium +24,Cr,Chromium +25,Mn,Manganese +26,Fe,Iron +27,Co,Cobalt +28,Ni,Nickel +29,Cu,Copper +30,Zn,Zinc +31,Ga,Gallium +32,Ge,Germanium +33,As,Arsenic +34,Se,Selenium +35,Br,Bromine +36,Kr,Krypton +37,Rb,Rubidium +38,Sr,Strontium +39,Y,Yttrium +40,Zr,Zirconium +41,Nb,Niobium +42,Mo,Molybdenum +43,Tc,Technetium +44,Ru,Ruthenium +45,Rh,Rhodium +46,Pd,Palladium +47,Ag,Silver +48,Cd,Cadmium +49,In,Indium +50,Sn,Tin +51,Sb,Antimony +52,Te,Tellurium +53,I,Iodine +54,Xe,Xenon +55,Cs,Caesium +56,Ba,Barium +57,La,Lanthanum +58,Ce,Cerium +59,Pr,Praseodymium +60,Nd,Neodymium +61,Pm,Promethium +62,Sm,Samarium +63,Eu,Europium +64,Gd,Gadolinium +65,Tb,Terbium +66,Dy,Dysprosium +67,Ho,Holmium +68,Er,Erbium +69,Tm,Thulium +70,Yb,Ytterbium +71,Lu,Lutetium +72,Hf,Hafnium +73,Ta,Tantalum +74,W,Tungsten +75,Re,Rhenium +76,Os,Osmium +77,Ir,Iridium +78,Pt,Platinum +79,Au,Gold +80,Hg,Mercury +81,Tl,Thallium 
+82,Pb,Lead +83,Bi,Bismuth +84,Po,Polonium +85,At,Astatine +86,Rn,Radon +87,Fr,Francium +88,Ra,Radium +89,Ac,Actinium +90,Th,Thorium +91,Pa,Protactinium +92,U,Uranium +93,Np,Neptunium +94,Pu,Plutonium +95,Am,Americium +96,Cm,Curium +97,Bk,Berkelium +98,Cf,Californium +99,Es,Einsteinium +100,Fm,Fermium +101,Md,Mendelevium +102,No,Nobelium +103,Lr,Lawrencium +104,Rf,Rutherfordium +105,Db,Dubnium +106,Sg,Seaborgium +107,Bh,Bohrium +108,Hs,Hassium +109,Mt,Meitnerium +110,Ds,Darmstadtium +111,Rg,Roentgenium +112,Cn,Copernicium +113,Nh,Nihonium +114,Fl,Flerovium +115,Mc,Moscovium +116,Lv,Livermorium +117,Ts,Tennessine +118,Og,Oganesson diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json new file mode 100644 index 0000000..e6e4469 --- /dev/null +++ b/src/atomref/data/registry.json @@ -0,0 +1,506 @@ +{ + "schema_version": "0.1", + "created_from": { + "source_project": "molcryst", + "source_schema_version": "0.2", + "notes": [ + "Transformed for the initial atomref v0.1 scaffold.", + "Rahm 2016 is reclassified from van_der_waals to atomic_radius." + ] + }, + "quantities": { + "covalent_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed covalent radii intended for geometry and bonding heuristics." + }, + "van_der_waals_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed condensed-phase or contact-derived van der Waals radii." + }, + "atomic_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data." + }, + "xh_bond_length": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows." + } + }, + "datasets": { + "covalent_radius": { + "cordero2008": { + "name": "Cordero et al. 
covalent radii", + "description": "Covalent radii from Cordero et al. (2008) (last author: Alvarez).", + "semantic_class": "covalent_structural", + "origin_class": "compiled_experimental", + "phase_context": "condensed_phase", + "method_summary": "Derived from crystallographic bond distances (primarily single bonds) across the periodic table.", + "storage": { + "format": "dense_by_z_csv", + "filename": "covalent.csv", + "column": "cordero2008" + }, + "coverage": { + "n_values": 96, + "z_min": 1, + "z_max": 96, + "has_placeholders": false + }, + "placeholder_value": null, + "extraction_source": "Table 2 in B. Cordero et al. (2008), column 'r'", + "aliases": [ + "Cordero covalent radii", + "Cordero–Alvarez covalent radii", + "Alvarez covalent radii (2008)" + ], + "references": [ + { + "authors": "B. Cordero et al.", + "doi": "10.1039/B801115J", + "title": "Covalent radii revisited", + "venue": "Dalton Trans. (2008) 2832-2838" + } + ], + "notes": [ + "The source paper provides multiple radii per element for different atom types/environments; this package currently includes C(sp3) value for C and high-spin values for Mn/Fe/Co." 
+ ], + "usage_role": "target" + }, + "csd_legacy_cov": { + "name": "CSD legacy covalent radii (bond perception)", + "description": "Legacy covalent radii used in CSD software for bond assignment (Rcov).", + "semantic_class": "covalent_legacy", + "origin_class": "curated_heuristic", + "phase_context": "mixed_or_legacy", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "covalent.csv", + "column": "csd_legacy_cov" + }, + "coverage": { + "n_values": 110, + "z_min": 1, + "z_max": 110, + "has_placeholders": true + }, + "placeholder_value": 1.5, + "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'Covalent Radius'.", + "aliases": [], + "references": [ + { + "publisher": "Cambridge Crystallographic Data Centre (CCDC)", + "title": "Elemental Data and Radii (Excel)", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx" + }, + { + "authors": "B. Cordero et al.", + "doi": "10.1039/B801115J", + "title": "Covalent radii revisited", + "venue": "Dalton Trans. (2008) 2832-2838" + } + ], + "notes": [ + "CSD bond assignment heuristic: a bond A-B may be inferred if distance d satisfies Rcov(A)+Rcov(B)-t <= d <= Rcov(A)+Rcov(B)+t, with typical t=0.4 Å. (See the CCDC spreadsheet notes.)", + "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", + "Elements not yet encountered in the CSD have Rcov = 1.50 Å." 
+ ], + "usage_role": "support" + } + }, + "van_der_waals_radius": { + "bondi1964": { + "name": "Bondi van der Waals radii", + "description": "Classic van der Waals radii compiled by Bondi (1964), available for 38 elements.", + "semantic_class": "vdw_compiled", + "origin_class": "compiled_experimental", + "phase_context": "mixed_or_legacy", + "method_summary": "Bondi compiled van der Waals radii from a combination of experimental sources (e.g., crystal structures, liquid-state properties, gas kinetic data) to reproduce molecular/atomic volumes and sizes. This set is widely used as a historical reference and in many computational chemistry defaults.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "bondi1964" + }, + "coverage": { + "n_values": 38, + "z_min": 1, + "z_max": 92, + "has_placeholders": false, + "covered_z": [ + 1, + 2, + 3, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 14, + 15, + 16, + 17, + 18, + 19, + 28, + 29, + 30, + 31, + 33, + 34, + 35, + 36, + 46, + 47, + 48, + 49, + 50, + 52, + 53, + 54, + 78, + 79, + 80, + 81, + 82, + 92 + ] + }, + "placeholder_value": null, + "extraction_source": "Bondi column in Table 1 of Alvarez (2013) (used as a convenient transcription of Bondi's tabulation).", + "aliases": [ + "Bondi radii", + "Bondi vdW radii" + ], + "references": [ + { + "authors": "A. Bondi", + "title": "van der Waals Volumes and Radii", + "venue": "J. Phys. Chem. 68 (1964) 441-451", + "doi": "10.1021/j100785a001" + }, + { + "authors": "S. Alvarez", + "title": "A cartography of the van der Waals territories", + "venue": "Dalton Trans. 42 (2013) 8617-8636", + "doi": "10.1039/C3DT50599E", + "note": "Table 1 reproduces Bondi radii for 38 elements." + } + ], + "notes": [ + "Coverage is limited (38 elements, including only a few transition metals and uranium).", + "Because Bondi radii were not derived exclusively from crystal nonbonded contact statistics, they can differ slightly from later 'structural' vdW radii." 
+ ], + "usage_role": "target" + }, + "rowland_taylor1996": { + "name": "Rowland & Taylor nonbonded contact radii", + "description": "Nonbonded contact radii derived from organic crystal structures (Rowland & Taylor, 1996).", + "semantic_class": "vdw_structural", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": "Rowland & Taylor analyzed distributions of intermolecular nonbonded contact distances in organic crystal structures from the Cambridge Structural Database (CSD). They fitted/estimated characteristic contact distances and solved for per-element radii by least-squares analysis over many element-pair distance distributions.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "rowland_taylor1996" + }, + "coverage": { + "n_values": 9, + "z_min": 1, + "z_max": 53, + "has_placeholders": false, + "covered_z": [ + 1, + 6, + 7, + 8, + 9, + 16, + 17, + 35, + 53 + ] + }, + "placeholder_value": null, + "extraction_source": "Table 3 in Rowland & Taylor (1996), column 'r_c' (least-squares radii, not the normalized R_d column).", + "aliases": [ + "Rowland–Taylor radii", + "Rowland & Taylor vdW radii" + ], + "references": [ + { + "authors": "R. S. Rowland; R. Taylor", + "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii", + "venue": "J. Phys. Chem. 100 (1996) 7384-7391", + "doi": "10.1021/jp953141+" + } + ], + "notes": [ + "Coverage is intentionally limited to common organic-crystal nonmetals (H, C, N, O, F, S, Cl, Br, I).", + "Rowland & Taylor also report a normalized set (R_d) constrained to match the total of Bondi radii; this package uses the raw least-squares r_c values." 
+ ], + "usage_role": "target" + }, + "alvarez2013": { + "name": "Alvarez van der Waals radii", + "description": "van der Waals radii from Alvarez (2013).", + "semantic_class": "vdw_structural", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "alvarez2013" + }, + "coverage": { + "n_values": 93, + "z_min": 1, + "z_max": 99, + "has_placeholders": false, + "missing_z": [ + 61, + 84, + 85, + 86, + 87, + 88 + ] + }, + "placeholder_value": null, + "extraction_source": "Table 1 in Alvarez (2013), column 'r_vdW'.", + "aliases": [ + "Alvarez vdW radii", + "Alvarez (2013) r_vdW", + "Dalton Trans. vdW cartography radii" + ], + "references": [ + { + "authors": "S. Alvarez", + "doi": "10.1039/C3DT50599E", + "title": "A cartography of the van der Waals territories", + "venue": "Dalton Trans. 42 (2013) 8617-8636" + } + ], + "notes": [ + "Obtained by statistical analysis of millions of interatomic distances in the Cambridge Structural Database (CSD), locating the vdW peak after the vdW gap." + ], + "usage_role": "target" + }, + "chernyshov2020": { + "name": "Chernyshov LoS van der Waals radii", + "description": "van der Waals radii from Chernyshov et al. (ChemPhysChem 2020) using line-of-sight (LoS) classification of direct contacts.", + "semantic_class": "vdw_structural_typed_reduced", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": "Chernyshov et al. introduce a line-of-sight (LoS) criterion to identify 'direct' interatomic contacts in complex molecular crystals. 
vdW radii are then inferred from statistically analyzed contact-distance distributions for specific atom types, yielding radii (including R_half and R_max variants) intended to better reflect steric/anisotropic effects than simple distance-based heuristics.",
+        "storage": {
+          "format": "dense_by_z_csv",
+          "filename": "van_der_waals.csv",
+          "column": "chernyshov2020"
+        },
+        "coverage": {
+          "n_values": 10,
+          "z_min": 1,
+          "z_max": 53,
+          "has_placeholders": false,
+          "covered_z": [
+            1,
+            6,
+            7,
+            8,
+            9,
+            16,
+            17,
+            34,
+            35,
+            53
+          ]
+        },
+        "placeholder_value": null,
+        "extraction_source": "Table 1 in Chernyshov et al. (2020): R_max values for the 'default' atom types typical for organic compounds.",
+        "aliases": [
+          "LoS vdW radii",
+          "Chernyshov vdW radii"
+        ],
+        "references": [
+          {
+            "authors": "I. Yu. Chernyshov; I. V. Ananyev; E. A. Pidko",
+            "title": "Revisiting van der Waals Radii: From Comprehensive Structural Analysis to Knowledge-Based Classification of Interatomic Contacts",
+            "venue": "ChemPhysChem 21 (2020) 1–8",
+            "doi": "10.1002/cphc.201901083"
+          }
+        ],
+        "notes": [
+          "The source paper provides multiple radii per element for different atom types/environments; this package currently includes only the main/default R_max values used in Table 1.",
+          "Primarily targeted at elements common in organic crystals (H, C, N, O, F, S, Cl, Se, Br, I)."
+ ], + "usage_role": "target" + }, + "csd_legacy_vdw": { + "name": "CSD legacy van der Waals radii (pre-2024.3)", + "description": "Legacy van der Waals radii historically used in CSD tools (pre-2024.3).", + "semantic_class": "vdw_legacy", + "origin_class": "curated_heuristic", + "phase_context": "mixed_or_legacy", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "csd_legacy_vdw" + }, + "coverage": { + "n_values": 110, + "z_min": 1, + "z_max": 110, + "has_placeholders": true + }, + "placeholder_value": 2.0, + "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'vdW Radius' (Bondi/Rowland-Taylor based with defaults).", + "aliases": [], + "references": [ + { + "authors": "A. Bondi", + "doi": "10.1021/j100785a001", + "title": "van der Waals Volumes and Radii", + "venue": "J. Phys. Chem. 68 (1964) 441-451" + }, + { + "authors": "R. S. Rowland; R. Taylor", + "doi": "10.1021/jp953141+", + "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii", + "venue": "J. Phys. Chem. 100 (1996) 7384-7391" + }, + { + "publisher": "CCDC", + "title": "Elemental Data and Radii (Excel)", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx" + }, + { + "publisher": "CCDC blog", + "title": "Updates to van der Waals radii used in the CSD and Mercury", + "url": "https://www.ccdc.cam.ac.uk/discover/blog/updates-to-van-der-waals-radii-csd-mercury/" + } + ], + "notes": [ + "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", + "Radii that are not available in either Bondi or Rowland & Taylor versions were assigned RvdW of 2.00 Å.", + "The CSD 2024.3 release updated the vdW radii used in CSD and Mercury to Alvarez-derived values (see CCDC blog post)." 
+ ], + "usage_role": "support" + } + }, + "atomic_radius": { + "rahm2016": { + "name": "Rahm isodensity atomic radii (ρ=0.001 e/bohr³)", + "description": "Computed atomic radii for neutral atoms (elements 1–96) defined by the ρ=0.001 e/bohr³ electron-density isosurface (Rahm et al., 2016).", + "semantic_class": "atomic_isodensity", + "origin_class": "computational", + "phase_context": "isolated_atom", + "method_summary": "Rahm et al. computed relativistic all-electron DFT electron densities (close to the basis-set limit) for isolated atoms and ions. Radii are defined by an electron-density threshold, producing a consistent, theory-based size measure that correlates well with structural van der Waals radii derived from crystal structures.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "rahm2016" + }, + "coverage": { + "n_values": 96, + "z_min": 1, + "z_max": 96, + "has_placeholders": false + }, + "placeholder_value": null, + "extraction_source": "Supporting Information for Rahm et al. (2016), Table S1: neutral-atom radii for elements 1–96.", + "aliases": [ + "Rahm radii", + "Rahm–Hoffmann–Ashcroft atomic radii", + "0.001 e/bohr^3 radii" + ], + "references": [ + { + "authors": "M. Rahm; R. Hoffmann; N. W. Ashcroft", + "title": "Atomic and Ionic Radii of Elements 1–96", + "venue": "Chem. Eur. J. 22 (2016) 14625–14632", + "doi": "10.1002/chem.201602949" + }, + { + "title": "Chem. Eur. J. 2016, 22, 14625–14632 (Rahm et al.) – Misc. Information", + "url": "http://dx.doi.org/10.1002/chem.201602949", + "publisher": "Supporting Information", + "note": "Table S1 contains the neutral-atom radii used here." 
+ } + ], + "notes": [ + "The original work also reports cationic radii (+1) for the first 96 elements and selected anionic radii (−1) for some elements; these are not yet included in the current CSV.", + "In atomref this dataset is classified as atomic support data, not as a direct condensed-phase van der Waals-radius set, because it describes isolated atoms in vacuum and is used here primarily as a transferable baseline." + ], + "usage_role": "support" + } + }, + "xh_bond_length": { + "csd_legacy_xh_cno": { + "name": "CSD legacy X-H neutron-normalisation targets (C/N/O)", + "description": "Fixed C-H, N-H, and O-H target bond lengths used by ConQuest for hydrogen-position normalisation.", + "semantic_class": "xh_neutron_normalisation", + "origin_class": "compiled_experimental", + "phase_context": "condensed_phase", + "method_summary": "Sparse parent-element target set for hydrogen normalisation. ConQuest moves H along the experimentally determined X-H vector to these neutron-derived distances.", + "storage": { + "format": "dense_by_z_csv", + "filename": "xh_bond_length.csv", + "column": "csd_legacy_xh_cno" + }, + "coverage": { + "n_values": 3, + "z_min": 6, + "z_max": 8, + "has_placeholders": false, + "covered_z": [ + 6, + 7, + 8 + ], + "missing_z": [ + 1, + 2, + 3, + 4, + 5 + ] + }, + "placeholder_value": null, + "extraction_source": "ConQuest User Guide and Tutorials, section 'Hydrogen Atom Location in Crystal Structure Analyses'.", + "aliases": [ + "CSD X-H normalisation defaults", + "ConQuest X-H normalisation", + "CSD legacy X-H" + ], + "references": [ + { + "publisher": "Cambridge Crystallographic Data Centre (CCDC)", + "title": "ConQuest User Guide and Tutorials", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/C82017ED-FAE4-4D93-BA5A-8D841F1E4314/ConQuest-UserGuide_2020_1.pdf", + "note": "Hydrogen Atom Location in Crystal Structure Analyses; ConQuest normalises terminal C-H, N-H, and O-H distances to 1.089 Å, 1.015 Å, and 0.993 Å, respectively." 
+ }, + { + "authors": "F. H. Allen; I. J. Bruno", + "title": "Bond lengths in organic and metal-organic compounds revisited: X-H bond lengths from neutron diffraction data", + "venue": "Acta Cryst. B66 (2010) 380-386" + } + ], + "notes": [ + "Sparse provisional target set for parent elements C, N, and O only.", + "In atomref v0.1.x this dataset seeds transfer-based inference for other parent elements rather than claiming direct curated coverage beyond C/N/O.", + "Fuller X-H dataset and policy support is planned for atomref 0.2.x." + ], + "usage_role": "target" + } + } + } +} diff --git a/src/atomref/data/van_der_waals.csv b/src/atomref/data/van_der_waals.csv new file mode 100644 index 0000000..86e7be3 --- /dev/null +++ b/src/atomref/data/van_der_waals.csv @@ -0,0 +1,119 @@ +z,bondi1964,rowland_taylor1996,alvarez2013,chernyshov2020,csd_legacy_vdw,rahm2016 +1,1.2,1.1,1.2,1.21,1.09,1.54 +2,1.4,,1.43,,1.4,1.34 +3,1.81,,2.12,,1.82,2.2 +4,,,1.98,,2,2.19 +5,,,1.91,,2,2.05 +6,1.7,1.77,1.77,1.91,1.7,1.9 +7,1.55,1.64,1.66,1.76,1.55,1.79 +8,1.52,1.58,1.5,1.74,1.52,1.71 +9,1.47,1.46,1.46,1.55,1.47,1.63 +10,1.54,,1.58,,1.54,1.56 +11,2.27,,2.5,,2.27,2.25 +12,1.73,,2.51,,1.73,2.4 +13,,,2.25,,2,2.39 +14,2.22,,2.19,,2.1,2.32 +15,1.8,,1.9,,1.8,2.23 +16,1.8,1.81,1.89,1.95,1.8,2.14 +17,1.75,1.76,1.82,1.91,1.75,2.06 +18,1.76,,1.83,,1.88,1.97 +19,2.75,,2.73,,2.75,2.34 +20,,,2.62,,2,2.7 +21,,,2.58,,2,2.63 +22,,,2.46,,2,2.57 +23,,,2.42,,2,2.52 +24,,,2.45,,2,2.33 +25,,,2.45,,2,2.42 +26,,,2.44,,2,2.26 +27,,,2.4,,2,2.22 +28,1.63,,2.4,,1.63,2.19 +29,1.4,,2.38,,1.4,2.17 +30,1.39,,2.39,,1.39,2.22 +31,1.87,,2.32,,1.87,2.33 +32,,,2.29,,2,2.34 +33,1.85,,1.88,,1.85,2.31 +34,1.9,,1.82,2.04,1.9,2.24 +35,1.83,1.87,1.86,2,1.85,2.19 +36,2.02,,2.25,,2.02,2.12 +37,,,3.21,,2,2.4 +38,,,2.84,,2,2.79 +39,,,2.75,,2,2.74 +40,,,2.52,,2,2.68 +41,,,2.56,,2,2.51 +42,,,2.45,,2,2.44 +43,,,2.44,,2,2.41 +44,,,2.46,,2,2.37 +45,,,2.44,,2,2.33 +46,1.63,,2.15,,1.63,2.15 +47,1.72,,2.53,,1.72,2.25 +48,1.62,,2.49,,1.58,2.38 
+49,1.93,,2.43,,1.93,2.46 +50,2.17,,2.42,,2.17,2.48 +51,,,2.47,,2,2.46 +52,2,,1.99,,2.06,2.42 +53,1.98,2.03,2.04,2.17,1.98,2.38 +54,2.16,,2.06,,2.16,2.32 +55,,,3.48,,2,2.49 +56,,,3.03,,2,2.93 +57,,,2.98,,2,2.84 +58,,,2.88,,2,2.82 +59,,,2.92,,2,2.86 +60,,,2.95,,2,2.84 +61,,,,,2,2.83 +62,,,2.9,,2,2.8 +63,,,2.87,,2,2.8 +64,,,2.83,,2,2.77 +65,,,2.79,,2,2.76 +66,,,2.87,,2,2.75 +67,,,2.81,,2,2.73 +68,,,2.83,,2,2.72 +69,,,2.79,,2,2.71 +70,,,2.8,,2,2.77 +71,,,2.74,,2,2.7 +72,,,2.63,,2,2.64 +73,,,2.53,,2,2.58 +74,,,2.57,,2,2.53 +75,,,2.49,,2,2.49 +76,,,2.48,,2,2.44 +77,,,2.41,,2,2.33 +78,1.72,,2.29,,1.72,2.3 +79,1.66,,2.32,,1.66,2.26 +80,1.7,,2.45,,1.55,2.29 +81,1.96,,2.47,,1.96,2.42 +82,2.02,,2.6,,2.02,2.49 +83,,,2.54,,2,2.5 +84,,,,,2,2.5 +85,,,,,2,2.47 +86,,,,,2,2.43 +87,,,,,2,2.58 +88,,,,,2,2.92 +89,,,2.8,,2,2.93 +90,,,2.93,,2,2.89 +91,,,2.88,,2,2.85 +92,1.86,,2.71,,1.86,2.83 +93,,,2.82,,2,2.8 +94,,,2.81,,2,2.78 +95,,,2.83,,2,2.76 +96,,,3.05,,2,2.76 +97,,,3.4,,2, +98,,,3.05,,2, +99,,,2.7,,2, +100,,,,,2, +101,,,,,2, +102,,,,,2, +103,,,,,2, +104,,,,,2, +105,,,,,2, +106,,,,,2, +107,,,,,2, +108,,,,,2, +109,,,,,2, +110,,,,,2, +111,,,,,, +112,,,,,, +113,,,,,, +114,,,,,, +115,,,,,, +116,,,,,, +117,,,,,, +118,,,,,, diff --git a/src/atomref/data/xh_bond_length.csv b/src/atomref/data/xh_bond_length.csv new file mode 100644 index 0000000..4ae4bca --- /dev/null +++ b/src/atomref/data/xh_bond_length.csv @@ -0,0 +1,119 @@ +z,csd_legacy_xh_cno +1, +2, +3, +4, +5, +6,1.089 +7,1.015 +8,0.993 +9, +10, +11, +12, +13, +14, +15, +16, +17, +18, +19, +20, +21, +22, +23, +24, +25, +26, +27, +28, +29, +30, +31, +32, +33, +34, +35, +36, +37, +38, +39, +40, +41, +42, +43, +44, +45, +46, +47, +48, +49, +50, +51, +52, +53, +54, +55, +56, +57, +58, +59, +60, +61, +62, +63, +64, +65, +66, +67, +68, +69, +70, +71, +72, +73, +74, +75, +76, +77, +78, +79, +80, +81, +82, +83, +84, +85, +86, +87, +88, +89, +90, +91, +92, +93, +94, +95, +96, +97, +98, +99, +100, +101, +102, +103, +104, +105, +106, +107, 
+108, +109, +110, +111, +112, +113, +114, +115, +116, +117, +118, diff --git a/src/atomref/elements.py b/src/atomref/elements.py new file mode 100644 index 0000000..5245b80 --- /dev/null +++ b/src/atomref/elements.py @@ -0,0 +1,110 @@ +"""Periodic-table access for stable element identity.""" + +from __future__ import annotations + +import csv +import re +from dataclasses import dataclass +from functools import lru_cache +from importlib import resources + + +_MISSING_TOKENS = {"", "?", "."} +_LEADING_ALPHA_RE = re.compile(r"([A-Za-z]{1,3})") + + +@dataclass(frozen=True, slots=True) +class Element: + """Chemical element identity keyed by atomic number and symbol.""" + + z: int + symbol: str + name: str + + +def _normalize_element_token(token: str | None) -> str | None: + """Strip quotes and obvious missing-value markers from a token.""" + + if token is None: + return None + + raw = token.strip() + if raw in _MISSING_TOKENS: + return None + + if (raw.startswith("'") and raw.endswith("'")) or ( + raw.startswith('"') and raw.endswith('"') + ): + raw = raw[1:-1].strip() + if raw in _MISSING_TOKENS: + return None + + if not raw: + return None + return raw + + +def canonicalize_element_symbol(token: str | None) -> str | None: + """Canonicalize a free-form token to a conventional element symbol. + + The function accepts strings such as ``"cl"``, ``" Cl "`` or + ``"Cl12"`` and returns ``"Cl"`` when a leading element-like token can be + identified. Missing-value markers and non-element strings return ``None``. 
+ """ + + raw = _normalize_element_token(token) + if raw is None: + return None + + match = _LEADING_ALPHA_RE.match(raw) + if match is None: + return None + + letters = match.group(1) + return letters[0].upper() + letters[1:].lower() + + +@lru_cache(maxsize=1) +def _load_elements_by_symbol() -> dict[str, Element]: + """Load the packaged periodic table into a symbol-keyed mapping.""" + + table_path = resources.files("atomref.data").joinpath("periodic_table.csv") + with table_path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle) + out: dict[str, Element] = {} + for row in reader: + z = int(row["z"]) + symbol = row["symbol"] + name = row["name"] + out[symbol] = Element(z=z, symbol=symbol, name=name) + return out + + +@lru_cache(maxsize=1) +def _elements_in_z_order() -> tuple[Element, ...]: + """Return packaged elements sorted by increasing atomic number.""" + + return tuple(sorted(_load_elements_by_symbol().values(), key=lambda e: e.z)) + + +def is_valid_element_symbol(symbol: str | None) -> bool: + """Return ``True`` if ``symbol`` is a known packaged element symbol.""" + + if symbol is None: + return False + return symbol in _load_elements_by_symbol() + + +def get_element(symbol: str | None) -> Element | None: + """Look up packaged element identity from a symbol-like token.""" + + sym = canonicalize_element_symbol(symbol) + if sym is None: + return None + return _load_elements_by_symbol().get(sym) + + +def iter_elements() -> tuple[Element, ...]: + """Return all packaged elements in increasing atomic-number order.""" + + return _elements_in_z_order() diff --git a/src/atomref/errors.py b/src/atomref/errors.py new file mode 100644 index 0000000..d31660a --- /dev/null +++ b/src/atomref/errors.py @@ -0,0 +1,17 @@ +"""Package-local exceptions used across :mod:`atomref`.""" + + +class AtomrefError(Exception): + """Base class for package-defined errors.""" + + +class DatasetError(AtomrefError): + """Raised when packaged data or registry 
metadata are invalid.""" + + +class MissingValueError(AtomrefError): + """Raised when a required reference value is unavailable.""" + + +class PolicyError(AtomrefError): + """Raised for invalid policy configuration or transfer resolution.""" diff --git a/src/atomref/policy.py b/src/atomref/policy.py new file mode 100644 index 0000000..79cc9f3 --- /dev/null +++ b/src/atomref/policy.py @@ -0,0 +1,794 @@ +"""Generic value-policy resolution for element-indexed scalar datasets.""" + +from __future__ import annotations + +from collections.abc import Mapping +import contextvars +from dataclasses import dataclass, field +from functools import lru_cache +import math +from types import MappingProxyType +from typing import Generic, Literal, TypeVar + +from .elements import ( + canonicalize_element_symbol, + is_valid_element_symbol, + iter_elements, +) +from .errors import PolicyError +from .registry import ( + DatasetLike, + DatasetRef, + ElementScalarSet, + _is_placeholder_value, + get_builtin_set, + resolve_dataset_like, +) +from .transfer import ( + LinearFit, + LinearTransfer, + SubstitutionTransfer, + SupportsValuePolicy, + TransferModel, +) + +K = TypeVar("K") + +LookupSource = Literal[ + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", + "missing", +] + +PolicyToken = tuple[str, int] +_ACTIVE_POLICY_TOKENS: contextvars.ContextVar[tuple[PolicyToken, ...]] = ( + contextvars.ContextVar("atomref_active_policy_tokens", default=()) +) + + +@dataclass(frozen=True, slots=True) +class LookupResult: + """Result of resolving one value through a policy. + + ``value`` carries the final scalar value when one could be produced, while + ``source`` and the remaining metadata explain how that value was obtained. + ``transfer_depth`` counts how many transfer steps were involved in producing + the returned value. Direct base and override values therefore have depth 0. 
+ """ + + value: float | None + source: LookupSource + target: DatasetRef + resolved_from: tuple[DatasetRef, ...] = () + is_placeholder: bool = False + fit: LinearFit | None = None + notes: tuple[str, ...] = () + transfer_depth: int = 0 + + def __float__(self) -> float: + """Coerce the resolved value to ``float`` or raise if it is missing.""" + + if self.value is None: + raise TypeError("reference value is missing") + return float(self.value) + + +@dataclass(frozen=True, slots=True) +class ValuePolicy(Generic[K]): + """Ordered rule set for resolving element-domain scalar values. + + The current runtime resolves only element-domain policies even though the + metadata layer already records a more general ``domain`` concept. During + construction, element-domain override keys are normalized to canonical + element symbols and validated as finite floats. + """ + + base: DatasetLike + transfers: tuple[TransferModel, ...] = () + overrides: Mapping[K, float] = field(default_factory=dict) + fallback: float | None = None + blocked: tuple[str, ...] 
= () + + def __post_init__(self) -> None: + """Validate and normalize policy configuration eagerly.""" + + if self.fallback is not None: + object.__setattr__( + self, + "fallback", + _coerce_policy_float(self.fallback, what="policy fallback"), + ) + + base_set = resolve_dataset_like(self.base) + if base_set.info.domain != "element": + return + + normalized_blocked: list[str] = [] + seen_blocked: set[str] = set() + for key in self.blocked: + if not isinstance(key, str): + raise PolicyError( + "element-domain blocked keys must be element-symbol strings" + ) + sym = _normalize_element_symbol(key) + if sym is None: + raise PolicyError(f"invalid blocked element symbol: {key!r}") + if sym not in seen_blocked: + normalized_blocked.append(sym) + seen_blocked.add(sym) + object.__setattr__(self, "blocked", tuple(normalized_blocked)) + + normalized_overrides: dict[str, float] = {} + seen_original_keys: dict[str, str] = {} + for key, value in self.overrides.items(): + if not isinstance(key, str): + raise PolicyError( + "element-domain policy overrides must be keyed by element " + "symbols" + ) + sym = _normalize_element_symbol(key) + if sym is None: + raise PolicyError(f"invalid override element symbol: {key!r}") + if sym in seen_blocked: + raise PolicyError(f"override key {key!r} is blocked in this policy") + previous = seen_original_keys.get(sym) + if previous is not None and previous != key: + raise PolicyError( + f"override keys {previous!r} and {key!r} both normalize to " + f"{sym!r}" + ) + seen_original_keys[sym] = key + normalized_overrides[sym] = _coerce_policy_float( + value, + what=f"override value for {key!r}", + ) + + object.__setattr__( + self, + "overrides", + MappingProxyType(normalized_overrides), + ) + + +@dataclass(frozen=True, slots=True) +class _ResolvedElementSource: + """Internal representation of an element-domain transfer source.""" + + ref: DatasetRef + values_by_z: tuple[float | None, ...] + placeholder_by_z: tuple[bool, ...] 
+ lookup_source_by_z: tuple[LookupSource | None, ...] + transfer_depth_by_z: tuple[int | None, ...] + via_policy: bool = False + + +@dataclass(frozen=True, slots=True) +class _TransferSourceValue: + """Internal representation of one value obtained from a transfer source.""" + + value: float + ref: DatasetRef + resolved_from: tuple[DatasetRef, ...] + is_placeholder: bool + via_policy: bool = False + lookup_source: LookupSource | None = None + notes: tuple[str, ...] = () + transfer_depth: int = 0 + + +def _coerce_policy_float(value: object, *, what: str) -> float: + """Return a finite float for policy configuration values.""" + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + return out + + +def _normalize_element_symbol(symbol: str | None) -> str | None: + """Normalize user input to a packaged element symbol. + + The current resolver treats ``D`` and ``T`` as hydrogen aliases. + """ + + cand = canonicalize_element_symbol(symbol) + if cand in {"D", "T"}: + cand = "H" + if cand is None: + return None + if not is_valid_element_symbol(cand): + return None + return cand + + +def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: + """Return the target dataset reference implied by a policy base.""" + + return resolve_dataset_like(policy.base).ref + + +def _policy_resolution_tokens( + policy: ValuePolicy[object], + *, + owner: object | None = None, +) -> tuple[PolicyToken, ...]: + """Return all tokens that should be considered active for one resolution. + + We always track the concrete :class:`ValuePolicy` object identity. When a + wrapper object such as :class:`atomref.radii.RadiiPolicy` or + :class:`atomref.xh.XHPolicy` is the logical source, we also track the + wrapper identity so recursion through freshly materialized generic policies + is still detected. 
+ """ + + tokens: list[PolicyToken] = [("policy", id(policy))] + if owner is not None: + tokens.append((f"owner:{type(owner).__qualname__}", id(owner))) + return tuple(tokens) + + +def _lookup_value_with_owner( + symbol: str | None, + *, + policy: ValuePolicy[str], + owner: object | None, +) -> LookupResult: + """Internal lookup helper that carries wrapper identity for cycle checks.""" + + return _resolve_value(symbol, policy=policy, resolution_owner=owner) + + +def _coerce_nested_policy( + source: object, +) -> tuple[ValuePolicy[str] | None, object | None]: + """Return ``source`` as a generic value policy and its logical owner.""" + + if isinstance(source, ValuePolicy): + return source, None + if isinstance(source, SupportsValuePolicy): + nested = source.as_value_policy() + if not isinstance(nested, ValuePolicy): + raise PolicyError("policy-like transfer sources must return ValuePolicy") + return nested, source + return None, None + + +def _materialize_transfer_source( + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], +) -> _ResolvedElementSource: + """Materialize any element-domain transfer source into dense by-Z arrays.""" + + nested_policy, nested_owner = _coerce_nested_policy(source) + if nested_policy is None: + dataset = resolve_dataset_like(source) + placeholders = tuple( + False + if value is None + else _is_placeholder_value(dataset.info, float(value)) + for value in dataset.values_by_z + ) + lookup_sources = tuple( + "base" if value is not None else None for value in dataset.values_by_z + ) + transfer_depths = tuple( + 0 if value is not None else None for value in dataset.values_by_z + ) + return _ResolvedElementSource( + ref=dataset.ref, + values_by_z=dataset.values_by_z, + placeholder_by_z=placeholders, + lookup_source_by_z=lookup_sources, + transfer_depth_by_z=transfer_depths, + via_policy=False, + ) + + target = _resolve_target_ref(nested_policy) + n_z = max(elem.z for elem in iter_elements()) + values: list[float | None] = [None] * 
(n_z + 1) + placeholders: list[bool] = [False] * (n_z + 1) + lookup_sources: list[LookupSource | None] = [None] * (n_z + 1) + transfer_depths: list[int | None] = [None] * (n_z + 1) + for elem in iter_elements(): + lookup = _lookup_value_with_owner( + elem.symbol, + policy=nested_policy, + owner=nested_owner, + ) + values[elem.z] = lookup.value + if lookup.value is not None: + placeholders[elem.z] = lookup.is_placeholder + lookup_sources[elem.z] = lookup.source + transfer_depths[elem.z] = lookup.transfer_depth + return _ResolvedElementSource( + ref=target, + values_by_z=tuple(values), + placeholder_by_z=tuple(placeholders), + lookup_source_by_z=tuple(lookup_sources), + transfer_depth_by_z=tuple(transfer_depths), + via_policy=True, + ) + + +def _lookup_transfer_source_value( + symbol: str, + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], +) -> tuple[_TransferSourceValue | None, str | None]: + """Resolve one element value from a transfer source or nested policy.""" + + nested_policy, nested_owner = _coerce_nested_policy(source) + if nested_policy is None: + source_set = resolve_dataset_like(source) + value = source_set.get(symbol) + if value is None: + return None, f"no value in {source_set.ref.set_id}" + value_f = float(value) + return ( + _TransferSourceValue( + value=value_f, + ref=source_set.ref, + resolved_from=(source_set.ref,), + is_placeholder=_is_placeholder_value(source_set.info, value_f), + via_policy=False, + lookup_source="base", + notes=(), + transfer_depth=0, + ), + None, + ) + + lookup = _lookup_value_with_owner( + symbol, + policy=nested_policy, + owner=nested_owner, + ) + if lookup.value is None: + if lookup.notes: + return ( + None, + "policy source returned no value: " + "; ".join(lookup.notes), + ) + return None, "policy source returned no value" + + return ( + _TransferSourceValue( + value=float(lookup.value), + ref=_resolve_target_ref(nested_policy), + resolved_from=lookup.resolved_from, + is_placeholder=lookup.is_placeholder, + 
via_policy=True, + lookup_source=lookup.source, + notes=lookup.notes, + transfer_depth=lookup.transfer_depth, + ), + None, + ) + + +def _transfer_source_is_allowed( + lookup_source: LookupSource | None, + transfer_depth: int | None, + *, + allowed_sources: tuple[str, ...], + max_depth: int, +) -> bool: + """Return whether a nested predictor value may participate downstream.""" + + if lookup_source is None or transfer_depth is None: + return False + return lookup_source in allowed_sources and transfer_depth <= max_depth + + +def _explain_rejected_transfer_source( + *, + source_role: str, + lookup_source: LookupSource | None, + transfer_depth: int | None, + allowed_sources: tuple[str, ...], + max_depth: int, +) -> str: + """Return a human-readable explanation for a rejected nested source.""" + + if lookup_source is None or transfer_depth is None: + return f"{source_role} policy source did not return a usable value" + if lookup_source not in allowed_sources: + allowed = ", ".join(allowed_sources) + return ( + f"{source_role} policy source resolved via {lookup_source}, which is " + f"excluded by {source_role}_sources=({allowed})" + ) + return ( + f"{source_role} policy source transfer depth {transfer_depth} exceeds " + f"allowed maximum {max_depth} ({source_role}_max_depth)" + ) + + +def _fit_linear_transfer( + base_set: ElementScalarSet, + predictor_source: _ResolvedElementSource, + *, + min_points: int, + exclude_placeholders: bool, + fit_sources: tuple[str, ...], + fit_max_depth: int, +) -> LinearFit: + """Fit a one-predictor linear transfer model between two sources.""" + + xs: list[float] = [] + ys: list[float] = [] + filtered_by_fit_restrictions = 0 + + n_z = min(len(base_set.values_by_z), len(predictor_source.values_by_z)) + for z in range(1, n_z): + y = base_set.values_by_z[z] + x = predictor_source.values_by_z[z] + if y is None or x is None: + continue + if not _transfer_source_is_allowed( + predictor_source.lookup_source_by_z[z], + 
predictor_source.transfer_depth_by_z[z], + allowed_sources=fit_sources, + max_depth=fit_max_depth, + ): + filtered_by_fit_restrictions += 1 + continue + y_f = float(y) + x_f = float(x) + if exclude_placeholders and ( + _is_placeholder_value(base_set.info, y_f) + or predictor_source.placeholder_by_z[z] + ): + continue + xs.append(x_f) + ys.append(y_f) + + n = len(xs) + if n < min_points: + if predictor_source.via_policy and filtered_by_fit_restrictions > 0: + raise PolicyError( + "not enough overlapping elements to fit linear transfer after " + "applying fit source constraints (fit-source restrictions)" + ) + raise PolicyError("not enough overlapping elements to fit linear transfer") + + x_mean = sum(xs) / n + y_mean = sum(ys) / n + sxx = sum((x - x_mean) ** 2 for x in xs) + if sxx == 0: + raise PolicyError("cannot fit linear transfer: zero predictor variance") + + sxy = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, ys)) + slope = sxy / sxx + intercept = y_mean - slope * x_mean + + y_hat = [slope * x + intercept for x in xs] + sse = sum((y - yh) ** 2 for y, yh in zip(ys, y_hat)) + sst = sum((y - y_mean) ** 2 for y in ys) + r2 = 1.0 - sse / sst if sst != 0 else 1.0 + rmse = math.sqrt(sse / n) + + return LinearFit( + coefficients=(slope,), + intercept=intercept, + n_points=n, + r2=r2, + rmse=rmse, + ) + + +@lru_cache(maxsize=None) +def _fit_linear_transfer_cached( + base_ref: DatasetRef, + predictor_ref: DatasetRef, + min_points: int, + exclude_placeholders: bool, + fit_sources: tuple[str, ...], + fit_max_depth: int, +) -> LinearFit: + """Cache fits between two packaged datasets for repeated reuse.""" + + return _fit_linear_transfer( + get_builtin_set(base_ref), + _materialize_transfer_source(predictor_ref), + min_points=min_points, + exclude_placeholders=exclude_placeholders, + fit_sources=fit_sources, + fit_max_depth=fit_max_depth, + ) + + +def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None: + """Return the fit object for a 
    transfer model when it needs one."""

    # Only LinearTransfer needs a fit; substitution transfers copy values.
    if not isinstance(transfer, LinearTransfer):
        return None
    if len(transfer.predictors) != 1:
        raise PolicyError(
            "LinearTransfer currently supports exactly one predictor source"
        )

    predictor = transfer.predictors[0]
    # Packaged-ref pairs are hashable and immutable, so they go through the
    # cached fit path; anything else is fitted fresh each call.
    if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef):
        return _fit_linear_transfer_cached(
            base,
            predictor,
            transfer.min_points,
            transfer.exclude_placeholders,
            transfer.fit_sources,
            transfer.fit_max_depth,
        )
    return _fit_linear_transfer(
        resolve_dataset_like(base),
        _materialize_transfer_source(predictor),
        min_points=transfer.min_points,
        exclude_placeholders=transfer.exclude_placeholders,
        fit_sources=transfer.fit_sources,
        fit_max_depth=transfer.fit_max_depth,
    )


def _apply_substitution_transfer(
    symbol: str,
    *,
    target: DatasetRef,
    transfer: SubstitutionTransfer,
) -> tuple[LookupResult | None, str | None]:
    """Try to resolve ``symbol`` by direct substitution from another source.

    Returns ``(result, None)`` on success or ``(None, reason)`` so the caller
    can collect per-transfer notes.
    """

    source_value, note = _lookup_transfer_source_value(symbol, transfer.source)
    if source_value is None:
        return None, note

    notes = [
        "missing in base set; substituted from policy source"
        if source_value.via_policy
        else "missing in base set; substituted from transfer source"
    ]
    if source_value.via_policy and source_value.lookup_source not in (None, "base"):
        notes.append(
            f"policy source resolved the value via {source_value.lookup_source}"
        )
    if source_value.is_placeholder:
        notes.append("transfer source value is marked as a placeholder")
    return (
        LookupResult(
            value=source_value.value,
            source="transfer_substitution",
            target=target,
            resolved_from=source_value.resolved_from,
            is_placeholder=source_value.is_placeholder,
            notes=tuple(notes),
            # Substitution adds one hop on top of the source's own depth.
            transfer_depth=source_value.transfer_depth + 1,
        ),
        None,
    )


def _apply_linear_transfer(
    symbol: str,
    *,
    base: DatasetLike,
    target: DatasetRef,
    transfer: LinearTransfer,
) -> tuple[LookupResult | None, str | None]:
    """Try to resolve ``symbol`` through linear transfer from predictor data."""

    if len(transfer.predictors) != 1:
        raise PolicyError(
            "LinearTransfer currently supports exactly one predictor source"
        )

    predictor_value, note = _lookup_transfer_source_value(
        symbol,
        transfer.predictors[0],
    )
    if predictor_value is None:
        return None, note

    # Prediction-time gate: the predictor value must come from an allowed
    # source and stay within the configured transfer depth.
    if not _transfer_source_is_allowed(
        predictor_value.lookup_source,
        predictor_value.transfer_depth,
        allowed_sources=transfer.prediction_sources,
        max_depth=transfer.prediction_max_depth,
    ):
        return (
            None,
            _explain_rejected_transfer_source(
                source_role="prediction",
                lookup_source=predictor_value.lookup_source,
                transfer_depth=predictor_value.transfer_depth,
                allowed_sources=transfer.prediction_sources,
                max_depth=transfer.prediction_max_depth,
            ),
        )

    if transfer.exclude_placeholders and predictor_value.is_placeholder:
        if predictor_value.via_policy:
            return None, "predictor value from policy source is a placeholder"
        return None, f"predictor value in {predictor_value.ref.set_id} is a placeholder"

    fit = _fit_transfer_model(base, transfer)
    if fit is None:
        return None, "no fit available for linear transfer"
    predicted = fit.coefficients[0] * predictor_value.value + fit.intercept

    notes = ["missing in base set; inferred via linear transfer"]
    if predictor_value.via_policy:
        notes.append("predictor value supplied by policy source")
        notes.append(
            "linear fit applied fit-source and transfer-depth limits to "
            "policy-materialized predictor values"
        )
        if predictor_value.lookup_source not in (None, "base"):
            notes.append(
                "policy predictor resolved the value via "
                f"{predictor_value.lookup_source}"
            )

    return (
        LookupResult(
            value=float(predicted),
            source="transfer_linear",
            target=target,
            resolved_from=predictor_value.resolved_from,
            # A regression prediction is never itself a placeholder.
            is_placeholder=False,
            fit=fit,
            notes=tuple(notes),
            transfer_depth=predictor_value.transfer_depth + 1,
        ),
        None,
    )


def _resolve_value(
    symbol: str | None,
    *,
    policy: ValuePolicy[str],
    resolution_owner: object | None = None,
) -> LookupResult:
    """Resolve a value through override, base, transfer, and fallback steps.

    Resolution order: blocked check, override, base dataset, each transfer in
    declaration order, then fallback. A ContextVar stack of identity tokens
    guards against policies that (directly or via wrappers) reference
    themselves as transfer sources.

    Raises:
        PolicyError: cyclic policy resolution, a non-element base dataset, or
            an unsupported transfer model.
    """

    active_tokens = _ACTIVE_POLICY_TOKENS.get()
    resolution_tokens = _policy_resolution_tokens(policy, owner=resolution_owner)
    if any(token in active_tokens for token in resolution_tokens):
        raise PolicyError("cyclic policy resolution detected")

    # Push our tokens for the duration of this resolution; the finally block
    # always pops them, even when a transfer raises.
    stack_token = _ACTIVE_POLICY_TOKENS.set(active_tokens + resolution_tokens)
    try:
        target = _resolve_target_ref(policy)
        base_set = resolve_dataset_like(policy.base)
        if base_set.info.domain != "element":
            raise PolicyError(
                "the resolver currently supports only element-domain datasets"
            )

        sym = _normalize_element_symbol(symbol)
        if sym is None:
            note = "unknown element" if symbol is not None else "missing element symbol"
            return LookupResult(
                value=None,
                source="missing",
                target=target,
                notes=(note,),
            )

        # Blocked symbols short-circuit everything, including overrides.
        if sym in policy.blocked:
            return LookupResult(
                value=None,
                source="missing",
                target=target,
                notes=(f"{sym} is blocked by this policy",),
            )

        if sym in policy.overrides:
            return LookupResult(
                value=float(policy.overrides[sym]),
                source="override",
                target=target,
                notes=("value supplied by policy override",),
                transfer_depth=0,
            )

        base_value = base_set.get(sym)
        if base_value is not None:
            base_f = float(base_value)
            is_placeholder = _is_placeholder_value(base_set.info, base_f)
            notes = (
                ("base dataset value is marked as a placeholder",)
                if is_placeholder
                else ()
            )
            return LookupResult(
                value=base_f,
                source="base",
                target=target,
                resolved_from=(base_set.ref,),
                is_placeholder=is_placeholder,
                notes=notes,
                transfer_depth=0,
            )

        # Transfers run in order; the first one that produces a value wins.
        # Failure reasons accumulate so a final miss explains every attempt.
        transfer_notes: list[str] = ["missing in base set"]
        for transfer in policy.transfers:
            if isinstance(transfer, SubstitutionTransfer):
                result, note = _apply_substitution_transfer(
                    sym,
                    target=target,
                    transfer=transfer,
                )
            elif isinstance(transfer, LinearTransfer):
                result, note = _apply_linear_transfer(
                    sym,
                    base=policy.base,
                    target=target,
                    transfer=transfer,
                )
            else:  # pragma: no cover - closed union today
                raise PolicyError(f"unsupported transfer model: {type(transfer)!r}")

            if result is not None:
                return result
            if note:
                transfer_notes.append(note)

        if policy.fallback is not None:
            return LookupResult(
                value=float(policy.fallback),
                source="fallback",
                target=target,
                notes=tuple(transfer_notes + ["using fallback value"]),
                transfer_depth=0,
            )

        return LookupResult(
            value=None,
            source="missing",
            target=target,
            notes=tuple(transfer_notes),
        )
    finally:
        _ACTIVE_POLICY_TOKENS.reset(stack_token)


def _lookup_value_from_policy_source(
    symbol: str | None,
    *,
    source: ValuePolicy[str] | SupportsValuePolicy,
) -> LookupResult:
    """Resolve a value from either a generic policy or a wrapper policy."""

    if isinstance(source, ValuePolicy):
        return _lookup_value_with_owner(symbol, policy=source, owner=None)
    # Wrapper case: pass the wrapper itself as the owner so cycle detection
    # also covers wrapper identity, not just the derived ValuePolicy.
    policy = source.as_value_policy()
    return _lookup_value_with_owner(symbol, policy=policy, owner=source)


def _get_value_from_policy_source(
    symbol: str | None,
    *,
    source: ValuePolicy[str] | SupportsValuePolicy,
) -> float | None:
    """Return only the scalar selected by a generic or wrapper policy."""

    return _lookup_value_from_policy_source(symbol, source=source).value


def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult:
    """Public entry point for generic element-domain scalar lookup.

    This is the same resolver used internally by the radii convenience layer.
    In the current implementation the runtime supports only element-domain policies.
    """

    return _lookup_value_with_owner(symbol, policy=policy, owner=None)


def get_value(symbol: str | None, *, policy: ValuePolicy[str]) -> float | None:
    """Return only the resolved scalar value for an element-domain policy."""

    return lookup_value(symbol, policy=policy).value
diff --git a/src/atomref/py.typed b/src/atomref/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/atomref/radii.py b/src/atomref/radii.py
new file mode 100644
index 0000000..b33877f
--- /dev/null
+++ b/src/atomref/radii.py
@@ -0,0 +1,363 @@
"""Radii-specific public API built on the generic policy core."""

from __future__ import annotations

from collections.abc import Iterable, Mapping
from dataclasses import dataclass, field
import math
from typing import Literal

from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol
from .errors import PolicyError
from .policy import (
    LookupResult,
    ValuePolicy,
    _fit_transfer_model,
    _get_value_from_policy_source,
    _lookup_value_from_policy_source,
)
from .registry import (
    DatasetInfo,
    DatasetRef,
    ElementScalarSet,
    get_builtin_set,
    get_dataset_info,
    list_dataset_ids,
    list_dataset_infos,
)
from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel

# Public aliases for the radii convenience layer.
RadiiKind = Literal["covalent", "van_der_waals"]
RadiiSet = ElementScalarSet


# Maps the public radii kind names onto registry quantity identifiers.
_KIND_TO_QUANTITY = {
    "covalent": "covalent_radius",
    "van_der_waals": "van_der_waals_radius",
}


@dataclass(frozen=True, slots=True)
class RadiiPolicy:
    """Policy wrapper specialized for radii lookup.

    ``kind`` determines the target quantity, while the remaining fields mirror
    the generic :class:`atomref.policy.ValuePolicy` interface.
    """

    kind: RadiiKind
    # Either a packaged set id (str) or an already-loaded ElementScalarSet.
    base_set: str | RadiiSet
    transfers: tuple[TransferModel, ...] = ()
    overrides: Mapping[str, float] = field(default_factory=dict)
    fallback: float | None = None

    def as_value_policy(self) -> ValuePolicy[str]:
        """Convert the radii policy into the generic scalar-value policy.

        Raises:
            PolicyError: the in-memory base set's quantity does not match
                ``kind``, or an override/fallback is negative or non-finite.
        """

        quantity = _quantity_for_kind(self.kind)
        if isinstance(self.base_set, ElementScalarSet):
            if self.base_set.ref.quantity != quantity:
                msg = (
                    f"base_set quantity {self.base_set.ref.quantity!r} "
                    f"is incompatible with radii kind {self.kind!r}"
                )
                raise PolicyError(msg)
            base = self.base_set
        else:
            base = DatasetRef(quantity, self.base_set)

        # Radii values are validated up front (non-negative, finite); the
        # generic policy itself accepts any finite scalar.
        checked_overrides = {
            key: _coerce_non_negative_radii_value(
                value,
                what=f"radii override value for {key!r}",
            )
            for key, value in self.overrides.items()
        }
        checked_fallback = (
            None
            if self.fallback is None
            else _coerce_non_negative_radii_value(
                self.fallback,
                what="radii fallback",
            )
        )

        return ValuePolicy(
            base=base,
            transfers=self.transfers,
            overrides=checked_overrides,
            fallback=checked_fallback,
        )


@dataclass(frozen=True, slots=True)
class RadiiElementAssessment:
    """Per-element row in a radii policy assessment report."""

    symbol: str
    lookup: LookupResult


@dataclass(frozen=True, slots=True)
class RadiiPolicyAssessment:
    """Summary of how a radii policy behaved over a set of elements."""

    kind: RadiiKind
    policy: RadiiPolicy
    elements: tuple[str, ...]

    # Per-resolution-source tallies over ``elements``.
    n_elements: int
    n_override: int
    n_base: int
    n_transfer_substitution: int
    n_transfer_linear: int
    n_fallback: int
    n_missing: int
    n_placeholders: int

    missing_symbols: tuple[str, ...]
    placeholder_symbols: tuple[str, ...]

    fits: tuple[LinearFit, ...] = ()
    warnings: tuple[str, ...] = ()
    # Populated only when assess_radii_policy(detail=True).
    per_element: tuple[RadiiElementAssessment, ...] = ()


def _coerce_non_negative_radii_value(value: object, *, what: str) -> float:
    """Validate a radii-like policy number.

    The generic :class:`atomref.policy.ValuePolicy` accepts any finite scalar.
+ Radii-specific convenience helpers are stricter and reject negative values. + """ + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + if out < 0: + raise PolicyError(f"{what} must be non-negative") + return out + + +def _quantity_for_kind(kind: RadiiKind) -> str: + """Translate public radii kind names into registry quantity ids.""" + + try: + return _KIND_TO_QUANTITY[kind] + except KeyError as exc: + raise PolicyError(f"unknown radii kind: {kind!r}") from exc + + +def _normalize_radii_symbol(symbol: str | None) -> str | None: + """Normalize symbols accepted by the radii convenience layer.""" + + cand = canonicalize_element_symbol(symbol) + if cand in {"D", "T"}: + cand = "H" + return cand + + +def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: + """Normalize, validate, deduplicate, and sort assessment element labels.""" + + symbols: set[str] = set() + for token in elements: + sym = _normalize_radii_symbol(token) + if sym is None: + raise ValueError("missing element symbol") + if not is_valid_element_symbol(sym): + raise ValueError(f"invalid element symbol: {sym!r}") + symbols.add(sym) + return tuple( + sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0) + ) + + +def list_radii_sets( + kind: RadiiKind, + *, + usage_role: str | None = None, +) -> tuple[str, ...]: + """List packaged radii-set ids for one radii kind.""" + + return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) + + +def list_radii_set_infos( + kind: RadiiKind, + *, + usage_role: str | None = None, +) -> tuple[DatasetInfo, ...]: + """Return packaged metadata objects for radii sets of one kind.""" + + return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role) + + +def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: + """Return metadata for 
one packaged radii set.""" + + return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) + + +def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet: + """Load one packaged radii set as an :class:`ElementScalarSet`.""" + + return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id)) + + +def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: + """Raise when a policy is used with the wrong public radii helper.""" + + if policy.kind != expected: + raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") + + +def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: + """Shared implementation for radii lookup helpers.""" + + return _lookup_value_from_policy_source(symbol, source=policy) + + +def lookup_covalent_radius( + symbol: str | None, + *, + policy: RadiiPolicy | None = None, +) -> LookupResult: + """Resolve a covalent radius together with provenance information.""" + + active = DEFAULT_COVALENT_POLICY if policy is None else policy + _validate_policy_kind(active, expected="covalent") + return _lookup_radius(symbol, policy=active) + + +def get_covalent_radius( + symbol: str | None, + *, + policy: RadiiPolicy | None = None, +) -> float | None: + """Return only the selected covalent-radius value, without provenance.""" + + active = DEFAULT_COVALENT_POLICY if policy is None else policy + _validate_policy_kind(active, expected="covalent") + return _get_value_from_policy_source(symbol, source=active) + + +def lookup_vdw_radius( + symbol: str | None, + *, + policy: RadiiPolicy | None = None, +) -> LookupResult: + """Resolve a van der Waals radius together with provenance information.""" + + active = DEFAULT_VDW_POLICY if policy is None else policy + _validate_policy_kind(active, expected="van_der_waals") + return _lookup_radius(symbol, policy=active) + + +def get_vdw_radius( + symbol: str | None, + *, + policy: RadiiPolicy | None = None, +) -> float | None: + """Return only 
    the selected van der Waals radius value, without provenance."""

    active = DEFAULT_VDW_POLICY if policy is None else policy
    _validate_policy_kind(active, expected="van_der_waals")
    return _get_value_from_policy_source(symbol, source=active)


def assess_radii_policy(
    elements: Iterable[str],
    *,
    policy: RadiiPolicy,
    detail: bool = False,
) -> RadiiPolicyAssessment:
    """Assess how a radii policy resolves values over a set of elements.

    Tallies one counter per resolution source, records which symbols went
    missing or resolved to placeholder values, and pre-computes the linear
    fits the policy's transfers would use (fit failures become warnings).
    With ``detail=True`` the full per-element lookup results are attached.
    """

    elems = _normalize_assessment_elements(elements)
    value_policy = policy.as_value_policy()

    n_override = 0
    n_base = 0
    n_transfer_substitution = 0
    n_transfer_linear = 0
    n_fallback = 0
    n_missing = 0
    n_placeholders = 0

    missing_symbols: list[str] = []
    placeholder_symbols: list[str] = []
    per_element: list[RadiiElementAssessment] = []

    for symbol in elems:
        lookup = _lookup_value_from_policy_source(symbol, source=policy)
        if lookup.source == "override":
            n_override += 1
        elif lookup.source == "base":
            n_base += 1
        elif lookup.source == "transfer_substitution":
            n_transfer_substitution += 1
        elif lookup.source == "transfer_linear":
            n_transfer_linear += 1
        elif lookup.source == "fallback":
            n_fallback += 1
        elif lookup.source == "missing":
            n_missing += 1
            missing_symbols.append(symbol)

        # Placeholder tracking is independent of the resolution source.
        if lookup.is_placeholder:
            n_placeholders += 1
            placeholder_symbols.append(symbol)

        if detail:
            per_element.append(RadiiElementAssessment(symbol=symbol, lookup=lookup))

    fits: list[LinearFit] = []
    warnings: list[str] = []
    for transfer in value_policy.transfers:
        if isinstance(transfer, LinearTransfer):
            try:
                fit = _fit_transfer_model(value_policy.base, transfer)
            # Broad catch is deliberate: any fit failure is reported as a
            # warning rather than aborting the whole assessment.
            except Exception as exc:  # noqa: BLE001
                warnings.append(str(exc))
            else:
                if fit is not None:
                    fits.append(fit)

    return RadiiPolicyAssessment(
        kind=policy.kind,
        policy=policy,
        elements=elems,
        n_elements=len(elems),
        n_override=n_override,
        n_base=n_base,
        n_transfer_substitution=n_transfer_substitution,
        n_transfer_linear=n_transfer_linear,
        n_fallback=n_fallback,
        n_missing=n_missing,
        n_placeholders=n_placeholders,
        missing_symbols=tuple(missing_symbols),
        placeholder_symbols=tuple(placeholder_symbols),
        fits=tuple(fits),
        warnings=tuple(warnings),
        per_element=tuple(per_element),
    )


DEFAULT_COVALENT_POLICY = RadiiPolicy(
    kind="covalent",
    base_set="cordero2008",
    transfers=(
        SubstitutionTransfer(source=DatasetRef("covalent_radius", "csd_legacy_cov")),
    ),
)
"""Default covalent-radii policy used by the convenience helpers."""

DEFAULT_VDW_POLICY = RadiiPolicy(
    kind="van_der_waals",
    base_set="alvarez2013",
    transfers=(LinearTransfer(predictors=(DatasetRef("atomic_radius", "rahm2016"),)),),
)
"""Default vdW-radii policy used by the convenience helpers."""
diff --git a/src/atomref/registry.py b/src/atomref/registry.py
new file mode 100644
index 0000000..b17b941
--- /dev/null
+++ b/src/atomref/registry.py
@@ -0,0 +1,609 @@
"""Dataset registry and packaged element-scalar set loading."""

from __future__ import annotations

from collections.abc import Iterable, Mapping
from dataclasses import dataclass
import csv
from functools import lru_cache
from importlib import resources
import json
import math
from types import MappingProxyType
import unicodedata

from .elements import canonicalize_element_symbol, get_element, iter_elements
from .errors import DatasetError

QuantityId = str
DomainId = str


@dataclass(frozen=True, slots=True)
class DatasetRef:
    """Stable reference to a packaged dataset.

    The ``quantity`` identifies the operational property family, while
    ``set_id`` names a specific curated dataset within that family.
    """

    quantity: QuantityId
    set_id: str


@dataclass(frozen=True, slots=True)
class Reference:
    """Bibliographic record attached to packaged dataset metadata."""

    authors: str | None = None
    year: int | None = None
    title: str | None = None
    venue: str | None = None
    doi: str | None = None
    url: str | None = None
    publisher: str | None = None
    note: str | None = None


@dataclass(frozen=True, slots=True)
class CoverageInfo:
    """Coverage summary for an element-indexed scalar dataset."""

    n_values: int
    z_min: int | None = None
    z_max: int | None = None
    has_placeholders: bool = False
    covered_z: tuple[int, ...] = ()
    missing_z: tuple[int, ...] = ()


@dataclass(frozen=True, slots=True)
class QuantityInfo:
    """Metadata shared by all datasets that belong to one quantity."""

    quantity: QuantityId
    domain: DomainId
    units: str | None = None
    description: str | None = None


@dataclass(frozen=True, slots=True)
class DatasetInfo:
    """Curated metadata for one packaged dataset.

    This object keeps operational classification such as ``ref.quantity`` and
    ``usage_role`` separate from scientific classification such as
    ``semantic_class`` and ``phase_context``.
    """

    ref: DatasetRef
    domain: DomainId
    units: str | None
    name: str
    description: str | None = None
    usage_role: str | None = None
    semantic_class: str | None = None
    origin_class: str | None = None
    phase_context: str | None = None
    method_summary: str | None = None
    placeholder_value: float | None = None
    extraction_source: str | None = None
    aliases: tuple[str, ...] = ()
    references: tuple[Reference, ...] = ()
    notes: tuple[str, ...] = ()
    storage: Mapping[str, object] | None = None
    coverage: CoverageInfo | None = None


@dataclass(frozen=True, slots=True)
class ElementScalarSet:
    """Element-indexed scalar dataset stored densely by atomic number."""

    ref: DatasetRef
    info: DatasetInfo
    # Dense by atomic number; index 0 is unused, absent elements are None.
    values_by_z: tuple[float | None, ...]

    @classmethod
    def from_mapping(
        cls,
        *,
        ref: DatasetRef,
        values: Mapping[str, float | None],
        name: str,
        units: str | None,
        description: str | None = None,
        usage_role: str = "user",
        semantic_class: str = "user",
        origin_class: str = "user",
        phase_context: str | None = None,
        references: Iterable[Reference] = (),
        notes: Iterable[str] = (),
        placeholder_value: float | None = None,
    ) -> "ElementScalarSet":
        """Build a custom element-domain dataset from a symbol-keyed mapping.

        Keys are normalized (D/T fold onto H); two distinct keys that
        normalize to the same symbol are rejected rather than silently
        overwriting each other.

        Raises:
            DatasetError: invalid symbol, colliding keys after normalization,
                or a non-finite value.
        """

        n_z = max(e.z for e in iter_elements())
        values_by_z: list[float | None] = [None] * (n_z + 1)
        # Maps normalized symbol -> the original key that produced it, so a
        # second colliding spelling can be reported with both forms.
        seen_keys: dict[str, str] = {}

        placeholder_f = (
            None
            if placeholder_value is None
            else _coerce_finite_float(
                placeholder_value,
                what=f"placeholder value for custom dataset {ref.set_id!r}",
            )
        )

        for key, value in values.items():
            sym = _normalize_element_domain_symbol(key)
            elem = get_element(sym)
            if elem is None:
                raise DatasetError(f"invalid element symbol in custom set: {key!r}")
            previous = seen_keys.get(sym)
            if previous is not None and previous != key:
                raise DatasetError(
                    "custom-set keys "
                    f"{previous!r} and {key!r} both normalize to {sym!r}"
                )
            seen_keys[sym] = key
            values_by_z[elem.z] = (
                None
                if value is None
                else _coerce_finite_float(
                    value,
                    what=f"value for element {key!r} in custom dataset {ref.set_id!r}",
                )
            )

        covered_z = tuple(
            z for z, value in enumerate(values_by_z) if z > 0 and value is not None
        )
        has_placeholders = False
        if placeholder_f is not None:
            # Tolerance comparison because the placeholder may round-trip
            # through float conversion.
            has_placeholders = any(
                value is not None and abs(value - placeholder_f) < 1e-12
                for value in values_by_z[1:]
            )

        info = DatasetInfo(
            ref=ref,
            domain="element",
            units=units,
            name=name,
            description=description,
            usage_role=usage_role,
            semantic_class=semantic_class,
            origin_class=origin_class,
            phase_context=phase_context,
            placeholder_value=placeholder_f,
            aliases=(),
            references=tuple(references),
            notes=tuple(notes),
            storage=None,
            coverage=CoverageInfo(
                n_values=len(covered_z),
                z_min=min(covered_z) if covered_z else None,
                z_max=max(covered_z) if covered_z else None,
                has_placeholders=has_placeholders,
                covered_z=covered_z,
                missing_z=tuple(z for z in range(1, n_z + 1) if values_by_z[z] is None),
            ),
        )
        return cls(ref=ref, info=info, values_by_z=tuple(values_by_z))

    def get(self, symbol: str | None) -> float | None:
        """Return the scalar value for ``symbol`` or ``None`` if absent."""

        sym = _normalize_element_domain_symbol(symbol)
        elem = get_element(sym)
        if elem is None:
            return None
        return self.values_by_z[elem.z]


DatasetLike = DatasetRef | ElementScalarSet


# Folds the common Unicode dash/hyphen/minus variants onto ASCII "-" for
# alias comparison.
_DASH_TRANSLATION = str.maketrans(
    {
        "‐": "-",
        "‑": "-",
        "‒": "-",
        "–": "-",
        "—": "-",
        "―": "-",
        "−": "-",
    }
)


def _normalize_element_domain_symbol(symbol: str | None) -> str | None:
    """Normalize element-domain symbols and fold D/T onto hydrogen."""

    cand = canonicalize_element_symbol(symbol)
    if cand in {"D", "T"}:
        return "H"
    return cand


@lru_cache(maxsize=1)
def _load_registry_json() -> dict[str, object]:
    """Load the packaged registry JSON as a validated top-level mapping.

    Raises:
        DatasetError: the top-level JSON value is not an object.
    """

    path = resources.files("atomref.data").joinpath("registry.json")
    with path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)
    if not isinstance(data, dict):
        raise DatasetError("invalid registry.json: expected JSON object")
    return data


def _freeze_json_like(value: object) -> object:
    """Recursively freeze JSON-like metadata structures.

    Registry metadata is cached globally. Returning raw dicts or lists from that
    cache would let callers mutate shared package state through the metadata
    objects returned by :func:`get_dataset_info`.
    """

    if isinstance(value, dict):
        frozen = {str(key): _freeze_json_like(item) for key, item in value.items()}
        return MappingProxyType(frozen)
    if isinstance(value, list):
        return tuple(_freeze_json_like(item) for item in value)
    return value


def _coerce_finite_float(value: object, *, what: str) -> float:
    """Return ``value`` as a finite float or raise :class:`DatasetError`."""

    try:
        out = float(value)
    except (TypeError, ValueError) as exc:
        raise DatasetError(f"{what} must be a finite float") from exc
    if not math.isfinite(out):
        raise DatasetError(f"{what} must be a finite float")
    return out


def _get_quantities_mapping() -> Mapping[str, object]:
    """Return the raw ``quantities`` mapping from ``registry.json``."""

    quantities = _load_registry_json().get("quantities")
    if not isinstance(quantities, dict):
        raise DatasetError("invalid registry.json: missing quantities mapping")
    return quantities


def _get_datasets_mapping() -> Mapping[str, object]:
    """Return the raw ``datasets`` mapping from ``registry.json``."""

    datasets = _load_registry_json().get("datasets")
    if not isinstance(datasets, dict):
        raise DatasetError("invalid registry.json: missing datasets mapping")
    return datasets


def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]:
    """Return the dataset table for one quantity or raise on unknown input."""

    datasets = _get_datasets_mapping().get(quantity)
    if not isinstance(datasets, dict):
        raise DatasetError(f"unknown quantity: {quantity!r}")
    return datasets


def list_quantities() -> tuple[str, ...]:
    """List packaged quantity identifiers in registry order."""

    return tuple(_get_quantities_mapping().keys())


def get_quantity_info(quantity: QuantityId) -> QuantityInfo:
    """Return quantity-level metadata for a packaged quantity.

    Raises:
        DatasetError: unknown quantity or quantity entry without a domain.
    """

    raw = _get_quantities_mapping().get(quantity)
    if not isinstance(raw, dict):
        raise DatasetError(f"unknown quantity: {quantity!r}")
    domain = raw.get("domain") if isinstance(raw.get("domain"), str) else None
    if domain is None:
        raise DatasetError(f"missing domain for quantity: {quantity!r}")
    units = raw.get("units") if isinstance(raw.get("units"), str) else None
    description = (
        raw.get("description") if isinstance(raw.get("description"), str) else None
    )
    return QuantityInfo(
        quantity=quantity,
        domain=domain,
        units=units,
        description=description,
    )


def _canonicalize_alias_token(value: str) -> str:
    """Normalize a dataset id or alias for case-insensitive comparison.

    NFKC-normalizes, folds Unicode dash variants to "-", lowercases, and
    collapses internal whitespace.
    """

    normalized = unicodedata.normalize("NFKC", value)
    normalized = normalized.translate(_DASH_TRANSLATION)
    return " ".join(normalized.strip().lower().split())


def _resolve_set_id(quantity: QuantityId, set_id: str) -> str:
    """Resolve a dataset id or alias to its canonical packaged set id.

    Exact ids win; otherwise ids and declared aliases are compared in their
    canonicalized (case/dash/whitespace-insensitive) form.

    Raises:
        DatasetError: no id or alias matches.
    """

    by_quantity = _datasets_for_quantity(quantity)
    if set_id in by_quantity:
        return set_id

    wanted = _canonicalize_alias_token(set_id)
    for actual_id, raw_entry in by_quantity.items():
        if _canonicalize_alias_token(actual_id) == wanted:
            return actual_id
        if isinstance(raw_entry, dict):
            aliases = raw_entry.get("aliases", ())
            if isinstance(aliases, list):
                for alias in aliases:
                    if (
                        isinstance(alias, str)
                        and _canonicalize_alias_token(alias) == wanted
                    ):
                        return actual_id
    raise DatasetError(f"unknown dataset id for {quantity!r}: {set_id!r}")


def list_dataset_ids(
    quantity: QuantityId, *, usage_role: str | None = None
) -> tuple[str, ...]:
    """List packaged dataset identifiers for a quantity.

    When ``usage_role`` is provided, only datasets with a matching normalized
    role such as ``"target"`` or ``"support"`` are returned.
+ """ + + dataset_ids = tuple(_datasets_for_quantity(quantity).keys()) + if usage_role is None: + return dataset_ids + + filtered: list[str] = [] + wanted = usage_role.strip().lower() + for set_id in dataset_ids: + info = get_dataset_info(DatasetRef(quantity, set_id)) + role = (info.usage_role or "").strip().lower() + if role == wanted: + filtered.append(set_id) + return tuple(filtered) + + +def list_dataset_infos( + quantity: QuantityId, *, usage_role: str | None = None +) -> tuple[DatasetInfo, ...]: + """Return packaged dataset metadata objects for a quantity.""" + + return tuple( + get_dataset_info(DatasetRef(quantity, set_id)) + for set_id in list_dataset_ids(quantity, usage_role=usage_role) + ) + + +def _coerce_reference(obj: object) -> Reference: + """Coerce a raw registry reference entry into :class:`Reference`.""" + + if not isinstance(obj, dict): + raise DatasetError("invalid reference entry in registry.json") + return Reference( + authors=obj.get("authors") if isinstance(obj.get("authors"), str) else None, + year=obj.get("year") if isinstance(obj.get("year"), int) else None, + title=obj.get("title") if isinstance(obj.get("title"), str) else None, + venue=obj.get("venue") if isinstance(obj.get("venue"), str) else None, + doi=obj.get("doi") if isinstance(obj.get("doi"), str) else None, + url=obj.get("url") if isinstance(obj.get("url"), str) else None, + publisher=( + obj.get("publisher") if isinstance(obj.get("publisher"), str) else None + ), + note=obj.get("note") if isinstance(obj.get("note"), str) else None, + ) + + +def _coerce_coverage(obj: object) -> CoverageInfo | None: + """Coerce raw coverage metadata into :class:`CoverageInfo`.""" + + if not isinstance(obj, dict): + return None + covered = obj.get("covered_z") + missing = obj.get("missing_z") + covered_z = tuple(int(z) for z in covered) if isinstance(covered, list) else () + missing_z = tuple(int(z) for z in missing) if isinstance(missing, list) else () + return CoverageInfo( + 
n_values=int(obj["n_values"]), + z_min=int(obj["z_min"]) if isinstance(obj.get("z_min"), int) else None, + z_max=int(obj["z_max"]) if isinstance(obj.get("z_max"), int) else None, + has_placeholders=bool(obj.get("has_placeholders", False)), + covered_z=covered_z, + missing_z=missing_z, + ) + + +def get_dataset_info(ref: DatasetRef) -> DatasetInfo: + """Return curated metadata for a packaged dataset reference.""" + + actual_set_id = _resolve_set_id(ref.quantity, ref.set_id) + actual_ref = DatasetRef(quantity=ref.quantity, set_id=actual_set_id) + + quantities = _get_quantities_mapping() + quantity_info = quantities.get(actual_ref.quantity) + if not isinstance(quantity_info, dict): + raise DatasetError(f"unknown quantity: {actual_ref.quantity!r}") + + units = ( + quantity_info.get("units") + if isinstance(quantity_info.get("units"), str) + else None + ) + domain = ( + quantity_info.get("domain") + if isinstance(quantity_info.get("domain"), str) + else None + ) + if domain is None: + raise DatasetError(f"missing domain for quantity: {actual_ref.quantity!r}") + + raw_entry = _datasets_for_quantity(actual_ref.quantity).get(actual_ref.set_id) + if not isinstance(raw_entry, dict): + raise DatasetError(f"unknown dataset: {actual_ref}") + + refs_raw = raw_entry.get("references", []) + references = ( + tuple(_coerce_reference(item) for item in refs_raw) + if isinstance(refs_raw, list) + else () + ) + aliases_raw = raw_entry.get("aliases", []) + aliases = ( + tuple(item for item in aliases_raw if isinstance(item, str)) + if isinstance(aliases_raw, list) + else () + ) + notes_raw = raw_entry.get("notes", []) + notes = ( + tuple(item for item in notes_raw if isinstance(item, str)) + if isinstance(notes_raw, list) + else () + ) + storage = ( + _freeze_json_like(raw_entry.get("storage")) + if isinstance(raw_entry.get("storage"), dict) + else None + ) + + return DatasetInfo( + ref=actual_ref, + domain=domain, + units=units, + name=( + raw_entry.get("name") + if 
isinstance(raw_entry.get("name"), str) + else actual_ref.set_id + ), + description=( + raw_entry.get("description") + if isinstance(raw_entry.get("description"), str) + else None + ), + usage_role=( + raw_entry.get("usage_role") + if isinstance(raw_entry.get("usage_role"), str) + else None + ), + semantic_class=( + raw_entry.get("semantic_class") + if isinstance(raw_entry.get("semantic_class"), str) + else None + ), + origin_class=( + raw_entry.get("origin_class") + if isinstance(raw_entry.get("origin_class"), str) + else None + ), + phase_context=( + raw_entry.get("phase_context") + if isinstance(raw_entry.get("phase_context"), str) + else None + ), + method_summary=( + raw_entry.get("method_summary") + if isinstance(raw_entry.get("method_summary"), str) + else None + ), + placeholder_value=( + _coerce_finite_float( + raw_entry["placeholder_value"], + what=f"placeholder value for packaged dataset {actual_ref!r}", + ) + if raw_entry.get("placeholder_value") is not None + else None + ), + extraction_source=( + raw_entry.get("extraction_source") + if isinstance(raw_entry.get("extraction_source"), str) + else None + ), + aliases=aliases, + references=references, + notes=notes, + storage=storage if isinstance(storage, Mapping) else None, + coverage=_coerce_coverage(raw_entry.get("coverage")), + ) + + +@lru_cache(maxsize=None) +def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: + """Load all value columns from one packaged dense-by-Z CSV table.""" + + path = resources.files("atomref.data").joinpath(filename) + with path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle) + if reader.fieldnames is None or "z" not in reader.fieldnames: + raise DatasetError(f"invalid CSV file: {filename!r}") + columns = [name for name in reader.fieldnames if name != "z"] + values: dict[str, list[float | None]] = {name: [None] * 119 for name in columns} + for row in reader: + z_text = row.get("z") + if z_text is None: + continue + z 
= int(z_text) + for name in columns: + raw = row.get(name) + if raw is None: + values[name][z] = None + continue + raw = raw.strip() + values[name][z] = ( + _coerce_finite_float( + raw, + what=f"value in {filename!r} column {name!r} for Z={z}", + ) + if raw + else None + ) + return {name: tuple(vals) for name, vals in values.items()} + + +@lru_cache(maxsize=None) +def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: + """Load a packaged dataset as an :class:`ElementScalarSet`.""" + + info = get_dataset_info(ref) + if info.domain != "element": + raise DatasetError( + f"only element-domain datasets are currently supported: {info.ref!r}" + ) + if not isinstance(info.storage, Mapping): + raise DatasetError(f"missing storage metadata for dataset: {info.ref!r}") + + filename = info.storage.get("filename") + column = info.storage.get("column") + if not isinstance(filename, str) or not isinstance(column, str): + raise DatasetError(f"invalid storage metadata for dataset: {info.ref!r}") + + table = _load_csv_columns(filename) + if column not in table: + raise DatasetError(f"column {column!r} not found in {filename!r}") + + return ElementScalarSet(ref=info.ref, info=info, values_by_z=table[column]) + + +def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: + """Resolve either a packaged reference or a custom set to a loaded set.""" + + if isinstance(dataset, ElementScalarSet): + return dataset + return get_builtin_set(dataset) + + +def _is_placeholder_value(info: DatasetInfo, value: float) -> bool: + """Return ``True`` when ``value`` equals the dataset's placeholder value.""" + + if info.placeholder_value is None: + return False + return abs(value - info.placeholder_value) < 1e-12 diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py new file mode 100644 index 0000000..9adb0ce --- /dev/null +++ b/src/atomref/transfer.py @@ -0,0 +1,168 @@ +"""Transfer-model configuration types for policy-based lookup.""" + +from __future__ import annotations + +from 
dataclasses import dataclass +from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable + +from .errors import PolicyError +from .registry import DatasetLike + +if TYPE_CHECKING: # pragma: no cover - typing only + from .policy import ValuePolicy + + +TransferValueSource = Literal[ + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", +] +"""Source labels that may be admitted into nested linear-transfer workflows.""" + +_ALLOWED_TRANSFER_VALUE_SOURCES = frozenset( + { + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", + } +) + +_DEFAULT_LINEAR_FIT_SOURCES: tuple[TransferValueSource, ...] = ( + "base", + "override", +) +_DEFAULT_LINEAR_PREDICTION_SOURCES: tuple[TransferValueSource, ...] = ( + "base", + "override", + "transfer_substitution", + "transfer_linear", +) + + +@runtime_checkable +class SupportsValuePolicy(Protocol): + """Protocol for wrapper objects that can expose a generic value policy.""" + + def as_value_policy(self) -> "ValuePolicy[str]": + """Return the generic element-domain value policy.""" + + +@dataclass(frozen=True, slots=True) +class LinearFit: + """Summary statistics for a fitted linear transfer model. + + Parameters are stored in a compact, serializable form so they can be + attached to :class:`atomref.policy.LookupResult` objects and reused in + reporting code. + """ + + coefficients: tuple[float, ...] + intercept: float + n_points: int + r2: float + rmse: float + + +@dataclass(frozen=True, slots=True) +class SubstitutionTransfer: + """Use another dataset or policy directly when the base dataset is missing. + + The selected value is copied from the source rather than inferred. + """ + + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str] + + +@dataclass(frozen=True, slots=True) +class LinearTransfer: + """Infer missing target values from one or more predictor datasets or policies. 
+ + In the current implementation the public API stores predictors as a tuple + for forward compatibility, but the runtime intentionally accepts exactly one + predictor source. + + For nested policy predictors, two safeguards apply: + + - ``fit_sources`` / ``fit_max_depth`` control which predictor values may be + used when fitting the linear model itself; + - ``prediction_sources`` / ``prediction_max_depth`` control which nested + predictor values may be used for the final requested element. + + The defaults are intentionally conservative for fitting and permissive only + enough to allow one additional completion step at prediction time. + """ + + predictors: tuple[DatasetLike | SupportsValuePolicy | ValuePolicy[str], ...] + min_points: int = 2 + exclude_placeholders: bool = True + fit_sources: tuple[TransferValueSource, ...] = _DEFAULT_LINEAR_FIT_SOURCES + prediction_sources: tuple[TransferValueSource, ...] = ( + _DEFAULT_LINEAR_PREDICTION_SOURCES + ) + fit_max_depth: int = 0 + prediction_max_depth: int = 1 + + def __post_init__(self) -> None: + """Validate obvious configuration errors eagerly.""" + + if not self.predictors: + raise PolicyError("LinearTransfer requires at least one predictor") + if self.min_points < 2: + raise PolicyError("LinearTransfer min_points must be at least 2") + + object.__setattr__( + self, + "fit_sources", + _normalize_transfer_value_sources( + self.fit_sources, + field_name="fit_sources", + ), + ) + object.__setattr__( + self, + "prediction_sources", + _normalize_transfer_value_sources( + self.prediction_sources, + field_name="prediction_sources", + ), + ) + + if self.fit_max_depth < 0: + raise PolicyError("LinearTransfer fit_max_depth must be non-negative") + if self.prediction_max_depth < 0: + raise PolicyError( + "LinearTransfer prediction_max_depth must be non-negative" + ) + + +TransferModel = SubstitutionTransfer | LinearTransfer +"""Closed union of transfer models supported by the core resolver.""" + + +def 
_normalize_transfer_value_sources( + sources: tuple[str, ...], + *, + field_name: str, +) -> tuple[TransferValueSource, ...]: + """Validate and deduplicate source-label controls for linear transfers.""" + + if not sources: + raise PolicyError(f"LinearTransfer {field_name} may not be empty") + + normalized: list[TransferValueSource] = [] + seen: set[str] = set() + for source in sources: + if source not in _ALLOWED_TRANSFER_VALUE_SOURCES: + allowed = ", ".join(sorted(_ALLOWED_TRANSFER_VALUE_SOURCES)) + raise PolicyError( + f"LinearTransfer {field_name} contains unsupported source " + f"{source!r}; allowed values are: {allowed}" + ) + if source not in seen: + normalized.append(source) + seen.add(source) + return tuple(normalized) diff --git a/src/atomref/xh.py b/src/atomref/xh.py new file mode 100644 index 0000000..5018d99 --- /dev/null +++ b/src/atomref/xh.py @@ -0,0 +1,175 @@ +"""X-H bond-length helpers built on the generic policy core.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +import math + +from .elements import canonicalize_element_symbol, is_valid_element_symbol +from .errors import PolicyError +from .policy import ( + LookupResult, + ValuePolicy, + _get_value_from_policy_source, + _lookup_value_from_policy_source, +) +from .registry import ( + DatasetInfo, + DatasetRef, + ElementScalarSet, + get_builtin_set, + get_dataset_info, + list_dataset_ids, + list_dataset_infos, +) +from .transfer import LinearTransfer, TransferModel + +XHSet = ElementScalarSet + +_QUANTITY = "xh_bond_length" + + +@dataclass(frozen=True, slots=True) +class XHPolicy: + """Policy wrapper specialized for parent-element X-H bond lengths. + + The quantity key is fixed to ``"xh_bond_length"`` and uses the parent + element ``X`` as the lookup key. ``H`` itself is not considered a valid + parent element for this quantity. + """ + + base_set: str | XHSet + transfers: tuple[TransferModel, ...] 
= () + overrides: Mapping[str, float] = field(default_factory=dict) + fallback: float | None = None + + def as_value_policy(self) -> ValuePolicy[str]: + """Convert the X-H policy into the generic scalar-value policy.""" + + if isinstance(self.base_set, ElementScalarSet): + if self.base_set.ref.quantity != _QUANTITY: + raise PolicyError( + "base_set quantity " + f"{self.base_set.ref.quantity!r} is incompatible " + "with X-H lookup" + ) + base = self.base_set + else: + base = DatasetRef(_QUANTITY, self.base_set) + + checked_overrides: dict[str, float] = {} + for key, value in self.overrides.items(): + sym = _normalize_xh_symbol(key) + if sym is None or not is_valid_element_symbol(sym): + raise PolicyError(f"invalid X-H parent element symbol: {key!r}") + if sym == "H": + raise PolicyError("H is not a valid parent element for xh_bond_length") + checked_overrides[key] = _coerce_non_negative_xh_value( + value, + what=f"X-H override value for {key!r}", + ) + + checked_fallback = ( + None + if self.fallback is None + else _coerce_non_negative_xh_value(self.fallback, what="X-H fallback") + ) + + return ValuePolicy( + base=base, + transfers=self.transfers, + overrides=checked_overrides, + fallback=checked_fallback, + blocked=("H",), + ) + + +def _coerce_non_negative_xh_value(value: object, *, what: str) -> float: + """Validate an X-H-like policy number.""" + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + if out < 0: + raise PolicyError(f"{what} must be non-negative") + return out + + +def _normalize_xh_symbol(symbol: str | None) -> str | None: + """Normalize symbols accepted by the X-H convenience layer.""" + + cand = canonicalize_element_symbol(symbol) + if cand in {"D", "T"}: + cand = "H" + return cand + + +def list_xh_sets(*, usage_role: str | None = None) -> tuple[str, ...]: + """List packaged X-H set 
ids.""" + + return list_dataset_ids(_QUANTITY, usage_role=usage_role) + + +def list_xh_set_infos(*, usage_role: str | None = None) -> tuple[DatasetInfo, ...]: + """Return packaged metadata objects for X-H sets.""" + + return list_dataset_infos(_QUANTITY, usage_role=usage_role) + + +def get_xh_set_info(set_id: str) -> DatasetInfo: + """Return metadata for one packaged X-H set.""" + + return get_dataset_info(DatasetRef(_QUANTITY, set_id)) + + +def get_xh_set(set_id: str) -> XHSet: + """Load one packaged X-H set as an :class:`ElementScalarSet`.""" + + return get_builtin_set(DatasetRef(_QUANTITY, set_id)) + + +def lookup_xh_bond_length( + symbol: str | None, + *, + policy: XHPolicy | None = None, +) -> LookupResult: + """Resolve a parent-element X-H bond length with provenance.""" + + active = DEFAULT_XH_POLICY if policy is None else policy + lookup = _lookup_value_from_policy_source(symbol, source=active) + if lookup.value is None and _normalize_xh_symbol(symbol) == "H": + return LookupResult( + value=None, + source="missing", + target=lookup.target, + notes=("H is not a valid parent element for xh_bond_length",), + ) + return lookup + + +def get_xh_bond_length( + symbol: str | None, + *, + policy: XHPolicy | None = None, +) -> float | None: + """Return only the selected X-H bond-length value, without provenance.""" + + active = DEFAULT_XH_POLICY if policy is None else policy + return _get_value_from_policy_source(symbol, source=active) + + +DEFAULT_XH_POLICY = XHPolicy( + base_set="csd_legacy_xh_cno", + transfers=( + LinearTransfer( + predictors=(DatasetRef("covalent_radius", "cordero2008"),), + min_points=3, + exclude_placeholders=True, + ), + ), +) +"""Default X-H policy used by the convenience helpers.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..08328a4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +REPO_ROOT = 
Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / 'src' +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) diff --git a/tests/elements/test_elements.py b/tests/elements/test_elements.py new file mode 100644 index 0000000..161b420 --- /dev/null +++ b/tests/elements/test_elements.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import atomref as ar + + +def test_element_lookup_and_validation() -> None: + assert ar.is_valid_element_symbol('C') + assert ar.is_valid_element_symbol('cl') is False + assert ar.get_element('cl') is not None + assert ar.get_element('C').z == 6 + assert ar.get_element('Xx') is None + + +def test_iter_elements_is_sorted_and_complete() -> None: + elems = ar.iter_elements() + assert elems[0].symbol == 'H' + assert elems[-1].symbol == 'Og' + assert elems[0].z == 1 + assert elems[-1].z == 118 diff --git a/tests/meta/test_imports.py b/tests/meta/test_imports.py new file mode 100644 index 0000000..66210e7 --- /dev/null +++ b/tests/meta/test_imports.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import importlib + + +MODULES = [ + 'atomref', + 'atomref.elements', + 'atomref.registry', + 'atomref.transfer', + 'atomref.policy', + 'atomref.radii', + 'atomref.xh', +] + + +def test_imports() -> None: + for name in MODULES: + importlib.import_module(name) diff --git a/tests/meta/test_notebooks.py b/tests/meta/test_notebooks.py new file mode 100644 index 0000000..d420476 --- /dev/null +++ b/tests/meta/test_notebooks.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from pathlib import Path +import subprocess +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +CHECK_SCRIPT = REPO_ROOT / "tools" / "check_notebooks.py" +EXPORT_SCRIPT = REPO_ROOT / "tools" / "export_notebooks.py" +NOTEBOOKS = REPO_ROOT / "notebooks" +EXPORTED_NOTEBOOKS = REPO_ROOT / "docs" / "notebooks" + + +def test_notebook_files_exist() -> None: + expected = { + "01-quickstart.ipynb", + "02-policies-and-assessment.ipynb", + 
"03-custom-sets-and-discovery.ipynb", + } + actual = {path.name for path in NOTEBOOKS.glob("*.ipynb")} + assert expected.issubset(actual) + + +def test_notebooks_validate_and_execute() -> None: + subprocess.run([sys.executable, str(CHECK_SCRIPT)], cwd=REPO_ROOT, check=True) + + +def test_exported_notebook_pages_are_in_sync() -> None: + expected = { + "01-quickstart.md", + "02-policies-and-assessment.md", + "03-custom-sets-and-discovery.md", + } + actual = {path.name for path in EXPORTED_NOTEBOOKS.glob("*.md")} + assert expected.issubset(actual) + subprocess.run( + [sys.executable, str(EXPORT_SCRIPT), "--check"], + cwd=REPO_ROOT, + check=True, + ) diff --git a/tests/meta/test_package_data.py b/tests/meta/test_package_data.py new file mode 100644 index 0000000..a9a7e61 --- /dev/null +++ b/tests/meta/test_package_data.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from importlib import resources +import json + + +def test_packaged_data_files_are_available() -> None: + data_root = resources.files('atomref.data') + for name in ( + 'periodic_table.csv', + 'covalent.csv', + 'van_der_waals.csv', + 'registry.json', + 'xh_bond_length.csv', + ): + assert data_root.joinpath(name).is_file(), name + + +def test_packaged_registry_keeps_atomic_support_classification() -> None: + data_root = resources.files('atomref.data') + raw = json.loads(data_root.joinpath('registry.json').read_text(encoding='utf-8')) + + assert 'atomic_radius' in raw['datasets'] + assert 'xh_bond_length' in raw['datasets'] + rahm = raw['datasets']['atomic_radius']['rahm2016'] + assert rahm['usage_role'] == 'support' + assert rahm['semantic_class'] == 'atomic_isodensity' + assert rahm['phase_context'] == 'isolated_atom' diff --git a/tests/meta/test_public_api.py b/tests/meta/test_public_api.py new file mode 100644 index 0000000..f3583a1 --- /dev/null +++ b/tests/meta/test_public_api.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import atomref as ar + + +REQUIRED_PUBLIC_NAMES = { + 
'Element', + 'DatasetRef', + 'DatasetInfo', + 'ElementScalarSet', + 'QuantityInfo', + 'LookupResult', + 'RadiiPolicy', + 'DEFAULT_COVALENT_POLICY', + 'DEFAULT_VDW_POLICY', + 'LinearTransfer', + 'SubstitutionTransfer', + 'get_builtin_set', + 'get_radii_set', + 'get_covalent_radius', + 'lookup_covalent_radius', + 'get_vdw_radius', + 'lookup_vdw_radius', + 'XHPolicy', + 'DEFAULT_XH_POLICY', + 'get_xh_set', + 'get_xh_bond_length', + 'lookup_xh_bond_length', + 'list_xh_sets', + 'list_xh_set_infos', + 'list_quantities', + 'list_dataset_ids', + 'list_dataset_infos', + 'list_radii_sets', + 'list_radii_set_infos', +} + + +def test___all___exports_existing_objects() -> None: + for name in ar.__all__: + assert hasattr(ar, name), name + + +def test_core_public_api_names_are_exported() -> None: + assert REQUIRED_PUBLIC_NAMES.issubset(set(ar.__all__)) diff --git a/tests/meta/test_readme_sync.py b/tests/meta/test_readme_sync.py new file mode 100644 index 0000000..fe56ac2 --- /dev/null +++ b/tests/meta/test_readme_sync.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path +import subprocess +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +README = REPO_ROOT / 'README.md' +SCRIPT = REPO_ROOT / 'tools' / 'gen_readme.py' + + +def test_readme_is_in_sync(tmp_path: Path) -> None: + generated = tmp_path / 'README.generated.md' + subprocess.run( + [sys.executable, str(SCRIPT), '--output', str(generated)], + cwd=REPO_ROOT, + check=True, + ) + assert generated.read_text(encoding='utf-8') == README.read_text(encoding='utf-8') diff --git a/tests/meta/test_registry_integrity.py b/tests/meta/test_registry_integrity.py new file mode 100644 index 0000000..a32b44c --- /dev/null +++ b/tests/meta/test_registry_integrity.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from collections import defaultdict +from dataclasses import asdict + +import atomref as ar +from atomref.registry import _canonicalize_alias_token, get_builtin_set + 
+_ALLOWED_USAGE_ROLES = {"target", "support"} + + +def test_dataset_aliases_are_unique_within_each_quantity() -> None: + for quantity in ar.list_quantities(): + seen: dict[str, str] = {} + for set_id in ar.list_dataset_ids(quantity): + info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)) + for token in (set_id, *info.aliases): + key = _canonicalize_alias_token(token) + previous = seen.get(key) + assert previous in (None, set_id) + seen[key] = set_id + + +def test_every_built_in_dataset_loads_and_matches_coverage_metadata() -> None: + for quantity in ar.list_quantities(): + quantity_info = ar.get_quantity_info(quantity) + for set_id in ar.list_dataset_ids(quantity): + ref = ar.DatasetRef(quantity, set_id) + info = ar.get_dataset_info(ref) + dataset = get_builtin_set(ref) + + assert info.domain == quantity_info.domain + assert info.units == quantity_info.units + assert info.usage_role in _ALLOWED_USAGE_ROLES + assert info.references + assert info.coverage is not None + + max_z = ( + info.coverage.z_max + if info.coverage.z_max is not None + else len(dataset.values_by_z) - 1 + ) + covered_z = tuple( + z + for z, value in enumerate(dataset.values_by_z) + if z > 0 and value is not None and z <= max_z + ) + covered_set = set(covered_z) + missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set) + has_placeholders = info.placeholder_value is not None and any( + value is not None and abs(value - info.placeholder_value) < 1e-12 + for value in dataset.values_by_z[1 : max_z + 1] + ) + + coverage = asdict(info.coverage) + assert coverage["n_values"] == len(covered_z) + assert coverage["z_min"] == (min(covered_z) if covered_z else None) + assert coverage["z_max"] == (max(covered_z) if covered_z else None) + assert coverage["has_placeholders"] is has_placeholders + if coverage["covered_z"]: + assert tuple(coverage["covered_z"]) == covered_z + if coverage["missing_z"]: + assert tuple(coverage["missing_z"]) == missing_z + + +def 
test_non_atomic_quantities_have_at_least_one_target_dataset() -> None: + by_role: dict[str, list[str]] = defaultdict(list) + for quantity in ar.list_quantities(): + for set_id in ar.list_dataset_ids(quantity): + role = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)).usage_role + assert role is not None + by_role[role].append(quantity) + + for quantity in ar.list_quantities(): + if quantity != "atomic_radius": + assert quantity in by_role["target"] diff --git a/tests/meta/test_release_tools.py b/tests/meta/test_release_tools.py new file mode 100644 index 0000000..7cbff90 --- /dev/null +++ b/tests/meta/test_release_tools.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +# Keeping this as a subprocess test ensures the helper stays importable and +# exposes a stable CLI entry point without running the expensive full release +# workflow inside the unit test suite. +def test_release_check_help() -> None: + result = subprocess.run( + [sys.executable, "tools/release_check.py", "--help"], + cwd=REPO_ROOT, + check=True, + capture_output=True, + text=True, + ) + assert "release-preparation checks" in result.stdout diff --git a/tests/meta/test_text_generation_tools.py b/tests/meta/test_text_generation_tools.py new file mode 100644 index 0000000..b6203a7 --- /dev/null +++ b/tests/meta/test_text_generation_tools.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +MODULE_PATH = REPO_ROOT / "tools" / "export_notebooks.py" + +spec = importlib.util.spec_from_file_location("export_notebooks_tool", MODULE_PATH) +assert spec is not None and spec.loader is not None +export_notebooks = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = export_notebooks +spec.loader.exec_module(export_notebooks) + + +def 
test_export_notebooks_check_ignores_crlf(tmp_path: Path) -> None: + """Notebook export checks should ignore Windows vs Unix newline differences.""" + + output_dir = tmp_path / "docs" + output_dir.mkdir() + + for notebook_name, output_name in export_notebooks.NOTEBOOK_OUTPUTS.items(): + rendered = export_notebooks._export_markdown( + export_notebooks.NOTEBOOKS / notebook_name + ) + (output_dir / output_name).write_text( + rendered.replace("\n", "\r\n"), + encoding="utf-8", + newline="", + ) + + assert export_notebooks.export_notebooks(output_dir, check=True) == 0 diff --git a/tests/policy/test_policy.py b/tests/policy/test_policy.py new file mode 100644 index 0000000..618829a --- /dev/null +++ b/tests/policy/test_policy.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +from dataclasses import dataclass + +import pytest + +import atomref as ar +from atomref.errors import PolicyError + + +def _make_custom_set( + quantity: str, + set_id: str, + values: dict[str, float | None], +) -> ar.ElementScalarSet: + return ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef(quantity, set_id), + values=values, + name=set_id, + units='angstrom', + ) + + +def _make_partial_covalent_policy(*, include_o: bool) -> ar.RadiiPolicy: + values = { + 'C': 0.76, + 'N': 0.71, + } + if include_o: + values['O'] = 0.66 + custom = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo_partial_cov'), + values=values, + name='Demo partial covalent set', + units='angstrom', + ) + return ar.RadiiPolicy( + kind='covalent', + base_set=custom, + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + min_points=2, + exclude_placeholders=True, + ), + ), + ) + + +@dataclass +class _DemoPolicyWrapper: + base: ar.ElementScalarSet + source: object | None = None + + def as_value_policy(self) -> ar.ValuePolicy[str]: + transfers = () + if self.source is not None: + transfers = (ar.SubstitutionTransfer(source=self.source),) + return 
ar.ValuePolicy(base=self.base, transfers=transfers) + + +def test_lookup_value_is_public_generic_entry_point() -> None: + policy = ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + overrides={'d': 0.5}, + ) + lookup = ar.lookup_value('H', policy=policy) + assert lookup.source == 'override' + assert lookup.value == pytest.approx(0.5) + assert lookup.transfer_depth == 0 + + +def test_get_value_returns_only_scalar() -> None: + policy = ar.ValuePolicy(base=ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ar.get_value('C', policy=policy) == pytest.approx(0.76) + + +def test_value_policy_rejects_normalized_override_collisions() -> None: + with pytest.raises(PolicyError): + ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + overrides={'H': 0.31, 'D': 0.4}, + ) + + +def test_value_policy_rejects_non_finite_fallback() -> None: + with pytest.raises(PolicyError): + ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + fallback=float('nan'), + ) + + +def test_substitution_transfer_accepts_policy_source() -> None: + custom = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo_user_cov'), + values={'C': 0.77}, + name='Demo covalent set', + units='angstrom', + ) + policy = ar.ValuePolicy( + base=custom, + transfers=(ar.SubstitutionTransfer(source=ar.DEFAULT_COVALENT_POLICY),), + ) + lookup = ar.lookup_value('Bk', policy=policy) + assert lookup.source == 'transfer_substitution' + assert lookup.value == pytest.approx(1.54) + assert lookup.transfer_depth == 2 + assert lookup.resolved_from == ( + ar.DatasetRef('covalent_radius', 'csd_legacy_cov'), + ) + assert any('policy source' in note for note in lookup.notes) + + +def test_linear_transfer_accepts_policy_predictor() -> None: + predictor_policy = ar.ValuePolicy(base=ar.DatasetRef('atomic_radius', 'rahm2016')) + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + 
transfers=(ar.LinearTransfer(predictors=(predictor_policy,),),), + ) + lookup = ar.lookup_vdw_radius('Pm', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.value == pytest.approx(ar.lookup_vdw_radius('Pm').value) + assert lookup.transfer_depth == 1 + assert lookup.fit is not None + assert any('policy source' in note for note in lookup.notes) + + +def test_linear_transfer_defaults_allow_direct_fit_and_one_nested_prediction() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=True) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.transfer_depth == 2 + assert lookup.fit is not None + assert lookup.fit.n_points == 3 + assert lookup.value == pytest.approx(ar.lookup_xh_bond_length('S').value) + + +def test_linear_transfer_fit_restrictions_block_inference_on_inference_by_default( +) -> None: + predictor_policy = _make_partial_covalent_policy(include_o=False) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + with pytest.raises(PolicyError, match='fit-source restrictions'): + ar.lookup_xh_bond_length('S', policy=policy) + + +def test_linear_transfer_fit_restrictions_can_be_relaxed_explicitly() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=False) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + fit_sources=('base', 'override', 'transfer_linear'), + fit_max_depth=1, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.fit is not 
None + assert lookup.fit.n_points == 3 + + +def test_linear_transfer_prediction_depth_can_be_tightened() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=True) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + prediction_max_depth=0, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.value is None + assert lookup.source == 'missing' + assert any('prediction_max_depth' in note for note in lookup.notes) + + +def test_linear_transfer_rejects_invalid_nested_source_configuration() -> None: + with pytest.raises(PolicyError, match='fit_max_depth'): + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + fit_max_depth=-1, + ) + with pytest.raises(PolicyError, match='allowed values'): + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + prediction_sources=('missing',), # type: ignore[arg-type] + ) + + +def test_lookup_value_detects_generic_policy_cycles() -> None: + empty_1 = _make_custom_set('covalent_radius', 'cycle_empty_1', {}) + empty_2 = _make_custom_set('covalent_radius', 'cycle_empty_2', {}) + policy_1 = ar.ValuePolicy(base=empty_1) + policy_2 = ar.ValuePolicy( + base=empty_2, + transfers=(ar.SubstitutionTransfer(source=policy_1),), + ) + object.__setattr__( + policy_1, + 'transfers', + (ar.SubstitutionTransfer(source=policy_2),), + ) + + with pytest.raises(PolicyError, match='cyclic policy resolution detected'): + ar.lookup_value('C', policy=policy_1) + + +def test_wrapper_policy_cycles_are_detected() -> None: + empty = _make_custom_set('covalent_radius', 'demo_empty_cov', {}) + wrapper_a = _DemoPolicyWrapper(base=empty) + wrapper_b = _DemoPolicyWrapper(base=empty, source=wrapper_a) + wrapper_a.source = wrapper_b + + policy = ar.ValuePolicy( + base=empty, + transfers=(ar.SubstitutionTransfer(source=wrapper_a),), + ) + with 
pytest.raises(PolicyError, match='cyclic policy resolution detected'): + ar.lookup_value('C', policy=policy) diff --git a/tests/radii/test_assessment.py b/tests/radii/test_assessment.py new file mode 100644 index 0000000..664d867 --- /dev/null +++ b/tests/radii/test_assessment.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import atomref as ar + + +def test_assess_vdw_default_linear_counts() -> None: + rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY) + assert rep.kind == 'van_der_waals' + assert rep.n_elements == 2 + assert rep.n_base == 1 + assert rep.n_transfer_linear == 1 + assert rep.n_missing == 0 + assert rep.fits + assert rep.fits[0].n_points == 90 + + +def test_assess_vdw_detail_reports_sources() -> None: + rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY, detail=True) + by_sym = {d.symbol: d for d in rep.per_element} + assert by_sym['O'].lookup.source == 'base' + assert by_sym['Pm'].lookup.source == 'transfer_linear' + + +def test_assess_covalent_sub_placeholder_count() -> None: + rep = ar.assess_radii_policy(['Es'], policy=ar.DEFAULT_COVALENT_POLICY) + assert rep.kind == 'covalent' + assert rep.n_elements == 1 + assert rep.n_transfer_substitution == 1 + assert rep.n_placeholders == 1 + assert rep.placeholder_symbols == ('Es',) + assert rep.n_missing == 0 + + +def test_assess_covalent_missing_in_both_sets() -> None: + rep = ar.assess_radii_policy(['Rg'], policy=ar.DEFAULT_COVALENT_POLICY) + assert rep.n_missing == 1 + assert rep.missing_symbols == ('Rg',) diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py new file mode 100644 index 0000000..8977363 --- /dev/null +++ b/tests/radii/test_selection.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import pytest + +import atomref as ar +from atomref.errors import PolicyError + + +def test_get_covalent_radius_default_prefers_cordero() -> None: + assert ar.get_covalent_radius("C") == pytest.approx(0.76) + + +def 
test_get_covalent_radius_maps_deuterium_to_hydrogen() -> None: + assert ar.get_covalent_radius("D") == pytest.approx(0.31) + + +def test_get_vdw_radius_default_prefers_alvarez() -> None: + assert ar.get_vdw_radius("C") == pytest.approx(1.77) + + +def test_completion_is_used_for_missing_base_values() -> None: + m = ar.lookup_covalent_radius("Bk") + assert m.value is not None + assert m.source == "transfer_substitution" + + m2 = ar.lookup_vdw_radius("Pm") + assert m2.value is not None + assert m2.source == "transfer_linear" + assert m2.value == pytest.approx(2.897226539514835) + + +def test_linear_transfer_rejects_placeholder_values() -> None: + scheme = ar.RadiiPolicy( + kind="van_der_waals", + base_set="bondi1964", + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),) + ), + ), + ) + m = ar.lookup_vdw_radius("Be", policy=scheme) + assert m.value is None + assert m.source == "missing" + assert any("placeholder" in s for s in m.notes) + + +def test_lookup_float_conversion() -> None: + m = ar.lookup_covalent_radius("C") + assert float(m) == pytest.approx(0.76) + + m_missing = ar.lookup_covalent_radius("Xx") + with pytest.raises(TypeError): + float(m_missing) + + +def test_override_precedes_base_value() -> None: + policy = ar.RadiiPolicy( + kind="covalent", + base_set="cordero2008", + overrides={"C": 9.99}, + ) + lookup = ar.lookup_covalent_radius("C", policy=policy) + assert lookup.source == "override" + assert lookup.value == pytest.approx(9.99) + + +def test_fallback_is_used_only_after_transfers_fail() -> None: + policy = ar.RadiiPolicy( + kind="van_der_waals", + base_set="bondi1964", + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),) + ), + ), + fallback=2.5, + ) + lookup = ar.lookup_vdw_radius("Be", policy=policy) + assert lookup.source == "fallback" + assert lookup.value == pytest.approx(2.5) + assert any("placeholder" in note for note in lookup.notes) + + 
+def test_linear_transfer_rejects_multiple_predictors_in_v0_1() -> None: + policy = ar.RadiiPolicy( + kind="van_der_waals", + base_set="alvarez2013", + transfers=( + ar.LinearTransfer( + predictors=( + ar.DatasetRef("atomic_radius", "rahm2016"), + ar.DatasetRef("covalent_radius", "cordero2008"), + ) + ), + ), + ) + with pytest.raises(PolicyError): + ar.lookup_vdw_radius("Pm", policy=policy) + + +def test_base_placeholder_note_is_explicit() -> None: + policy = ar.RadiiPolicy(kind='covalent', base_set='csd_legacy_cov') + lookup = ar.lookup_covalent_radius('Es', policy=policy) + assert lookup.source == 'base' + assert lookup.is_placeholder is True + assert any('placeholder' in note for note in lookup.notes) + + +def test_substitution_placeholder_note_is_explicit() -> None: + lookup = ar.lookup_covalent_radius('Es') + assert lookup.source == 'transfer_substitution' + assert lookup.is_placeholder is True + assert any('placeholder' in note for note in lookup.notes) + + +def test_radii_policy_rejects_normalized_override_collisions() -> None: + policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + overrides={'H': 0.31, 'D': 0.4}, + ) + with pytest.raises(PolicyError): + ar.lookup_covalent_radius('H', policy=policy) + + +def test_radii_policy_rejects_non_finite_override() -> None: + policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + overrides={'C': float('nan')}, + ) + with pytest.raises(PolicyError): + ar.lookup_covalent_radius('C', policy=policy) + + +def test_radii_policy_rejects_negative_fallback() -> None: + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='bondi1964', + fallback=-1.0, + ) + with pytest.raises(PolicyError): + ar.lookup_vdw_radius('Be', policy=policy) + + +def test_linear_transfer_validates_empty_predictors() -> None: + with pytest.raises(PolicyError): + ar.LinearTransfer(predictors=()) diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py new file mode 100644 index 
0000000..d497d9f --- /dev/null +++ b/tests/registry/test_registry.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +from importlib import resources +from types import MappingProxyType + +import pytest + +import atomref as ar +from atomref.errors import DatasetError +from atomref.registry import get_builtin_set + + +def test_packaged_data_files_exist() -> None: + pkg = 'atomref.data' + assert resources.files(pkg).joinpath('periodic_table.csv').is_file() + assert resources.files(pkg).joinpath('covalent.csv').is_file() + assert resources.files(pkg).joinpath('van_der_waals.csv').is_file() + assert resources.files(pkg).joinpath('registry.json').is_file() + + +def test_registry_lists_vdw_sets_but_not_atomic_support_sets() -> None: + vdw_sets = ar.list_radii_sets('van_der_waals') + assert 'alvarez2013' in vdw_sets + assert 'rahm2016' not in vdw_sets + + +def test_rahm_is_registered_as_atomic_radius() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + assert info.ref.quantity == 'atomic_radius' + assert info.semantic_class == 'atomic_isodensity' + assert info.phase_context == 'isolated_atom' + + +def test_builtin_set_loading_works() -> None: + ds = get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ds.get('C') == 0.76 + + +def test_list_quantities_and_quantity_info() -> None: + quantities = ar.list_quantities() + assert quantities == ( + 'covalent_radius', + 'van_der_waals_radius', + 'atomic_radius', + 'xh_bond_length', + ) + + info = ar.get_quantity_info('atomic_radius') + assert info.quantity == 'atomic_radius' + assert info.domain == 'element' + assert info.units == 'angstrom' + assert 'support' in (info.description or '') + + +def test_rahm_note_no_longer_claims_it_is_classified_as_vdw() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + joined = ' '.join(info.notes).lower() + assert 'classified as vdw' not in joined + assert 'atomic support data' in joined + + +def 
test_usage_role_is_exposed_on_dataset_info() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + assert info.usage_role == 'support' + + +def test_list_dataset_ids_can_filter_by_usage_role() -> None: + assert ar.list_dataset_ids('atomic_radius', usage_role='support') == ('rahm2016',) + assert ar.list_dataset_ids('van_der_waals_radius', usage_role='target') == ( + 'bondi1964', + 'rowland_taylor1996', + 'alvarez2013', + 'chernyshov2020', + ) + + +def test_list_radii_sets_can_filter_by_usage_role() -> None: + assert ar.list_radii_sets('covalent', usage_role='support') == ('csd_legacy_cov',) + assert 'alvarez2013' in ar.list_radii_sets('van_der_waals', usage_role='target') + + +def test_list_dataset_infos_can_filter_by_usage_role() -> None: + infos = ar.list_dataset_infos('atomic_radius', usage_role='support') + assert tuple(info.ref.set_id for info in infos) == ('rahm2016',) + assert all(info.usage_role == 'support' for info in infos) + + +def test_list_radii_set_infos_can_filter_by_usage_role() -> None: + infos = ar.list_radii_set_infos('van_der_waals', usage_role='target') + assert 'alvarez2013' in {info.ref.set_id for info in infos} + assert all(info.ref.quantity == 'van_der_waals_radius' for info in infos) + + +def test_public_builtin_set_helper_is_exported() -> None: + ds = ar.get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ds.info.ref.quantity == 'covalent_radius' + assert ds.get('C') == 0.76 + + +def test_public_radii_set_helper_returns_packaged_radii_set() -> None: + ds = ar.get_radii_set('van_der_waals', 'alvarez2013') + assert ds.info.ref.quantity == 'van_der_waals_radius' + assert ds.info.ref.set_id == 'alvarez2013' + assert ds.get('O') == 1.5 + + +def test_dataset_info_storage_is_frozen() -> None: + info = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert isinstance(info.storage, MappingProxyType) + assert info.storage['column'] == 'cordero2008' + with 
pytest.raises(TypeError): + info.storage['column'] = 'broken' + + fresh = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert fresh.storage is not None + assert fresh.storage['column'] == 'cordero2008' + + +def test_dataset_alias_resolution_normalizes_dash_variants() -> None: + info = ar.get_dataset_info( + ar.DatasetRef('covalent_radius', 'Cordero-Alvarez covalent radii') + ) + assert info.ref.set_id == 'cordero2008' + + +def test_custom_set_rejects_normalized_key_collisions() -> None: + with pytest.raises(DatasetError): + ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo'), + values={'H': 0.31, 'D': 0.5}, + name='Demo', + units='angstrom', + ) + + +def test_custom_set_rejects_non_finite_values() -> None: + with pytest.raises(DatasetError): + ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo'), + values={'C': float('nan')}, + name='Demo', + units='angstrom', + ) diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 index 0000000..6a96b08 --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import atomref as ar + + +def test_version_is_present() -> None: + assert isinstance(ar.__version__, str) + assert ar.__version__ + + +def test_basic_smoke_import_and_lookup() -> None: + assert ar.get_covalent_radius('C') == 0.76 + assert ar.get_vdw_radius('C') == 1.77 diff --git a/tests/xh/test_xh.py b/tests/xh/test_xh.py new file mode 100644 index 0000000..3cffe15 --- /dev/null +++ b/tests/xh/test_xh.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import pytest + +import atomref as ar +from atomref.errors import PolicyError + + +def test_get_xh_bond_length_returns_curated_cno_values() -> None: + assert ar.get_xh_bond_length('C') == pytest.approx(1.089) + assert ar.get_xh_bond_length('N') == pytest.approx(1.015) + assert ar.get_xh_bond_length('O') == pytest.approx(0.993) + + +def 
test_lookup_xh_bond_length_infers_other_elements_from_cordero() -> None: + lookup = ar.lookup_xh_bond_length('S') + assert lookup.source == 'transfer_linear' + assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'cordero2008'),) + assert lookup.fit is not None + assert lookup.fit.n_points == 3 + assert lookup.value == pytest.approx(1.3587333333333333) + + +def test_lookup_xh_bond_length_rejects_h_as_parent_element() -> None: + lookup = ar.lookup_xh_bond_length('H') + assert lookup.value is None + assert lookup.source == 'missing' + assert any('not a valid parent element' in note for note in lookup.notes) + + +def test_list_xh_sets_and_metadata() -> None: + assert ar.list_xh_sets() == ('csd_legacy_xh_cno',) + info = ar.get_xh_set_info('csd_legacy_xh_cno') + assert info.ref.quantity == 'xh_bond_length' + assert info.usage_role == 'target' + assert info.coverage is not None + assert info.coverage.n_values == 3 + + +def test_xh_policy_rejects_h_override_key() -> None: + policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', overrides={'H': 1.0}) + with pytest.raises(PolicyError): + policy.as_value_policy() + + +def test_xh_policy_rejects_negative_fallback() -> None: + policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', fallback=-1.0) + with pytest.raises(PolicyError): + policy.as_value_policy() + + +def test_xh_policy_accepts_wrapper_policy_predictor() -> None: + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(ar.DEFAULT_COVALENT_POLICY,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('Bk', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.value == pytest.approx(1.8291333333333335) + assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),) + assert any('policy source' in note for note in lookup.notes) diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..943900d --- /dev/null 
+++ b/tools/README.md @@ -0,0 +1,30 @@ +# tools + +This directory contains small maintenance scripts used during development and +release preparation. + +## Scripts + +- `check_dist.py` — verify that wheel and source-distribution artifacts contain + the key files expected by the project. +- `check_notebooks.py` — validate notebook JSON and execute notebook code cells. +- `check_registry.py` — validate curated registry metadata against packaged CSV + tables. +- `export_notebooks.py` — render the bundled notebooks into Markdown pages under + `docs/notebooks/`. +- `gen_readme.py` — regenerate `README.md` from `docs/index.md`. +- `release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. + +## Typical commands + +```bash +python tools/check_registry.py +python tools/check_notebooks.py +python tools/export_notebooks.py +python tools/gen_readme.py +python tools/release_check.py +``` + +The main project README is generated from the documentation home page. To change +`README.md`, edit `docs/index.md` and then run `python tools/gen_readme.py`. 
diff --git a/tools/check_dist.py b/tools/check_dist.py new file mode 100644 index 0000000..df70910 --- /dev/null +++ b/tools/check_dist.py @@ -0,0 +1,116 @@ +"""Verify that built distributions contain the project's key files.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +import tarfile +import zipfile + + +REQUIRED_WHEEL_MEMBERS = { + "atomref/data/periodic_table.csv", + "atomref/data/covalent.csv", + "atomref/data/van_der_waals.csv", + "atomref/data/registry.json", + "atomref/py.typed", +} + +REQUIRED_SDIST_SUFFIXES = { + "src/atomref/data/periodic_table.csv", + "src/atomref/data/covalent.csv", + "src/atomref/data/van_der_waals.csv", + "src/atomref/data/registry.json", + "src/atomref/py.typed", + "README.md", + "CHANGELOG.md", + "DEV_PLAN.md", + "LICENSE", + "pyproject.toml", + "notebooks/01-quickstart.ipynb", + "notebooks/02-policies-and-assessment.ipynb", + "notebooks/03-custom-sets-and-discovery.ipynb", + "docs/notebooks/01-quickstart.md", + "docs/notebooks/02-policies-and-assessment.md", + "docs/notebooks/03-custom-sets-and-discovery.md", + "tools/check_notebooks.py", + "tools/export_notebooks.py", + "tools/gen_readme.py", + "tools/release_check.py", + "tools/README.md", +} + + +class DistCheckError(RuntimeError): + """Raised when a built distribution is missing required members.""" + + +def _assert_members_present( + actual: set[str], + required: set[str], + *, + label: str, +) -> None: + """Raise when ``required`` contains members not present in ``actual``.""" + + missing = sorted(required - actual) + if missing: + joined = ", ".join(missing) + raise DistCheckError(f"{label} is missing required members: {joined}") + + +def _members_matching_suffixes(actual: set[str], suffixes: set[str]) -> set[str]: + """Return suffixes that match at least one member name from ``actual``.""" + + matched: set[str] = set() + for suffix in suffixes: + if any(name.endswith(suffix) for name in actual): + matched.add(suffix) + return 
matched + + +def check_wheel(path: Path) -> None: + """Validate the contents of one built wheel.""" + + with zipfile.ZipFile(path) as zf: + names = set(zf.namelist()) + matched = { + member + for member in REQUIRED_WHEEL_MEMBERS + if any(name.endswith(member) for name in names) + } + _assert_members_present(matched, REQUIRED_WHEEL_MEMBERS, label=path.name) + + +def check_sdist(path: Path) -> None: + """Validate the contents of one built source distribution.""" + + with tarfile.open(path, "r:gz") as tf: + names = {member.name for member in tf.getmembers()} + matched = _members_matching_suffixes(names, REQUIRED_SDIST_SUFFIXES) + _assert_members_present(matched, REQUIRED_SDIST_SUFFIXES, label=path.name) + + +def main() -> None: + """Validate wheel and sdist artifacts found in a distribution directory.""" + + parser = argparse.ArgumentParser() + parser.add_argument("dist_dir", type=Path, nargs="?", default=Path("dist")) + args = parser.parse_args() + + dist_dir = args.dist_dir + wheels = sorted(dist_dir.glob("*.whl")) + sdists = sorted(dist_dir.glob("*.tar.gz")) + if not wheels: + raise DistCheckError(f"no wheel files found in {dist_dir}") + if not sdists: + raise DistCheckError(f"no source distributions found in {dist_dir}") + + for wheel in wheels: + check_wheel(wheel) + for sdist in sdists: + check_sdist(sdist) + + +if __name__ == "__main__": + main() diff --git a/tools/check_notebooks.py b/tools/check_notebooks.py new file mode 100644 index 0000000..51d9dfa --- /dev/null +++ b/tools/check_notebooks.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""Validate notebook JSON structure and execute notebook code cells.""" + +from __future__ import annotations + +from contextlib import redirect_stdout +import io +import json +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +NOTEBOOKS = REPO_ROOT / "notebooks" +REQUIRED_NOTEBOOKS = ( + 
"01-quickstart.ipynb", + "02-policies-and-assessment.ipynb", + "03-custom-sets-and-discovery.ipynb", +) + + +class NotebookCheckError(RuntimeError): + """Raised when a notebook is malformed or fails to execute.""" + + +def iter_notebooks() -> tuple[Path, ...]: + """Return the notebooks that are expected to ship with the project.""" + + return tuple(NOTEBOOKS / name for name in REQUIRED_NOTEBOOKS) + + +def load_notebook(path: Path) -> dict[str, object]: + """Load one notebook JSON document.""" + + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise NotebookCheckError(f"{path.name}: expected top-level JSON object") + return data + + +def iter_code_cells(data: dict[str, object], *, path: Path) -> tuple[str, ...]: + """Return notebook code-cell sources in order.""" + + cells = data.get("cells") + if not isinstance(cells, list): + raise NotebookCheckError(f"{path.name}: missing notebook cell list") + + code: list[str] = [] + for index, cell in enumerate(cells): + if not isinstance(cell, dict): + raise NotebookCheckError(f"{path.name}: cell {index} is not an object") + cell_type = cell.get("cell_type") + if cell_type != "code": + continue + source = cell.get("source", []) + if isinstance(source, str): + text = source + elif isinstance(source, list) and all(isinstance(line, str) for line in source): + text = "".join(source) + else: + raise NotebookCheckError( + f"{path.name}: cell {index} has invalid code source" + ) + code.append(text) + if not code: + raise NotebookCheckError(f"{path.name}: contains no code cells") + return tuple(code) + + +def execute_notebook(path: Path) -> None: + """Execute all code cells from one notebook in a shared namespace.""" + + if not path.exists(): + raise NotebookCheckError(f"missing notebook: {path}") + data = load_notebook(path) + namespace = {"__name__": "__main__"} + for index, source in enumerate(iter_code_cells(data, path=path), start=1): + if not source.strip(): + continue + try: + code = 
compile(source, f"{path.name}::cell{index}", "exec") + with redirect_stdout(io.StringIO()): + exec(code, namespace, namespace) + except Exception as exc: # noqa: BLE001 + raise NotebookCheckError( + f"{path.name}: execution failed in code cell {index}: {exc}" + ) from exc + + +def main() -> int: + """Validate and execute every required notebook.""" + + notebooks = iter_notebooks() + for notebook in notebooks: + execute_notebook(notebook) + print(f"Validated {len(notebooks)} notebook(s).") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/check_registry.py b/tools/check_registry.py new file mode 100644 index 0000000..3af6025 --- /dev/null +++ b/tools/check_registry.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Validate packaged registry metadata against bundled CSV tables.""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import asdict +from importlib import import_module +from pathlib import Path +import sys +from typing import Iterable + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +_ALLOWED_USAGE_ROLES = {"target", "support"} + + +def _load_atomref_module(): + return import_module("atomref") + + +def _get_builtin_set(ref): + registry = import_module("atomref.registry") + return registry.get_builtin_set(ref) + + +def _canonical_token(value: str) -> str: + registry = import_module("atomref.registry") + return registry._canonicalize_alias_token(value) + + +def _iter_dataset_refs() -> Iterable[object]: + ar = _load_atomref_module() + for quantity in ar.list_quantities(): + for info in ar.list_dataset_infos(quantity): + yield info.ref + + +def _validate_alias_collisions(errors: list[str]) -> None: + ar = _load_atomref_module() + for quantity in ar.list_quantities(): + seen: dict[str, str] = {} + for info in ar.list_dataset_infos(quantity): + set_id = info.ref.set_id + for token in (set_id, 
*info.aliases): + key = _canonical_token(token) + previous = seen.get(key) + if previous is not None and previous != set_id: + msg = ( + f"alias collision in {quantity!r}: {token!r} resolves to both " + f"{previous!r} and {set_id!r}" + ) + errors.append(msg) + else: + seen[key] = set_id + + +def _validate_dataset_metadata(errors: list[str]) -> None: + ar = _load_atomref_module() + quantities = set(ar.list_quantities()) + by_role: dict[str, list[str]] = defaultdict(list) + + for ref in _iter_dataset_refs(): + quantity_info = ar.get_quantity_info(ref.quantity) + info = ar.get_dataset_info(ref) + dataset = _get_builtin_set(ref) + + if info.ref != ref: + errors.append(f"dataset ref mismatch: requested {ref!r}, got {info.ref!r}") + + if info.domain != quantity_info.domain: + msg = ( + f"domain mismatch for {ref!r}: quantity={quantity_info.domain!r}, " + f"dataset={info.domain!r}" + ) + errors.append(msg) + + if info.units != quantity_info.units: + msg = ( + f"units mismatch for {ref!r}: quantity={quantity_info.units!r}, " + f"dataset={info.units!r}" + ) + errors.append(msg) + + if info.usage_role not in _ALLOWED_USAGE_ROLES: + errors.append(f"invalid usage_role for {ref!r}: {info.usage_role!r}") + else: + by_role[info.usage_role].append(ref.quantity) + + if not info.references: + errors.append(f"missing references for {ref!r}") + + if info.storage is None: + errors.append(f"missing storage metadata for {ref!r}") + else: + filename = info.storage.get("filename") + column = info.storage.get("column") + fmt = info.storage.get("format") + if not isinstance(filename, str) or not filename: + errors.append(f"invalid storage filename for {ref!r}: {filename!r}") + if not isinstance(column, str) or not column: + errors.append(f"invalid storage column for {ref!r}: {column!r}") + if fmt != "dense_by_z_csv": + errors.append(f"unsupported storage format for {ref!r}: {fmt!r}") + + coverage = info.coverage + if coverage is None: + errors.append(f"missing coverage metadata for {ref!r}") 
+ max_z = len(dataset.values_by_z) - 1 + else: + max_z = ( + coverage.z_max + if coverage.z_max is not None + else len(dataset.values_by_z) - 1 + ) + + covered_z = tuple( + z + for z, value in enumerate(dataset.values_by_z) + if z > 0 and value is not None and z <= max_z + ) + covered_set = set(covered_z) + missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set) + has_placeholders = info.placeholder_value is not None and any( + value is not None and abs(value - info.placeholder_value) < 1e-12 + for value in dataset.values_by_z[1 : max_z + 1] + ) + + if coverage is not None: + expected = { + "n_values": len(covered_z), + "z_min": min(covered_z) if covered_z else None, + "z_max": max(covered_z) if covered_z else None, + "has_placeholders": has_placeholders, + } + actual = asdict(coverage) + for key, value in expected.items(): + if actual[key] != value: + msg = ( + f"coverage mismatch for {ref!r}: {key} is {actual[key]!r}, " + f"expected {value!r}" + ) + errors.append(msg) + if actual["covered_z"] and tuple(actual["covered_z"]) != covered_z: + msg = ( + f"coverage mismatch for {ref!r}: covered_z is " + f"{actual['covered_z']!r}, expected {covered_z!r}" + ) + errors.append(msg) + if actual["missing_z"] and tuple(actual["missing_z"]) != missing_z: + msg = ( + f"coverage mismatch for {ref!r}: missing_z is " + f"{actual['missing_z']!r}, expected {missing_z!r}" + ) + errors.append(msg) + + if ref.quantity not in quantities: + errors.append(f"dataset refers to unknown quantity: {ref!r}") + + for quantity in quantities: + if quantity not in by_role.get("target", []) and quantity != "atomic_radius": + errors.append(f"quantity {quantity!r} has no target datasets") + + +def main() -> int: + errors: list[str] = [] + _validate_alias_collisions(errors) + _validate_dataset_metadata(errors) + + if errors: + for error in errors: + print(f"ERROR: {error}") + return 1 + + print("Registry validation passed.") + return 0 + + +if __name__ == "__main__": + raise 
SystemExit(main()) diff --git a/tools/export_notebooks.py b/tools/export_notebooks.py new file mode 100644 index 0000000..aa6761d --- /dev/null +++ b/tools/export_notebooks.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""Export bundled notebooks to Markdown pages for the docs.""" + +from __future__ import annotations + +from contextlib import redirect_stdout +import argparse +import io +import json +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +NOTEBOOKS = REPO_ROOT / "notebooks" +DEFAULT_OUTPUT_DIR = REPO_ROOT / "docs" / "notebooks" +NOTEBOOK_OUTPUTS = { + "01-quickstart.ipynb": "01-quickstart.md", + "02-policies-and-assessment.ipynb": "02-policies-and-assessment.md", + "03-custom-sets-and-discovery.ipynb": "03-custom-sets-and-discovery.md", +} +HEADER = ( + "\n" + "\n\n" +) + + +class NotebookExportError(RuntimeError): + """Raised when notebook export fails.""" + + +def _load_notebook(path: Path) -> dict[str, object]: + """Load one notebook JSON document.""" + + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise NotebookExportError(f"{path.name}: expected top-level JSON object") + return data + + +def _cell_source(cell: dict[str, object], *, path: Path, index: int) -> str: + """Return normalized source text for one notebook cell.""" + + source = cell.get("source", []) + if isinstance(source, str): + return source + if isinstance(source, list) and all(isinstance(line, str) for line in source): + return "".join(source) + raise NotebookExportError(f"{path.name}: invalid source in cell {index}") + + +def _export_markdown(path: Path) -> str: + """Render one notebook as Markdown, executing code cells for output.""" + + data = _load_notebook(path) + cells = data.get("cells") + if not isinstance(cells, list): + raise NotebookExportError(f"{path.name}: missing notebook cell list") + + namespace = 
{"__name__": "__main__"} + parts: list[str] = [HEADER] + parts.append( + f"[Open the original notebook on GitHub]" + f"(https://github.com/DeloneCommons/atomref/blob/main/notebooks/{path.name})\n" + ) + + for index, cell in enumerate(cells, start=1): + if not isinstance(cell, dict): + raise NotebookExportError(f"{path.name}: cell {index} is not an object") + source = _cell_source(cell, path=path, index=index) + cell_type = cell.get("cell_type") + if cell_type == "markdown": + text = source.strip() + if text: + parts.append(f"{text}\n") + continue + if cell_type != "code": + continue + code_text = source.rstrip() + parts.append("```python\n") + parts.append(f"{code_text}\n") + parts.append("```\n") + if not code_text.strip(): + continue + + stdout = io.StringIO() + try: + code = compile(code_text, f"{path.name}::cell{index}", "exec") + with redirect_stdout(stdout): + exec(code, namespace, namespace) + except Exception as exc: # noqa: BLE001 + raise NotebookExportError( + f"{path.name}: execution failed in code cell {index}: {exc}" + ) from exc + + output = stdout.getvalue().rstrip() + if output: + parts.append("**Output**\n\n") + parts.append("```text\n") + parts.append(f"{output}\n") + parts.append("```\n") + + return "\n".join(part.rstrip() for part in parts if part).rstrip() + "\n" + + +def export_notebooks(output_dir: Path, *, check: bool = False) -> int: + """Export bundled notebooks or verify that exported pages are in sync.""" + + output_dir.mkdir(parents=True, exist_ok=True) + for notebook_name, output_name in NOTEBOOK_OUTPUTS.items(): + notebook_path = NOTEBOOKS / notebook_name + rendered = _export_markdown(notebook_path) + output_path = output_dir / output_name + if check: + current = output_path.read_text(encoding="utf-8").replace("\r\n", "\n") + if current != rendered: + print( + f"{output_path} is out of sync with {notebook_path.name}", + file=sys.stderr, + ) + return 1 + else: + output_path.write_text(rendered, encoding="utf-8", newline="\n") + return 
0 + + +def main() -> int: + """Export notebook Markdown pages or check that they are current.""" + + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument( + "--check", + action="store_true", + help="exit with status 1 when exported pages are out of sync", + ) + args = parser.parse_args() + return export_notebooks(args.output_dir, check=args.check) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/gen_readme.py b/tools/gen_readme.py new file mode 100644 index 0000000..71b954d --- /dev/null +++ b/tools/gen_readme.py @@ -0,0 +1,61 @@ +"""Generate ``README.md`` from the documentation home page.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SOURCE = REPO_ROOT / "docs" / "index.md" +README = REPO_ROOT / "README.md" +FOOTER = """ + +--- + +This README is generated from `docs/index.md`. + +To regenerate it: + +```bash +python tools/gen_readme.py +``` + +Edit the documentation sources instead of editing `README.md` directly. 
+""" + + +def render_readme() -> str: + """Return the generated README text.""" + + body = SOURCE.read_text(encoding="utf-8").rstrip() + return f"{body}{FOOTER}" + + +def main() -> int: + """Generate or verify the repository README file.""" + + parser = argparse.ArgumentParser() + parser.add_argument("--output", type=Path, default=README) + parser.add_argument( + "--check", + action="store_true", + help="exit with status 1 when the target file is out of sync", + ) + args = parser.parse_args() + + rendered = render_readme() + if args.check: + current = args.output.read_text(encoding="utf-8") + if current != rendered: + print(f"{args.output} is out of sync with docs/index.md", file=sys.stderr) + return 1 + return 0 + + args.output.write_text(rendered, encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/release_check.py b/tools/release_check.py new file mode 100644 index 0000000..a357a18 --- /dev/null +++ b/tools/release_check.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""Run the full release-preparation checks for the repository. + +This helper is intended for local release preparation. It runs the same checks +that are exercised separately in CI, then builds source and wheel artifacts, +validates them, and smoke-tests the built wheel in an isolated virtual +environment. 
+""" + +from __future__ import annotations + +import argparse +from pathlib import Path +import shutil +import subprocess +import sys +import tempfile +import venv + + +REPO_ROOT = Path(__file__).resolve().parents[1] +DIST_DIR = REPO_ROOT / "dist" +BUILD_DIR = REPO_ROOT / "build" + + +def _run(*args: str, env: dict[str, str] | None = None) -> None: + """Run one subprocess command in the repository root.""" + + print("+", " ".join(args)) + subprocess.run(args, cwd=REPO_ROOT, check=True, env=env) + + +def _fresh_build_dirs() -> None: + """Remove build artifacts from previous runs.""" + + shutil.rmtree(DIST_DIR, ignore_errors=True) + shutil.rmtree(BUILD_DIR, ignore_errors=True) + + +def _smoke_test_wheel() -> None: + """Install the built wheel into a temporary virtualenv and import it.""" + + wheels = sorted(DIST_DIR.glob("*.whl")) + if not wheels: + raise RuntimeError("no wheel found in dist/") + wheel = wheels[-1] + + with tempfile.TemporaryDirectory(prefix="atomref-release-check-") as tmp: + env_dir = Path(tmp) / "venv" + builder = venv.EnvBuilder(with_pip=True) + builder.create(env_dir) + bindir = "Scripts" if sys.platform.startswith("win") else "bin" + python = env_dir / bindir / "python" + _run(str(python), "-m", "pip", "install", "--no-deps", str(wheel)) + _run( + str(python), + "-c", + ( + "import atomref as ar; " + "assert ar.get_covalent_radius('C') == 0.76; " + "assert ar.get_vdw_radius('C') == 1.77; " + "assert 'atomic_radius' in ar.list_quantities(); " + "assert 'rahm2016' in ar.list_dataset_ids(" + "'atomic_radius', usage_role='support')" + ), + ) + + +def main() -> int: + """Run lint, tests, docs, build, metadata, and wheel smoke checks.""" + + parser = argparse.ArgumentParser( + description="Run the full release-preparation checks for the repository.", + ) + parser.add_argument( + "--skip-docs", + action="store_true", + help="skip the strict MkDocs build step", + ) + parser.add_argument( + "--skip-smoke-test", + action="store_true", + help="skip the 
temporary-virtualenv wheel import smoke test", + ) + args = parser.parse_args() + + _run("flake8", "src", "tests", "tools") + _run(sys.executable, "tools/check_registry.py") + _run(sys.executable, "tools/check_notebooks.py") + _run(sys.executable, "tools/export_notebooks.py", "--check") + _run(sys.executable, "tools/gen_readme.py", "--check") + _run(sys.executable, "-m", "pytest", "-q") + if not args.skip_docs: + _run("mkdocs", "build", "--strict") + + _fresh_build_dirs() + _run(sys.executable, "-m", "build") + _run(sys.executable, "-m", "twine", "check", "dist/*") + _run(sys.executable, "tools/check_dist.py", "dist") + if not args.skip_smoke_test: + _smoke_test_wheel() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())