From c0e493a6cebc0da5a1d50e224e114ed74c9284a5 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 14:58:26 +0300 Subject: [PATCH 01/15] Adds skeleton extracted from molcryst v0.2.2 (dev branch) --- .flake8 | 3 + .gitattributes | 2 + .github/workflows/ci.yml | 23 + .github/workflows/docs.yml | 21 + CHANGELOG.md | 9 + COPYING | 674 ++++++++++++++++++++++++++ DEV_PLAN.md | 28 ++ LICENSE | 165 +++++++ NOTICE.md | 12 + README.md | 66 +++ docs/api/atomref.md | 3 + docs/api/index.md | 4 + docs/datasets/atomic_radius.md | 8 + docs/datasets/covalent_radius.md | 6 + docs/datasets/index.md | 10 + docs/datasets/van_der_waals_radius.md | 11 + docs/dev/architecture.md | 7 + docs/dev/data_curation.md | 7 + docs/dev/dev_plan.md | 28 ++ docs/guide/custom_sets.md | 18 + docs/guide/install.md | 8 + docs/guide/non_goals.md | 11 + docs/guide/policies.md | 20 + docs/guide/quickstart.md | 16 + docs/index.md | 66 +++ mkdocs.yml | 37 ++ pyproject.toml | 94 ++++ src/atomref/__about__.py | 1 + src/atomref/__init__.py | 60 +++ src/atomref/data/__init__.py | 1 + src/atomref/data/covalent.csv | 119 +++++ src/atomref/data/periodic_table.csv | 119 +++++ src/atomref/data/registry.json | 434 +++++++++++++++++ src/atomref/data/van_der_waals.csv | 119 +++++ src/atomref/elements.py | 99 ++++ src/atomref/errors.py | 14 + src/atomref/policy.py | 261 ++++++++++ src/atomref/py.typed | 0 src/atomref/radii.py | 233 +++++++++ src/atomref/registry.py | 343 +++++++++++++ src/atomref/transfer.py | 31 ++ tests/conftest.py | 9 + tests/elements/test_elements.py | 19 + tests/meta/test_imports.py | 18 + tests/meta/test_readme_sync.py | 20 + tests/radii/test_assessment.py | 37 ++ tests/radii/test_selection.py | 53 ++ tests/registry/test_registry.py | 32 ++ tests/test_smoke.py | 13 + tools/gen_readme.py | 20 + 50 files changed, 3412 insertions(+) create mode 100644 .flake8 create mode 100644 .gitattributes create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/docs.yml create mode 100644 CHANGELOG.md create mode 100644 COPYING create mode 100644 DEV_PLAN.md create mode 100644 LICENSE create mode 100644 NOTICE.md create mode 100644 README.md create mode 100644 docs/api/atomref.md create mode 100644 docs/api/index.md create mode 100644 docs/datasets/atomic_radius.md create mode 100644 docs/datasets/covalent_radius.md create mode 100644 docs/datasets/index.md create mode 100644 docs/datasets/van_der_waals_radius.md create mode 100644 docs/dev/architecture.md create mode 100644 docs/dev/data_curation.md create mode 100644 docs/dev/dev_plan.md create mode 100644 docs/guide/custom_sets.md create mode 100644 docs/guide/install.md create mode 100644 docs/guide/non_goals.md create mode 100644 docs/guide/policies.md create mode 100644 docs/guide/quickstart.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml create mode 100644 pyproject.toml create mode 100644 src/atomref/__about__.py create mode 100644 src/atomref/__init__.py create mode 100644 src/atomref/data/__init__.py create mode 100644 src/atomref/data/covalent.csv create mode 100644 src/atomref/data/periodic_table.csv create mode 100644 src/atomref/data/registry.json create mode 100644 src/atomref/data/van_der_waals.csv create mode 100644 src/atomref/elements.py create mode 100644 src/atomref/errors.py create mode 100644 src/atomref/policy.py create mode 100644 src/atomref/py.typed create mode 100644 src/atomref/radii.py create mode 100644 src/atomref/registry.py create mode 100644 src/atomref/transfer.py create mode 100644 
tests/conftest.py create mode 100644 tests/elements/test_elements.py create mode 100644 tests/meta/test_imports.py create mode 100644 tests/meta/test_readme_sync.py create mode 100644 tests/radii/test_assessment.py create mode 100644 tests/radii/test_selection.py create mode 100644 tests/registry/test_registry.py create mode 100644 tests/test_smoke.py create mode 100644 tools/gen_readme.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8dd399a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3225814 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Enforce Linux-style line endings for all text files +* text=auto eol=lf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..dbc7a70 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: CI + +on: + push: + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: | + python -m pip install --upgrade pip + python -m pip install .[test] + - name: Test + run: pytest diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..590aad5 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,21 @@ +name: Docs + +on: + push: + branches: [main] + workflow_dispatch: + +jobs: + build-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install docs extras + run: | + python -m pip install --upgrade pip + python -m pip install .[docs] + - name: Build docs + run: mkdocs build --strict diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..faca26a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog + +## 0.1.0a0 + +- Initial scaffold extracted from the `molcryst` chemistry data layer. +- Added packaged element metadata and radii tables. +- Added registry design separating operational quantity from scientific + classification. +- Added radii policies with substitution and linear transfer models. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. 
This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. 
+ + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. 
+ + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. 
+ + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. 
If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. 
Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/DEV_PLAN.md b/DEV_PLAN.md new file mode 100644 index 0000000..7252862 --- /dev/null +++ b/DEV_PLAN.md @@ -0,0 +1,28 @@ +# Development plan + +## v0.1 + +- element metadata +- covalent and van der Waals radii sets +- explicit provenance +- radii policies +- substitution and linear transfer +- custom element-indexed scalar sets + +## v0.2 + +- X-H bond-length datasets +- experimental plus computational support sets +- restoration of incomplete experimental data from broader-support predictors + +## v0.3 + +- radial atomic reference functions +- simple proto-density support based on spherically averaged atomic data + +## Possible future directions + +- more radii sets +- uncertainty and confidence flags +- ion-specific or atom-type-specific domains +- density-derived radii and related reference transforms diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a04128 --- /dev/null +++ b/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. 
+ + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. 
If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 0000000..01f1cf1 --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,12 @@ +# atomref + +atomref is a Python library for curated atomic reference data and transfer +policies for geometry and structure-analysis algorithms. + +Copyright (c) 2026 Ivan Chernyshov +License: LGPL-3.0-or-later (see LICENSE and COPYING) + +## Third-party material + +The initial scaffold reuses and adapts data tables and design ideas from the +Delone Commons `molcryst` repository, also authored by Ivan Chernyshov. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d5b9154 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# atomref + +`atomref` is a small pure-Python package for curated atomic reference data and +policy-based lookup in geometry and structure-analysis code. + +It is **not** a periodic-table encyclopedia. The package is meant to sit under +higher-level scientific software and provide: + +- stable element metadata, +- named radii sets, +- explicit dataset provenance, +- deterministic lookup policies, +- transfer from broader-support datasets into narrower target sets. + +For v0.1 the public scope is intentionally radii-first. + +## Why this exists + +Many geometry algorithms need a complete reference table, but the scientifically +preferred dataset is often incomplete. 
`atomref` makes that situation explicit: +choose a target dataset, add one or more transfer steps, and keep provenance on +what was returned. + +The default examples mirror the current `molcryst` behavior: + +- covalent radii: use `cordero2008`, substitute from `csd_legacy_cov` +- van der Waals radii: use `alvarez2013`, linearly transfer from + `atomic_radius:rahm2016` + +## Quick example + +```python +import atomref as ar + +r_c = ar.get_covalent_radius("C") +r_vdw = ar.get_vdw_radius("O") + +lookup = ar.lookup_vdw_radius("Pm") +print(lookup.value, lookup.source, lookup.resolved_from) +``` + +## Public API split: `get_*` vs `lookup_*` + +- `get_*` returns only the selected numeric value, or `None`. +- `lookup_*` returns the provenance-carrying `LookupResult` object. + +This follows the current `molcryst` pattern. + +## Current built-in quantities + +- `covalent_radius` +- `van_der_waals_radius` +- `atomic_radius` (support quantity; currently used for transfer from + `rahm2016`) + +## Relationship to the Delone Commons ecosystem + +`atomref` is intended to be reusable outside the surrounding ecosystem, but it +fits naturally beneath: + +- `molcryst` +- `pyvoro2` +- `pbcgraph` + +Those packages should consume atomic reference data from `atomref` rather than +re-curating such datasets independently. diff --git a/docs/api/atomref.md b/docs/api/atomref.md new file mode 100644 index 0000000..dcbc5e0 --- /dev/null +++ b/docs/api/atomref.md @@ -0,0 +1,3 @@ +# atomref + +::: atomref diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..da15dbf --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,4 @@ +# API + +The top-level package exports the main radii helpers together with the registry, +policy, and transfer data structures. diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md new file mode 100644 index 0000000..cbbe61b --- /dev/null +++ b/docs/datasets/atomic_radius.md @@ -0,0 +1,8 @@ +# Atomic radius + +This quantity currently exists to hold transferable support datasets that are +not best described as direct condensed-phase vdW radii. + +Built-in v0.1 support set: + +- `rahm2016` diff --git a/docs/datasets/covalent_radius.md b/docs/datasets/covalent_radius.md new file mode 100644 index 0000000..f298635 --- /dev/null +++ b/docs/datasets/covalent_radius.md @@ -0,0 +1,6 @@ +# Covalent radius + +Built-in v0.1 sets: + +- `cordero2008` +- `csd_legacy_cov` diff --git a/docs/datasets/index.md b/docs/datasets/index.md new file mode 100644 index 0000000..a58d78b --- /dev/null +++ b/docs/datasets/index.md @@ -0,0 +1,10 @@ +# Datasets + +The package distinguishes between: + +- **quantity** — the operational property being requested, +- **semantic class** — what the dataset scientifically represents, +- **origin / phase context** — how and where it was derived. + +This is what keeps support-only datasets such as `rahm2016` usable without +misclassifying them as direct condensed-phase vdW radii. diff --git a/docs/datasets/van_der_waals_radius.md b/docs/datasets/van_der_waals_radius.md new file mode 100644 index 0000000..d757bab --- /dev/null +++ b/docs/datasets/van_der_waals_radius.md @@ -0,0 +1,11 @@ +# van der Waals radius + +Built-in v0.1 target sets: + +- `bondi1964` +- `rowland_taylor1996` +- `alvarez2013` +- `chernyshov2020` +- `csd_legacy_vdw` + +Support-only sets may live under other quantities. 
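To see which of these sets actually supplied a value, the provenance-carrying `lookup_vdw_radius` helper from the quickstart is enough. A minimal sketch under the default policy, assuming only the attribute names documented above (the exact strings held by `source` and `resolved_from` are not specified in this patch):

```python
import atomref as ar

# Pm is the quickstart's example of an element that is expected to resolve
# through a transfer step rather than the alvarez2013 base set.
for symbol in ("C", "O", "Pm"):
    result = ar.lookup_vdw_radius(symbol)
    # Assumption: value may be None if even the fallback cannot supply a number.
    if result.value is None:
        print(f"{symbol}: no value available under the default policy")
    else:
        print(f"{symbol}: {result.value} from {result.source} via {result.resolved_from}")
```

The same pattern should apply to a covalent-radius lookup helper, if one is exported alongside the `get_*` pair shown in the README.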
diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md new file mode 100644 index 0000000..7dd08b4 --- /dev/null +++ b/docs/dev/architecture.md @@ -0,0 +1,7 @@ +# Architecture + +Publicly, v0.1 is radii-first. + +Internally, the package is built around element-indexed scalar datasets plus a +small transfer layer. That keeps the public API simple while leaving a clean +path to later quantities such as X-H bond lengths. diff --git a/docs/dev/data_curation.md b/docs/dev/data_curation.md new file mode 100644 index 0000000..02f406b --- /dev/null +++ b/docs/dev/data_curation.md @@ -0,0 +1,7 @@ +# Data curation + +Packaged tables are stored as CSV files indexed by atomic number. Dataset +metadata and provenance live in `src/atomref/data/registry.json`. + +Placeholder values are modeled as dataset metadata, not as hard-coded Python +constants. diff --git a/docs/dev/dev_plan.md b/docs/dev/dev_plan.md new file mode 100644 index 0000000..7252862 --- /dev/null +++ b/docs/dev/dev_plan.md @@ -0,0 +1,28 @@ +# Development plan + +## v0.1 + +- element metadata +- covalent and van der Waals radii sets +- explicit provenance +- radii policies +- substitution and linear transfer +- custom element-indexed scalar sets + +## v0.2 + +- X-H bond-length datasets +- experimental plus computational support sets +- restoration of incomplete experimental data from broader-support predictors + +## v0.3 + +- radial atomic reference functions +- simple proto-density support based on spherically averaged atomic data + +## Possible future directions + +- more radii sets +- uncertainty and confidence flags +- ion-specific or atom-type-specific domains +- density-derived radii and related reference transforms diff --git a/docs/guide/custom_sets.md b/docs/guide/custom_sets.md new file mode 100644 index 0000000..bfc55cb --- /dev/null +++ b/docs/guide/custom_sets.md @@ -0,0 +1,18 @@ +# Custom sets + +Custom element-indexed scalar datasets can be built with +`ElementScalarSet.from_mapping(...)` and then used directly in a `RadiiPolicy` +or a transfer model. + +```python +from atomref import DatasetRef, ElementScalarSet, RadiiPolicy + +custom = ElementScalarSet.from_mapping( + ref=DatasetRef("covalent_radius", "my_cov"), + values={"C": 0.75, "H": 0.31}, + name="My custom covalent radii", + units="angstrom", +) + +policy = RadiiPolicy(kind="covalent", base_set=custom) +``` diff --git a/docs/guide/install.md b/docs/guide/install.md new file mode 100644 index 0000000..2e2ae65 --- /dev/null +++ b/docs/guide/install.md @@ -0,0 +1,8 @@ +# Install + +```bash +pip install atomref +``` + +The runtime package is pure Python and has no required runtime dependencies +outside the standard library. diff --git a/docs/guide/non_goals.md b/docs/guide/non_goals.md new file mode 100644 index 0000000..57bca94 --- /dev/null +++ b/docs/guide/non_goals.md @@ -0,0 +1,11 @@ +# Non-goals + +`atomref` does not aim to handle: + +- file parsing, +- crystallographic symmetry, +- structure inference, +- Voronoi or power tessellation, +- chemistry-specific plane-position logic. + +Those concerns belong in higher-level packages. diff --git a/docs/guide/policies.md b/docs/guide/policies.md new file mode 100644 index 0000000..a7e9130 --- /dev/null +++ b/docs/guide/policies.md @@ -0,0 +1,20 @@ +# Policies + +A policy is the ordered rule set for selecting a value. + +Resolution order in v0.1: + +1. override +2. base dataset +3. transfers in order +4. fallback +5. 
missing + +Built-in transfer models: + +- `SubstitutionTransfer` +- `LinearTransfer` + +`LinearTransfer` is intentionally limited to one predictor in v0.1, but the API +already accepts a predictor tuple so later multi-predictor linear models do not +require a redesign. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md new file mode 100644 index 0000000..62de165 --- /dev/null +++ b/docs/guide/quickstart.md @@ -0,0 +1,16 @@ +# Quickstart + +```python +import atomref as ar + +print(ar.get_covalent_radius("C")) +print(ar.get_vdw_radius("O")) + +m = ar.lookup_vdw_radius("Pm") +print(m.value) +print(m.source) +print(m.resolved_from) +``` + +Use `get_*` when you only need the number, and `lookup_*` when you need +provenance. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..d5b9154 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,66 @@ +# atomref + +`atomref` is a small pure-Python package for curated atomic reference data and +policy-based lookup in geometry and structure-analysis code. + +It is **not** a periodic-table encyclopedia. The package is meant to sit under +higher-level scientific software and provide: + +- stable element metadata, +- named radii sets, +- explicit dataset provenance, +- deterministic lookup policies, +- transfer from broader-support datasets into narrower target sets. + +For v0.1 the public scope is intentionally radii-first. + +## Why this exists + +Many geometry algorithms need a complete reference table, but the scientifically +preferred dataset is often incomplete. `atomref` makes that situation explicit: +choose a target dataset, add one or more transfer steps, and keep provenance on +what was returned. + +The default examples mirror the current `molcryst` behavior: + +- covalent radii: use `cordero2008`, substitute from `csd_legacy_cov` +- van der Waals radii: use `alvarez2013`, linearly transfer from + `atomic_radius:rahm2016` + +## Quick example + +```python +import atomref as ar + +r_c = ar.get_covalent_radius("C") +r_vdw = ar.get_vdw_radius("O") + +lookup = ar.lookup_vdw_radius("Pm") +print(lookup.value, lookup.source, lookup.resolved_from) +``` + +## Public API split: `get_*` vs `lookup_*` + +- `get_*` returns only the selected numeric value, or `None`. +- `lookup_*` returns the provenance-carrying `LookupResult` object. + +This follows the current `molcryst` pattern. + +## Current built-in quantities + +- `covalent_radius` +- `van_der_waals_radius` +- `atomic_radius` (support quantity; currently used for transfer from + `rahm2016`) + +## Relationship to the Delone Commons ecosystem + +`atomref` is intended to be reusable outside the surrounding ecosystem, but it +fits naturally beneath: + +- `molcryst` +- `pyvoro2` +- `pbcgraph` + +Those packages should consume atomic reference data from `atomref` rather than +re-curating such datasets independently. 
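The resolution order listed in the policies page is easiest to see with a small worked example. The sketch below mirrors the default van der Waals policy but adds an override and a fallback so that every source kind shows up; the override value (1.10) and fallback (2.0) are arbitrary illustration values, and the expected sources are inferred from the packaged coverage metadata (Pm is absent from `alvarez2013`, Fm is absent from both `alvarez2013` and `rahm2016`).

```python
import atomref as ar

# Mirrors the default vdW policy, plus an override and a fallback so that each
# resolution source (override, base, transfer_linear, fallback) can be observed.
policy = ar.RadiiPolicy(
    kind="van_der_waals",
    base_set="alvarez2013",
    transfers=(
        ar.LinearTransfer(predictors=(ar.DatasetRef("atomic_radius", "rahm2016"),)),
    ),
    overrides={"H": 1.10},
    fallback=2.0,
)

for symbol in ("H", "O", "Pm", "Fm"):
    res = ar.lookup_vdw_radius(symbol, policy=policy)
    # Expected sources (per the packaged coverage data):
    # H -> override, O -> base, Pm -> transfer_linear, Fm -> fallback.
    print(symbol, res.source, res.value)
```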
diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..8b5060c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,37 @@ +site_name: atomref +site_url: https://delonecommons.github.io/atomref/ +repo_url: https://github.com/DeloneCommons/atomref +repo_name: DeloneCommons/atomref + +theme: + name: material + +plugins: + - search + - mkdocstrings: + handlers: + python: + options: + show_root_heading: true + show_source: false + +nav: + - Home: index.md + - Guide: + - Install: guide/install.md + - Quickstart: guide/quickstart.md + - Policies: guide/policies.md + - Custom sets: guide/custom_sets.md + - Non-goals: guide/non_goals.md + - Datasets: + - Overview: datasets/index.md + - Covalent radius: datasets/covalent_radius.md + - van der Waals radius: datasets/van_der_waals_radius.md + - Atomic radius: datasets/atomic_radius.md + - Development: + - Architecture: dev/architecture.md + - Data curation: dev/data_curation.md + - Development plan: dev/dev_plan.md + - API: + - Overview: api/index.md + - atomref: api/atomref.md diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ea2b569 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,94 @@ +[build-system] +requires = ["hatchling>=1.24"] +build-backend = "hatchling.build" + +[project] +name = "atomref" +dynamic = ["version"] +description = "Curated atomic reference data and transfer policies for geometry and structure-analysis algorithms." +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } +authors = [ + { name = "Ivan Yu. Chernyshov", email = "ivan.chernyshoff@gmail.com" } +] +keywords = ["chemistry", "materials", "crystallography", "reference data", "atomic radii"] +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Software Development :: Libraries", + "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: OS Independent", +] +dependencies = [] + +[project.urls] +Homepage = "https://delonecommons.github.io/atomref/" +Documentation = "https://delonecommons.github.io/atomref/" +Repository = "https://github.com/DeloneCommons/atomref" +Issues = "https://github.com/DeloneCommons/atomref/issues" + +[project.optional-dependencies] +test = [ + "pytest>=7", + "tomli>=2; python_version < '3.11'", +] +docs = [ + "mkdocs>=1.6,<2", + "mkdocs-material>=9.5", + "mkdocstrings[python]>=0.25", + "mkdocs-include-markdown-plugin>=6.2", + "pymdown-extensions>=10.0", + "tomli>=2; python_version < '3.11'", +] +dev = [ + "build>=1.2", + "twine>=5", + "flake8>=7", +] + +[tool.hatch.version] +path = "src/atomref/__about__.py" + +[tool.hatch.build.targets.wheel] +packages = ["src/atomref"] +include = [ + "src/atomref/data/*.csv", + "src/atomref/data/*.json", +] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", + "/docs", + "/tools", + "/mkdocs.yml", + "/README.md", + "/CHANGELOG.md", + "/DEV_PLAN.md", + "/NOTICE.md", + "/LICENSE", + "/COPYING", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-ra --ignore=build --ignore=dist" +norecursedirs = [ + ".git", + ".pytest_cache", + "__pycache__", + ".venv", + ".tox", + "dist", + ".eggs", + "*.egg-info", +] 
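Because the wheel target above explicitly includes the packaged CSV/JSON data files, a quick post-install sanity check is to confirm the tables are reachable via `importlib.resources`. This is only a sketch; it mirrors the packaged-data test added later in this patch.

```python
from importlib import resources

# The wheel build includes src/atomref/data/*.csv and *.json; after an install,
# the packaged tables should be importable as package resources.
data = resources.files("atomref.data")
for name in ("periodic_table.csv", "covalent.csv", "van_der_waals.csv", "registry.json"):
    print(name, data.joinpath(name).is_file())
```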
diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py new file mode 100644 index 0000000..44cdb9a --- /dev/null +++ b/src/atomref/__about__.py @@ -0,0 +1 @@ +__version__ = '0.1.0a0' diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py new file mode 100644 index 0000000..fd07068 --- /dev/null +++ b/src/atomref/__init__.py @@ -0,0 +1,60 @@ +from .__about__ import __version__ +from .elements import Element, canonicalize_element_symbol, get_element, iter_elements, is_valid_element_symbol +from .policy import LookupResult, ValuePolicy +from .radii import ( + DEFAULT_COVALENT_POLICY, + DEFAULT_VDW_POLICY, + RadiiElementAssessment, + RadiiPolicy, + RadiiPolicyAssessment, + assess_radii_policy, + get_covalent_radius, + get_radii_set_info, + get_vdw_radius, + list_radii_sets, + lookup_covalent_radius, + lookup_vdw_radius, +) +from .registry import ( + CoverageInfo, + DatasetInfo, + DatasetRef, + ElementScalarSet, + Reference, + get_dataset_info, + list_dataset_ids, +) +from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer + +__all__ = [ + '__version__', + 'Element', + 'canonicalize_element_symbol', + 'get_element', + 'iter_elements', + 'is_valid_element_symbol', + 'CoverageInfo', + 'DatasetInfo', + 'DatasetRef', + 'ElementScalarSet', + 'Reference', + 'get_dataset_info', + 'list_dataset_ids', + 'LinearFit', + 'LinearTransfer', + 'SubstitutionTransfer', + 'LookupResult', + 'ValuePolicy', + 'RadiiPolicy', + 'RadiiElementAssessment', + 'RadiiPolicyAssessment', + 'DEFAULT_COVALENT_POLICY', + 'DEFAULT_VDW_POLICY', + 'list_radii_sets', + 'get_radii_set_info', + 'lookup_covalent_radius', + 'get_covalent_radius', + 'lookup_vdw_radius', + 'get_vdw_radius', + 'assess_radii_policy', +] diff --git a/src/atomref/data/__init__.py b/src/atomref/data/__init__.py new file mode 100644 index 0000000..835d4e0 --- /dev/null +++ b/src/atomref/data/__init__.py @@ -0,0 +1 @@ +"""Packaged data files for atomref.""" diff --git a/src/atomref/data/covalent.csv b/src/atomref/data/covalent.csv new file mode 100644 index 0000000..053a71a --- /dev/null +++ b/src/atomref/data/covalent.csv @@ -0,0 +1,119 @@ +z,cordero2008,csd_legacy_cov +1,0.31,0.23 +2,0.28,1.5 +3,1.28,1.28 +4,0.96,0.96 +5,0.84,0.83 +6,0.76,0.68 +7,0.71,0.68 +8,0.66,0.68 +9,0.57,0.64 +10,0.58,1.5 +11,1.66,1.66 +12,1.41,1.41 +13,1.21,1.21 +14,1.11,1.2 +15,1.07,1.05 +16,1.05,1.02 +17,1.02,0.99 +18,1.06,1.51 +19,2.03,2.03 +20,1.76,1.76 +21,1.7,1.7 +22,1.6,1.6 +23,1.53,1.53 +24,1.39,1.39 +25,1.61,1.61 +26,1.52,1.52 +27,1.5,1.26 +28,1.24,1.24 +29,1.32,1.32 +30,1.22,1.22 +31,1.22,1.22 +32,1.2,1.17 +33,1.19,1.21 +34,1.2,1.22 +35,1.2,1.21 +36,1.16,1.5 +37,2.2,2.2 +38,1.95,1.95 +39,1.9,1.9 +40,1.75,1.75 +41,1.64,1.64 +42,1.54,1.54 +43,1.47,1.47 +44,1.46,1.46 +45,1.42,1.42 +46,1.39,1.39 +47,1.45,1.45 +48,1.44,1.54 +49,1.42,1.42 +50,1.39,1.39 +51,1.39,1.39 +52,1.38,1.47 +53,1.39,1.4 +54,1.4,1.5 +55,2.44,2.44 +56,2.15,2.15 +57,2.07,2.07 +58,2.04,2.04 +59,2.03,2.03 +60,2.01,2.01 +61,1.99,1.99 +62,1.98,1.98 +63,1.98,1.98 +64,1.96,1.96 +65,1.94,1.94 +66,1.92,1.92 +67,1.92,1.92 +68,1.89,1.89 +69,1.9,1.9 +70,1.87,1.87 +71,1.87,1.87 +72,1.75,1.75 +73,1.7,1.7 +74,1.62,1.62 +75,1.51,1.51 +76,1.44,1.44 +77,1.41,1.41 +78,1.36,1.36 +79,1.36,1.36 +80,1.32,1.32 +81,1.45,1.45 +82,1.46,1.46 +83,1.48,1.48 +84,1.4,1.4 +85,1.5,1.21 +86,1.5,1.5 +87,2.6,2.6 +88,2.21,2.21 +89,2.15,2.15 +90,2.06,2.06 +91,2,2 +92,1.96,1.96 +93,1.9,1.9 +94,1.87,1.87 +95,1.8,1.8 +96,1.69,1.69 +97,,1.54 +98,,1.83 +99,,1.5 +100,,1.5 +101,,1.5 +102,,1.5 +103,,1.5 +104,,1.5 +105,,1.5 
+106,,1.5 +107,,1.5 +108,,1.5 +109,,1.5 +110,,1.5 +111,, +112,, +113,, +114,, +115,, +116,, +117,, +118,, diff --git a/src/atomref/data/periodic_table.csv b/src/atomref/data/periodic_table.csv new file mode 100644 index 0000000..744b4aa --- /dev/null +++ b/src/atomref/data/periodic_table.csv @@ -0,0 +1,119 @@ +z,symbol,name +1,H,Hydrogen +2,He,Helium +3,Li,Lithium +4,Be,Beryllium +5,B,Boron +6,C,Carbon +7,N,Nitrogen +8,O,Oxygen +9,F,Fluorine +10,Ne,Neon +11,Na,Sodium +12,Mg,Magnesium +13,Al,Aluminium +14,Si,Silicon +15,P,Phosphorus +16,S,Sulphur +17,Cl,Chlorine +18,Ar,Argon +19,K,Potassium +20,Ca,Calcium +21,Sc,Scandium +22,Ti,Titanium +23,V,Vanadium +24,Cr,Chromium +25,Mn,Manganese +26,Fe,Iron +27,Co,Cobalt +28,Ni,Nickel +29,Cu,Copper +30,Zn,Zinc +31,Ga,Gallium +32,Ge,Germanium +33,As,Arsenic +34,Se,Selenium +35,Br,Bromine +36,Kr,Krypton +37,Rb,Rubidium +38,Sr,Strontium +39,Y,Yttrium +40,Zr,Zirconium +41,Nb,Niobium +42,Mo,Molybdenum +43,Tc,Technetium +44,Ru,Ruthenium +45,Rh,Rhodium +46,Pd,Palladium +47,Ag,Silver +48,Cd,Cadmium +49,In,Indium +50,Sn,Tin +51,Sb,Antimony +52,Te,Tellurium +53,I,Iodine +54,Xe,Xenon +55,Cs,Caesium +56,Ba,Barium +57,La,Lanthanum +58,Ce,Cerium +59,Pr,Praseodymium +60,Nd,Neodymium +61,Pm,Promethium +62,Sm,Samarium +63,Eu,Europium +64,Gd,Gadolinium +65,Tb,Terbium +66,Dy,Dysprosium +67,Ho,Holmium +68,Er,Erbium +69,Tm,Thulium +70,Yb,Ytterbium +71,Lu,Lutetium +72,Hf,Hafnium +73,Ta,Tantalum +74,W,Tungsten +75,Re,Rhenium +76,Os,Osmium +77,Ir,Iridium +78,Pt,Platinum +79,Au,Gold +80,Hg,Mercury +81,Tl,Thallium +82,Pb,Lead +83,Bi,Bismuth +84,Po,Polonium +85,At,Astatine +86,Rn,Radon +87,Fr,Francium +88,Ra,Radium +89,Ac,Actinium +90,Th,Thorium +91,Pa,Protactinium +92,U,Uranium +93,Np,Neptunium +94,Pu,Plutonium +95,Am,Americium +96,Cm,Curium +97,Bk,Berkelium +98,Cf,Californium +99,Es,Einsteinium +100,Fm,Fermium +101,Md,Mendelevium +102,No,Nobelium +103,Lr,Lawrencium +104,Rf,Rutherfordium +105,Db,Dubnium +106,Sg,Seaborgium +107,Bh,Bohrium +108,Hs,Hassium +109,Mt,Meitnerium +110,Ds,Darmstadtium +111,Rg,Roentgenium +112,Cn,Copernicium +113,Nh,Nihonium +114,Fl,Flerovium +115,Mc,Moscovium +116,Lv,Livermorium +117,Ts,Tennessine +118,Og,Oganesson diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json new file mode 100644 index 0000000..2577ab7 --- /dev/null +++ b/src/atomref/data/registry.json @@ -0,0 +1,434 @@ +{ + "schema_version": "0.1", + "created_from": { + "source_project": "molcryst", + "source_schema_version": "0.2", + "notes": [ + "Transformed for the initial atomref v0.1 scaffold.", + "Rahm 2016 is reclassified from van_der_waals to atomic_radius." + ] + }, + "quantities": { + "covalent_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed covalent radii intended for geometry and bonding heuristics." + }, + "van_der_waals_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed condensed-phase or contact-derived van der Waals radii." + }, + "atomic_radius": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data." + } + }, + "datasets": { + "covalent_radius": { + "cordero2008": { + "name": "Cordero et al. covalent radii", + "description": "Covalent radii from Cordero et al. 
(2008) (last author: Alvarez).", + "semantic_class": "covalent_structural", + "origin_class": "compiled_experimental", + "phase_context": "condensed_phase", + "method_summary": "Derived from crystallographic bond distances (primarily single bonds) across the periodic table.", + "storage": { + "format": "dense_by_z_csv", + "filename": "covalent.csv", + "column": "cordero2008" + }, + "coverage": { + "n_values": 96, + "z_min": 1, + "z_max": 96, + "has_placeholders": false + }, + "placeholder_value": null, + "extraction_source": "Table 2 in B. Cordero et al. (2008), column 'r'", + "aliases": [ + "Cordero covalent radii", + "Cordero–Alvarez covalent radii", + "Alvarez covalent radii (2008)" + ], + "references": [ + { + "authors": "B. Cordero et al.", + "doi": "10.1039/B801115J", + "title": "Covalent radii revisited", + "venue": "Dalton Trans. (2008) 2832-2838" + } + ], + "notes": [ + "The source paper provides multiple radii per element for different atom types/environments; this package currently includes C(sp3) value for C and high-spin values for Mn/Fe/Co." + ] + }, + "csd_legacy_cov": { + "name": "CSD legacy covalent radii (bond perception)", + "description": "Legacy covalent radii used in CSD software for bond assignment (Rcov).", + "semantic_class": "covalent_legacy", + "origin_class": "curated_heuristic", + "phase_context": "mixed_or_legacy", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "covalent.csv", + "column": "csd_legacy_cov" + }, + "coverage": { + "n_values": 110, + "z_min": 1, + "z_max": 110, + "has_placeholders": true + }, + "placeholder_value": 1.5, + "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'Covalent Radius'.", + "aliases": [], + "references": [ + { + "publisher": "Cambridge Crystallographic Data Centre (CCDC)", + "title": "Elemental Data and Radii (Excel)", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx" + }, + { + "authors": "B. Cordero et al.", + "doi": "10.1039/B801115J", + "title": "Covalent radii revisited", + "venue": "Dalton Trans. (2008) 2832-2838" + } + ], + "notes": [ + "CSD bond assignment heuristic: a bond A-B may be inferred if distance d satisfies Rcov(A)+Rcov(B)-t <= d <= Rcov(A)+Rcov(B)+t, with typical t=0.4 Å. (See the CCDC spreadsheet notes.)", + "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", + "Elements not yet encountered in the CSD have Rcov = 1.50 Å." + ] + } + }, + "van_der_waals_radius": { + "bondi1964": { + "name": "Bondi van der Waals radii", + "description": "Classic van der Waals radii compiled by Bondi (1964), available for 38 elements.", + "semantic_class": "vdw_compiled", + "origin_class": "compiled_experimental", + "phase_context": "mixed_or_legacy", + "method_summary": "Bondi compiled van der Waals radii from a combination of experimental sources (e.g., crystal structures, liquid-state properties, gas kinetic data) to reproduce molecular/atomic volumes and sizes. 
This set is widely used as a historical reference and in many computational chemistry defaults.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "bondi1964" + }, + "coverage": { + "n_values": 38, + "z_min": 1, + "z_max": 92, + "has_placeholders": false, + "covered_z": [ + 1, + 2, + 3, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 14, + 15, + 16, + 17, + 18, + 19, + 28, + 29, + 30, + 31, + 33, + 34, + 35, + 36, + 46, + 47, + 48, + 49, + 50, + 52, + 53, + 54, + 78, + 79, + 80, + 81, + 82, + 92 + ] + }, + "placeholder_value": null, + "extraction_source": "Bondi column in Table 1 of Alvarez (2013) (used as a convenient transcription of Bondi's tabulation).", + "aliases": [ + "Bondi radii", + "Bondi vdW radii" + ], + "references": [ + { + "authors": "A. Bondi", + "title": "van der Waals Volumes and Radii", + "venue": "J. Phys. Chem. 68 (1964) 441-451", + "doi": "10.1021/j100785a001" + }, + { + "authors": "S. Alvarez", + "title": "A cartography of the van der Waals territories", + "venue": "Dalton Trans. 42 (2013) 8617-8636", + "doi": "10.1039/C3DT50599E", + "note": "Table 1 reproduces Bondi radii for 38 elements." + } + ], + "notes": [ + "Coverage is limited (38 elements, including only a few transition metals and uranium).", + "Because Bondi radii were not derived exclusively from crystal nonbonded contact statistics, they can differ slightly from later 'structural' vdW radii." + ] + }, + "rowland_taylor1996": { + "name": "Rowland & Taylor nonbonded contact radii", + "description": "Nonbonded contact radii derived from organic crystal structures (Rowland & Taylor, 1996).", + "semantic_class": "vdw_structural", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": "Rowland & Taylor analyzed distributions of intermolecular nonbonded contact distances in organic crystal structures from the Cambridge Structural Database (CSD). They fitted/estimated characteristic contact distances and solved for per-element radii by least-squares analysis over many element-pair distance distributions.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "rowland_taylor1996" + }, + "coverage": { + "n_values": 9, + "z_min": 1, + "z_max": 53, + "has_placeholders": false, + "covered_z": [ + 1, + 6, + 7, + 8, + 9, + 16, + 17, + 35, + 53 + ] + }, + "placeholder_value": null, + "extraction_source": "Table 3 in Rowland & Taylor (1996), column 'r_c' (least-squares radii, not the normalized R_d column).", + "aliases": [ + "Rowland–Taylor radii", + "Rowland & Taylor vdW radii" + ], + "references": [ + { + "authors": "R. S. Rowland; R. Taylor", + "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii", + "venue": "J. Phys. Chem. 100 (1996) 7384-7391", + "doi": "10.1021/jp953141+" + } + ], + "notes": [ + "Coverage is intentionally limited to common organic-crystal nonmetals (H, C, N, O, F, S, Cl, Br, I).", + "Rowland & Taylor also report a normalized set (R_d) constrained to match the total of Bondi radii; this package uses the raw least-squares r_c values." 
+ ] + }, + "alvarez2013": { + "name": "Alvarez van der Waals radii", + "description": "van der Waals radii from Alvarez (2013).", + "semantic_class": "vdw_structural", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "alvarez2013" + }, + "coverage": { + "n_values": 93, + "z_min": 1, + "z_max": 99, + "has_placeholders": false, + "missing_z": [ + 61, + 84, + 85, + 86, + 87, + 88 + ] + }, + "placeholder_value": null, + "extraction_source": "Table 1 in Alvarez (2013), column 'r_vdW'.", + "aliases": [ + "Alvarez vdW radii", + "Alvarez (2013) r_vdW", + "Dalton Trans. vdW cartography radii" + ], + "references": [ + { + "authors": "S. Alvarez", + "doi": "10.1039/C3DT50599E", + "title": "A cartography of the van der Waals territories", + "venue": "Dalton Trans. 42 (2013) 8617-8636" + } + ], + "notes": [ + "Obtained by statistical analysis of millions of interatomic distances in the Cambridge Structural Database (CSD), locating the vdW peak after the vdW gap." + ] + }, + "chernyshov2020": { + "name": "Chernyshov LoS van der Waals radii", + "description": "van der Waals radii from Chernyshov et al. (ChemPhysChem 2020) using line-of-sight (LoS) classification of direct contacts.", + "semantic_class": "vdw_structural_typed_reduced", + "origin_class": "structural", + "phase_context": "condensed_phase", + "method_summary": "Chernyshov et al. introduce a line-of-sight (LoS) criterion to identify 'direct' interatomic contacts in complex molecular crystals. vdW radii are then inferred from statistically analyzed contact-distance distributions for specific atom types, yielding radii (including R_half and R_max variants) intended to better reflect steric/anistropic effects than simple distance-based heuristics.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "chernyshov2020" + }, + "coverage": { + "n_values": 10, + "z_min": 1, + "z_max": 53, + "has_placeholders": false, + "covered_z": [ + 1, + 6, + 7, + 8, + 9, + 16, + 17, + 34, + 35, + 53 + ] + }, + "placeholder_value": null, + "extraction_source": "Table 1 in Chernyshov et al. (2020): R_max values for the 'default' atom types typical for organic compounds.", + "aliases": [ + "LoS vdW radii", + "Chernyshov vdW radii" + ], + "references": [ + { + "authors": "I. Yu. Chernyshov; I. V. Ananyev; E. A. Pidko", + "title": "Revisiting van der Waals Radii: From Comprehensive Structural Analysis to Knowledge-Based Classification of Interatomic Contacts", + "venue": "ChemPhysChem 21 (2020) 1–8", + "doi": "10.1002/cphc.201901083" + } + ], + "notes": [ + "The source paper provides multiple radii per element for different atom types/environments; this package currently includes only the main/default R_max values used in Table 1.", + "Primarily targeted at elements common in organic crystals (H, C, N, O, F, S, Cl, Se, Br, I)." 
+ ] + }, + "csd_legacy_vdw": { + "name": "CSD legacy van der Waals radii (pre-2024.3)", + "description": "Legacy van der Waals radii historically used in CSD tools (pre-2024.3).", + "semantic_class": "vdw_legacy", + "origin_class": "curated_heuristic", + "phase_context": "mixed_or_legacy", + "method_summary": null, + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "csd_legacy_vdw" + }, + "coverage": { + "n_values": 110, + "z_min": 1, + "z_max": 110, + "has_placeholders": true + }, + "placeholder_value": 2.0, + "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'vdW Radius' (Bondi/Rowland-Taylor based with defaults).", + "aliases": [], + "references": [ + { + "authors": "A. Bondi", + "doi": "10.1021/j100785a001", + "title": "van der Waals Volumes and Radii", + "venue": "J. Phys. Chem. 68 (1964) 441-451" + }, + { + "authors": "R. S. Rowland; R. Taylor", + "doi": "10.1021/jp953141+", + "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii", + "venue": "J. Phys. Chem. 100 (1996) 7384-7391" + }, + { + "publisher": "CCDC", + "title": "Elemental Data and Radii (Excel)", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx" + }, + { + "publisher": "CCDC blog", + "title": "Updates to van der Waals radii used in the CSD and Mercury", + "url": "https://www.ccdc.cam.ac.uk/discover/blog/updates-to-van-der-waals-radii-csd-mercury/" + } + ], + "notes": [ + "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", + "Radii that are not available in either Bondi or Rowland & Taylor versions were assigned RvdW of 2.00 Å.", + "The CSD 2024.3 release updated the vdW radii used in CSD and Mercury to Alvarez-derived values (see CCDC blog post)." + ] + } + }, + "atomic_radius": { + "rahm2016": { + "name": "Rahm isodensity atomic radii (ρ=0.001 e/bohr³)", + "description": "Computed atomic radii for neutral atoms (elements 1–96) defined by the ρ=0.001 e/bohr³ electron-density isosurface (Rahm et al., 2016).", + "semantic_class": "atomic_isodensity", + "origin_class": "computational", + "phase_context": "isolated_atom", + "method_summary": "Rahm et al. computed relativistic all-electron DFT electron densities (close to the basis-set limit) for isolated atoms and ions. Radii are defined by an electron-density threshold, producing a consistent, theory-based size measure that correlates well with structural van der Waals radii derived from crystal structures.", + "storage": { + "format": "dense_by_z_csv", + "filename": "van_der_waals.csv", + "column": "rahm2016" + }, + "coverage": { + "n_values": 96, + "z_min": 1, + "z_max": 96, + "has_placeholders": false + }, + "placeholder_value": null, + "extraction_source": "Supporting Information for Rahm et al. (2016), Table S1: neutral-atom radii for elements 1–96.", + "aliases": [ + "Rahm radii", + "Rahm–Hoffmann–Ashcroft atomic radii", + "0.001 e/bohr^3 radii" + ], + "references": [ + { + "authors": "M. Rahm; R. Hoffmann; N. W. Ashcroft", + "title": "Atomic and Ionic Radii of Elements 1–96", + "venue": "Chem. Eur. J. 22 (2016) 14625–14632", + "doi": "10.1002/chem.201602949" + }, + { + "title": "Chem. Eur. J. 2016, 22, 14625–14632 (Rahm et al.) – Misc. 
Information", + "url": "http://dx.doi.org/10.1002/chem.201602949", + "publisher": "Supporting Information", + "note": "Table S1 contains the neutral-atom radii used here." + } + ], + "notes": [ + "The original work also reports cationic radii (+1) for the first 96 elements and selected anionic radii (−1) for some elements; these are not yet included in the current CSV.", + "Despite the fact that in this project this radii are classified as vdW radii for the purpose of simplicity, they should be treated as a correlational/transferable baseline rather than a direct condensed-phase vdW radius since they describe isolated atoms in vacuum." + ] + } + } + } +} diff --git a/src/atomref/data/van_der_waals.csv b/src/atomref/data/van_der_waals.csv new file mode 100644 index 0000000..86e7be3 --- /dev/null +++ b/src/atomref/data/van_der_waals.csv @@ -0,0 +1,119 @@ +z,bondi1964,rowland_taylor1996,alvarez2013,chernyshov2020,csd_legacy_vdw,rahm2016 +1,1.2,1.1,1.2,1.21,1.09,1.54 +2,1.4,,1.43,,1.4,1.34 +3,1.81,,2.12,,1.82,2.2 +4,,,1.98,,2,2.19 +5,,,1.91,,2,2.05 +6,1.7,1.77,1.77,1.91,1.7,1.9 +7,1.55,1.64,1.66,1.76,1.55,1.79 +8,1.52,1.58,1.5,1.74,1.52,1.71 +9,1.47,1.46,1.46,1.55,1.47,1.63 +10,1.54,,1.58,,1.54,1.56 +11,2.27,,2.5,,2.27,2.25 +12,1.73,,2.51,,1.73,2.4 +13,,,2.25,,2,2.39 +14,2.22,,2.19,,2.1,2.32 +15,1.8,,1.9,,1.8,2.23 +16,1.8,1.81,1.89,1.95,1.8,2.14 +17,1.75,1.76,1.82,1.91,1.75,2.06 +18,1.76,,1.83,,1.88,1.97 +19,2.75,,2.73,,2.75,2.34 +20,,,2.62,,2,2.7 +21,,,2.58,,2,2.63 +22,,,2.46,,2,2.57 +23,,,2.42,,2,2.52 +24,,,2.45,,2,2.33 +25,,,2.45,,2,2.42 +26,,,2.44,,2,2.26 +27,,,2.4,,2,2.22 +28,1.63,,2.4,,1.63,2.19 +29,1.4,,2.38,,1.4,2.17 +30,1.39,,2.39,,1.39,2.22 +31,1.87,,2.32,,1.87,2.33 +32,,,2.29,,2,2.34 +33,1.85,,1.88,,1.85,2.31 +34,1.9,,1.82,2.04,1.9,2.24 +35,1.83,1.87,1.86,2,1.85,2.19 +36,2.02,,2.25,,2.02,2.12 +37,,,3.21,,2,2.4 +38,,,2.84,,2,2.79 +39,,,2.75,,2,2.74 +40,,,2.52,,2,2.68 +41,,,2.56,,2,2.51 +42,,,2.45,,2,2.44 +43,,,2.44,,2,2.41 +44,,,2.46,,2,2.37 +45,,,2.44,,2,2.33 +46,1.63,,2.15,,1.63,2.15 +47,1.72,,2.53,,1.72,2.25 +48,1.62,,2.49,,1.58,2.38 +49,1.93,,2.43,,1.93,2.46 +50,2.17,,2.42,,2.17,2.48 +51,,,2.47,,2,2.46 +52,2,,1.99,,2.06,2.42 +53,1.98,2.03,2.04,2.17,1.98,2.38 +54,2.16,,2.06,,2.16,2.32 +55,,,3.48,,2,2.49 +56,,,3.03,,2,2.93 +57,,,2.98,,2,2.84 +58,,,2.88,,2,2.82 +59,,,2.92,,2,2.86 +60,,,2.95,,2,2.84 +61,,,,,2,2.83 +62,,,2.9,,2,2.8 +63,,,2.87,,2,2.8 +64,,,2.83,,2,2.77 +65,,,2.79,,2,2.76 +66,,,2.87,,2,2.75 +67,,,2.81,,2,2.73 +68,,,2.83,,2,2.72 +69,,,2.79,,2,2.71 +70,,,2.8,,2,2.77 +71,,,2.74,,2,2.7 +72,,,2.63,,2,2.64 +73,,,2.53,,2,2.58 +74,,,2.57,,2,2.53 +75,,,2.49,,2,2.49 +76,,,2.48,,2,2.44 +77,,,2.41,,2,2.33 +78,1.72,,2.29,,1.72,2.3 +79,1.66,,2.32,,1.66,2.26 +80,1.7,,2.45,,1.55,2.29 +81,1.96,,2.47,,1.96,2.42 +82,2.02,,2.6,,2.02,2.49 +83,,,2.54,,2,2.5 +84,,,,,2,2.5 +85,,,,,2,2.47 +86,,,,,2,2.43 +87,,,,,2,2.58 +88,,,,,2,2.92 +89,,,2.8,,2,2.93 +90,,,2.93,,2,2.89 +91,,,2.88,,2,2.85 +92,1.86,,2.71,,1.86,2.83 +93,,,2.82,,2,2.8 +94,,,2.81,,2,2.78 +95,,,2.83,,2,2.76 +96,,,3.05,,2,2.76 +97,,,3.4,,2, +98,,,3.05,,2, +99,,,2.7,,2, +100,,,,,2, +101,,,,,2, +102,,,,,2, +103,,,,,2, +104,,,,,2, +105,,,,,2, +106,,,,,2, +107,,,,,2, +108,,,,,2, +109,,,,,2, +110,,,,,2, +111,,,,,, +112,,,,,, +113,,,,,, +114,,,,,, +115,,,,,, +116,,,,,, +117,,,,,, +118,,,,,, diff --git a/src/atomref/elements.py b/src/atomref/elements.py new file mode 100644 index 0000000..42f0598 --- /dev/null +++ b/src/atomref/elements.py @@ -0,0 +1,99 @@ +"""Periodic table access for stable element identity.""" + +from __future__ import 
annotations + +import csv +import re +from dataclasses import dataclass +from functools import lru_cache +from importlib import resources + + +_MISSING_TOKENS = {'', '?', '.'} +_LEADING_ALPHA_RE = re.compile(r'([A-Za-z]{1,3})') + + +@dataclass(frozen=True, slots=True) +class Element: + """Chemical element identity.""" + + z: int + symbol: str + name: str + + +def _normalize_element_token(token: str | None) -> str | None: + if token is None: + return None + + raw = token.strip() + if raw in _MISSING_TOKENS: + return None + + if (raw.startswith("'") and raw.endswith("'")) or ( + raw.startswith('"') and raw.endswith('"') + ): + raw = raw[1:-1].strip() + if raw in _MISSING_TOKENS: + return None + + if not raw: + return None + return raw + + +def canonicalize_element_symbol(token: str | None) -> str | None: + """Canonicalize a free-form element token.""" + + raw = _normalize_element_token(token) + if raw is None: + return None + + match = _LEADING_ALPHA_RE.match(raw) + if match is None: + return None + + letters = match.group(1) + return letters[0].upper() + letters[1:].lower() + + +@lru_cache(maxsize=1) +def _load_elements_by_symbol() -> dict[str, Element]: + table_path = resources.files('atomref.data').joinpath('periodic_table.csv') + with table_path.open('r', encoding='utf-8', newline='') as handle: + reader = csv.DictReader(handle) + out: dict[str, Element] = {} + for row in reader: + z = int(row['z']) + symbol = row['symbol'] + name = row['name'] + out[symbol] = Element(z=z, symbol=symbol, name=name) + return out + + +@lru_cache(maxsize=1) +def _elements_in_z_order() -> tuple[Element, ...]: + return tuple(sorted(_load_elements_by_symbol().values(), key=lambda e: e.z)) + + +def is_valid_element_symbol(symbol: str | None) -> bool: + """Return ``True`` if ``symbol`` is a known element symbol.""" + + if symbol is None: + return False + return symbol in _load_elements_by_symbol() + + +def get_element(symbol: str | None) -> Element | None: + """Look up element identity by symbol or free-form token.""" + + sym = canonicalize_element_symbol(symbol) + if sym is None: + return None + return _load_elements_by_symbol().get(sym) + + +def iter_elements() -> tuple[Element, ...]: + """Return all packaged elements in increasing atomic-number order.""" + + return _elements_in_z_order() diff --git a/src/atomref/errors.py b/src/atomref/errors.py new file mode 100644 index 0000000..1922cf5 --- /dev/null +++ b/src/atomref/errors.py @@ -0,0 +1,14 @@ +class AtomrefError(Exception): + """Base package error.""" + + +class DatasetError(AtomrefError): + """Packaged dataset or registry error.""" + + +class MissingValueError(AtomrefError): + """Raised when a required reference value is unavailable.""" + + +class PolicyError(AtomrefError): + """Raised for invalid policy configuration.""" diff --git a/src/atomref/policy.py b/src/atomref/policy.py new file mode 100644 index 0000000..b7df87b --- /dev/null +++ b/src/atomref/policy.py @@ -0,0 +1,261 @@ +"""Generic value-policy resolution for element-indexed scalar datasets.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from functools import lru_cache +import math +from typing import Generic, Literal, TypeVar + +from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol +from .errors import PolicyError +from .registry import ( + DatasetLike, + DatasetRef, + ElementScalarSet, + _is_placeholder_value, + get_builtin_set, + resolve_dataset_like, +) +from .transfer import 
LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel + + +K = TypeVar('K') + +LookupSource = Literal[ + 'override', + 'base', + 'transfer_substitution', + 'transfer_linear', + 'fallback', + 'missing', +] + + +@dataclass(frozen=True, slots=True) +class LookupResult: + value: float | None + source: LookupSource + target: DatasetRef + resolved_from: tuple[DatasetRef, ...] = () + is_placeholder: bool = False + fit: LinearFit | None = None + notes: tuple[str, ...] = () + + def __float__(self) -> float: + if self.value is None: + raise TypeError('reference value is missing') + return float(self.value) + + +@dataclass(frozen=True, slots=True) +class ValuePolicy(Generic[K]): + base: DatasetLike + transfers: tuple[TransferModel, ...] = () + overrides: Mapping[K, float] = field(default_factory=dict) + fallback: float | None = None + + +def _normalize_element_symbol(symbol: str | None) -> str | None: + cand = canonicalize_element_symbol(symbol) + if cand in {'D', 'T'}: + cand = 'H' + if cand is None: + return None + if not is_valid_element_symbol(cand): + return None + return cand + + +def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: + return resolve_dataset_like(policy.base).ref + + +def _fit_linear_transfer(base_set: ElementScalarSet, predictor_set: ElementScalarSet, *, min_points: int, exclude_placeholders: bool) -> LinearFit: + xs: list[float] = [] + ys: list[float] = [] + + n_z = min(len(base_set.values_by_z), len(predictor_set.values_by_z)) + for z in range(1, n_z): + y = base_set.values_by_z[z] + x = predictor_set.values_by_z[z] + if y is None or x is None: + continue + y_f = float(y) + x_f = float(x) + if exclude_placeholders and ( + _is_placeholder_value(base_set.info, y_f) + or _is_placeholder_value(predictor_set.info, x_f) + ): + continue + xs.append(x_f) + ys.append(y_f) + + n = len(xs) + if n < min_points: + raise PolicyError('not enough overlapping elements to fit linear transfer') + + x_mean = sum(xs) / n + y_mean = sum(ys) / n + sxx = sum((x - x_mean) ** 2 for x in xs) + if sxx == 0: + raise PolicyError('cannot fit linear transfer: zero predictor variance') + + sxy = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, ys)) + slope = sxy / sxx + intercept = y_mean - slope * x_mean + + y_hat = [slope * x + intercept for x in xs] + sse = sum((y - yh) ** 2 for y, yh in zip(ys, y_hat)) + sst = sum((y - y_mean) ** 2 for y in ys) + r2 = 1.0 - sse / sst if sst != 0 else 1.0 + rmse = math.sqrt(sse / n) + + return LinearFit( + coefficients=(slope,), + intercept=intercept, + n_points=n, + r2=r2, + rmse=rmse, + ) + + +@lru_cache(maxsize=None) +def _fit_linear_transfer_cached(base_ref: DatasetRef, predictor_ref: DatasetRef, min_points: int, exclude_placeholders: bool) -> LinearFit: + return _fit_linear_transfer( + get_builtin_set(base_ref), + get_builtin_set(predictor_ref), + min_points=min_points, + exclude_placeholders=exclude_placeholders, + ) + + +def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None: + if not isinstance(transfer, LinearTransfer): + return None + if len(transfer.predictors) != 1: + raise PolicyError('v0.1 LinearTransfer supports exactly one predictor dataset') + + predictor = transfer.predictors[0] + if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef): + return _fit_linear_transfer_cached( + base, predictor, transfer.min_points, transfer.exclude_placeholders + ) + return _fit_linear_transfer( + resolve_dataset_like(base), + resolve_dataset_like(predictor), + min_points=transfer.min_points, + 
exclude_placeholders=transfer.exclude_placeholders, + ) + + +def _apply_substitution_transfer(symbol: str, *, target: DatasetRef, transfer: SubstitutionTransfer) -> tuple[LookupResult | None, str | None]: + source_set = resolve_dataset_like(transfer.source) + value = source_set.get(symbol) + if value is None: + return None, f'no substitution value in {source_set.ref.set_id}' + value_f = float(value) + return ( + LookupResult( + value=value_f, + source='transfer_substitution', + target=target, + resolved_from=(source_set.ref,), + is_placeholder=_is_placeholder_value(source_set.info, value_f), + notes=('missing in base set; substituted from transfer source',), + ), + None, + ) + + +def _apply_linear_transfer(symbol: str, *, base: DatasetLike, target: DatasetRef, transfer: LinearTransfer) -> tuple[LookupResult | None, str | None]: + if len(transfer.predictors) != 1: + raise PolicyError('v0.1 LinearTransfer supports exactly one predictor dataset') + + predictor_set = resolve_dataset_like(transfer.predictors[0]) + predictor_value = predictor_set.get(symbol) + if predictor_value is None: + return None, f'no predictor value in {predictor_set.ref.set_id}' + predictor_f = float(predictor_value) + + if transfer.exclude_placeholders and _is_placeholder_value(predictor_set.info, predictor_f): + return None, f'predictor value in {predictor_set.ref.set_id} is a placeholder' + + fit = _fit_transfer_model(base, transfer) + if fit is None: + return None, 'no fit available for linear transfer' + predicted = fit.coefficients[0] * predictor_f + fit.intercept + return ( + LookupResult( + value=float(predicted), + source='transfer_linear', + target=target, + resolved_from=(predictor_set.ref,), + is_placeholder=False, + fit=fit, + notes=('missing in base set; inferred via linear transfer',), + ), + None, + ) + + +def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: + target = _resolve_target_ref(policy) + base_set = resolve_dataset_like(policy.base) + if base_set.info.domain != 'element': + raise PolicyError('v0.1 resolver supports only element-domain datasets') + + sym = _normalize_element_symbol(symbol) + if sym is None: + note = 'unknown element' if symbol is not None else 'missing element symbol' + return LookupResult(value=None, source='missing', target=target, notes=(note,)) + + if sym in policy.overrides: + return LookupResult( + value=float(policy.overrides[sym]), + source='override', + target=target, + notes=('value supplied by policy override',), + ) + + base_value = base_set.get(sym) + if base_value is not None: + base_f = float(base_value) + return LookupResult( + value=base_f, + source='base', + target=target, + resolved_from=(base_set.ref,), + is_placeholder=_is_placeholder_value(base_set.info, base_f), + notes=(), + ) + + transfer_notes: list[str] = ['missing in base set'] + for transfer in policy.transfers: + if isinstance(transfer, SubstitutionTransfer): + result, note = _apply_substitution_transfer(sym, target=target, transfer=transfer) + elif isinstance(transfer, LinearTransfer): + result, note = _apply_linear_transfer(sym, base=policy.base, target=target, transfer=transfer) + else: # pragma: no cover - closed union today + raise PolicyError(f'unsupported transfer model: {type(transfer)!r}') + + if result is not None: + return result + if note: + transfer_notes.append(note) + + if policy.fallback is not None: + return LookupResult( + value=float(policy.fallback), + source='fallback', + target=target, + notes=tuple(transfer_notes + ['using fallback value']), + ) 
+ + return LookupResult( + value=None, + source='missing', + target=target, + notes=tuple(transfer_notes), + ) diff --git a/src/atomref/py.typed b/src/atomref/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/atomref/radii.py b/src/atomref/radii.py new file mode 100644 index 0000000..61cebda --- /dev/null +++ b/src/atomref/radii.py @@ -0,0 +1,233 @@ +"""Radii-specific public API built on the generic policy core.""" + +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass, field +from typing import Literal + +from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol +from .errors import PolicyError +from .policy import LookupResult, ValuePolicy, _fit_transfer_model, _resolve_value +from .registry import DatasetInfo, DatasetRef, ElementScalarSet, get_dataset_info, list_dataset_ids +from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel + + +RadiiKind = Literal['covalent', 'van_der_waals'] +RadiiSet = ElementScalarSet + + +_KIND_TO_QUANTITY = { + 'covalent': 'covalent_radius', + 'van_der_waals': 'van_der_waals_radius', +} + + +@dataclass(frozen=True, slots=True) +class RadiiPolicy: + kind: RadiiKind + base_set: str | RadiiSet + transfers: tuple[TransferModel, ...] = () + overrides: Mapping[str, float] = field(default_factory=dict) + fallback: float | None = None + + def as_value_policy(self) -> ValuePolicy[str]: + quantity = _quantity_for_kind(self.kind) + if isinstance(self.base_set, ElementScalarSet): + if self.base_set.ref.quantity != quantity: + raise PolicyError( + f'base_set quantity {self.base_set.ref.quantity!r} is incompatible with radii kind {self.kind!r}' + ) + base = self.base_set + else: + base = DatasetRef(quantity, self.base_set) + + normalized_overrides: dict[str, float] = {} + for key, value in self.overrides.items(): + sym = _normalize_radii_symbol(key) + if sym is None or not is_valid_element_symbol(sym): + raise PolicyError(f'invalid override element symbol: {key!r}') + normalized_overrides[sym] = float(value) + + return ValuePolicy( + base=base, + transfers=self.transfers, + overrides=normalized_overrides, + fallback=self.fallback, + ) + + +@dataclass(frozen=True, slots=True) +class RadiiElementAssessment: + symbol: str + lookup: LookupResult + + +@dataclass(frozen=True, slots=True) +class RadiiPolicyAssessment: + kind: RadiiKind + policy: RadiiPolicy + elements: tuple[str, ...] + + n_elements: int + n_override: int + n_base: int + n_transfer_substitution: int + n_transfer_linear: int + n_fallback: int + n_missing: int + n_placeholders: int + + missing_symbols: tuple[str, ...] + placeholder_symbols: tuple[str, ...] + + fits: tuple[LinearFit, ...] = () + warnings: tuple[str, ...] = () + per_element: tuple[RadiiElementAssessment, ...] 
= () + + +def _quantity_for_kind(kind: RadiiKind) -> str: + try: + return _KIND_TO_QUANTITY[kind] + except KeyError as exc: + raise PolicyError(f'unknown radii kind: {kind!r}') from exc + + +def _normalize_radii_symbol(symbol: str | None) -> str | None: + cand = canonicalize_element_symbol(symbol) + if cand in {'D', 'T'}: + cand = 'H' + return cand + + +def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: + symbols: set[str] = set() + for token in elements: + sym = _normalize_radii_symbol(token) + if sym is None: + raise ValueError('missing element symbol') + if not is_valid_element_symbol(sym): + raise ValueError(f'invalid element symbol: {sym!r}') + symbols.add(sym) + return tuple(sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0)) + + +def list_radii_sets(kind: RadiiKind) -> tuple[str, ...]: + return list_dataset_ids(_quantity_for_kind(kind)) + + +def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: + return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) + + +def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: + if policy.kind != expected: + raise PolicyError(f'expected a {expected!r} radii policy, got {policy.kind!r}') + + +def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: + return _resolve_value(symbol, policy=policy.as_value_policy()) + + +def lookup_covalent_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> LookupResult: + active = DEFAULT_COVALENT_POLICY if policy is None else policy + _validate_policy_kind(active, expected='covalent') + return _lookup_radius(symbol, policy=active) + + +def get_covalent_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> float | None: + return lookup_covalent_radius(symbol, policy=policy).value + + +def lookup_vdw_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> LookupResult: + active = DEFAULT_VDW_POLICY if policy is None else policy + _validate_policy_kind(active, expected='van_der_waals') + return _lookup_radius(symbol, policy=active) + + +def get_vdw_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> float | None: + return lookup_vdw_radius(symbol, policy=policy).value + + +def assess_radii_policy(elements: Iterable[str], *, policy: RadiiPolicy, detail: bool = False) -> RadiiPolicyAssessment: + elems = _normalize_assessment_elements(elements) + value_policy = policy.as_value_policy() + + n_override = 0 + n_base = 0 + n_transfer_substitution = 0 + n_transfer_linear = 0 + n_fallback = 0 + n_missing = 0 + n_placeholders = 0 + + missing_symbols: list[str] = [] + placeholder_symbols: list[str] = [] + per_element: list[RadiiElementAssessment] = [] + + for symbol in elems: + lookup = _resolve_value(symbol, policy=value_policy) + if lookup.source == 'override': + n_override += 1 + elif lookup.source == 'base': + n_base += 1 + elif lookup.source == 'transfer_substitution': + n_transfer_substitution += 1 + elif lookup.source == 'transfer_linear': + n_transfer_linear += 1 + elif lookup.source == 'fallback': + n_fallback += 1 + elif lookup.source == 'missing': + n_missing += 1 + missing_symbols.append(symbol) + + if lookup.is_placeholder: + n_placeholders += 1 + placeholder_symbols.append(symbol) + + if detail: + per_element.append(RadiiElementAssessment(symbol=symbol, lookup=lookup)) + + fits: list[LinearFit] = [] + warnings: list[str] = [] + for transfer in value_policy.transfers: + if isinstance(transfer, LinearTransfer): + try: + fit = 
_fit_transfer_model(value_policy.base, transfer) + except Exception as exc: # noqa: BLE001 + warnings.append(str(exc)) + else: + if fit is not None: + fits.append(fit) + + return RadiiPolicyAssessment( + kind=policy.kind, + policy=policy, + elements=elems, + n_elements=len(elems), + n_override=n_override, + n_base=n_base, + n_transfer_substitution=n_transfer_substitution, + n_transfer_linear=n_transfer_linear, + n_fallback=n_fallback, + n_missing=n_missing, + n_placeholders=n_placeholders, + missing_symbols=tuple(missing_symbols), + placeholder_symbols=tuple(placeholder_symbols), + fits=tuple(fits), + warnings=tuple(warnings), + per_element=tuple(per_element), + ) + + +DEFAULT_COVALENT_POLICY = RadiiPolicy( + kind='covalent', + base_set='cordero2008', + transfers=(SubstitutionTransfer(source=DatasetRef('covalent_radius', 'csd_legacy_cov')),), +) + +DEFAULT_VDW_POLICY = RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + transfers=(LinearTransfer(predictors=(DatasetRef('atomic_radius', 'rahm2016'),)),), +) diff --git a/src/atomref/registry.py b/src/atomref/registry.py new file mode 100644 index 0000000..196dbc3 --- /dev/null +++ b/src/atomref/registry.py @@ -0,0 +1,343 @@ +"""Dataset registry and packaged element-scalar set loading.""" + +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +import csv +import json +from functools import lru_cache +from importlib import resources + +from .elements import canonicalize_element_symbol, get_element, iter_elements +from .errors import DatasetError + + +QuantityId = str +DomainId = str + + +@dataclass(frozen=True, slots=True) +class DatasetRef: + quantity: QuantityId + set_id: str + + +@dataclass(frozen=True, slots=True) +class Reference: + authors: str | None = None + year: int | None = None + title: str | None = None + venue: str | None = None + doi: str | None = None + url: str | None = None + publisher: str | None = None + note: str | None = None + + +@dataclass(frozen=True, slots=True) +class CoverageInfo: + n_values: int + z_min: int | None = None + z_max: int | None = None + has_placeholders: bool = False + covered_z: tuple[int, ...] = () + missing_z: tuple[int, ...] = () + + +@dataclass(frozen=True, slots=True) +class DatasetInfo: + ref: DatasetRef + domain: DomainId + units: str | None + name: str + description: str | None = None + semantic_class: str | None = None + origin_class: str | None = None + phase_context: str | None = None + method_summary: str | None = None + placeholder_value: float | None = None + extraction_source: str | None = None + aliases: tuple[str, ...] = () + references: tuple[Reference, ...] = () + notes: tuple[str, ...] = () + storage: Mapping[str, object] | None = None + coverage: CoverageInfo | None = None + + +@dataclass(frozen=True, slots=True) +class ElementScalarSet: + ref: DatasetRef + info: DatasetInfo + values_by_z: tuple[float | None, ...] 
+ + @classmethod + def from_mapping( + cls, + *, + ref: DatasetRef, + values: Mapping[str, float | None], + name: str, + units: str | None, + description: str | None = None, + semantic_class: str = 'user', + origin_class: str = 'user', + phase_context: str | None = None, + references: Iterable[Reference] = (), + notes: Iterable[str] = (), + placeholder_value: float | None = None, + ) -> 'ElementScalarSet': + n_z = max(e.z for e in iter_elements()) + values_by_z: list[float | None] = [None] * (n_z + 1) + + for key, value in values.items(): + sym = _normalize_element_domain_symbol(key) + elem = get_element(sym) + if elem is None: + raise DatasetError(f'invalid element symbol in custom set: {key!r}') + values_by_z[elem.z] = None if value is None else float(value) + + covered_z = tuple(z for z, value in enumerate(values_by_z) if z > 0 and value is not None) + has_placeholders = False + if placeholder_value is not None: + has_placeholders = any( + value is not None and abs(value - placeholder_value) < 1e-12 + for value in values_by_z[1:] + ) + + info = DatasetInfo( + ref=ref, + domain='element', + units=units, + name=name, + description=description, + semantic_class=semantic_class, + origin_class=origin_class, + phase_context=phase_context, + placeholder_value=placeholder_value, + aliases=(), + references=tuple(references), + notes=tuple(notes), + storage=None, + coverage=CoverageInfo( + n_values=len(covered_z), + z_min=min(covered_z) if covered_z else None, + z_max=max(covered_z) if covered_z else None, + has_placeholders=has_placeholders, + covered_z=covered_z, + missing_z=tuple(z for z in range(1, n_z + 1) if values_by_z[z] is None), + ), + ) + return cls(ref=ref, info=info, values_by_z=tuple(values_by_z)) + + def get(self, symbol: str | None) -> float | None: + sym = _normalize_element_domain_symbol(symbol) + elem = get_element(sym) + if elem is None: + return None + return self.values_by_z[elem.z] + + +DatasetLike = DatasetRef | ElementScalarSet + + +def _normalize_element_domain_symbol(symbol: str | None) -> str | None: + cand = canonicalize_element_symbol(symbol) + if cand in {'D', 'T'}: + return 'H' + return cand + + +@lru_cache(maxsize=1) +def _load_registry_json() -> dict[str, object]: + path = resources.files('atomref.data').joinpath('registry.json') + with path.open('r', encoding='utf-8') as handle: + data = json.load(handle) + if not isinstance(data, dict): + raise DatasetError('invalid registry.json: expected JSON object') + return data + + +def _get_quantities_mapping() -> Mapping[str, object]: + quantities = _load_registry_json().get('quantities') + if not isinstance(quantities, dict): + raise DatasetError('invalid registry.json: missing quantities mapping') + return quantities + + +def _get_datasets_mapping() -> Mapping[str, object]: + datasets = _load_registry_json().get('datasets') + if not isinstance(datasets, dict): + raise DatasetError('invalid registry.json: missing datasets mapping') + return datasets + + +def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: + datasets = _get_datasets_mapping().get(quantity) + if not isinstance(datasets, dict): + raise DatasetError(f'unknown quantity: {quantity!r}') + return datasets + + +def _canonicalize_alias_token(value: str) -> str: + return ' '.join(value.strip().lower().split()) + + +def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: + by_quantity = _datasets_for_quantity(quantity) + if set_id in by_quantity: + return set_id + + wanted = _canonicalize_alias_token(set_id) + for actual_id, raw_entry 
in by_quantity.items(): + if _canonicalize_alias_token(actual_id) == wanted: + return actual_id + if isinstance(raw_entry, dict): + aliases = raw_entry.get('aliases', ()) + if isinstance(aliases, list): + for alias in aliases: + if isinstance(alias, str) and _canonicalize_alias_token(alias) == wanted: + return actual_id + raise DatasetError(f'unknown dataset id for {quantity!r}: {set_id!r}') + + +def list_dataset_ids(quantity: QuantityId) -> tuple[str, ...]: + return tuple(_datasets_for_quantity(quantity).keys()) + + +def _coerce_reference(obj: object) -> Reference: + if not isinstance(obj, dict): + raise DatasetError('invalid reference entry in registry.json') + return Reference( + authors=obj.get('authors') if isinstance(obj.get('authors'), str) else None, + year=obj.get('year') if isinstance(obj.get('year'), int) else None, + title=obj.get('title') if isinstance(obj.get('title'), str) else None, + venue=obj.get('venue') if isinstance(obj.get('venue'), str) else None, + doi=obj.get('doi') if isinstance(obj.get('doi'), str) else None, + url=obj.get('url') if isinstance(obj.get('url'), str) else None, + publisher=obj.get('publisher') if isinstance(obj.get('publisher'), str) else None, + note=obj.get('note') if isinstance(obj.get('note'), str) else None, + ) + + +def _coerce_coverage(obj: object) -> CoverageInfo | None: + if not isinstance(obj, dict): + return None + covered = obj.get('covered_z') + missing = obj.get('missing_z') + covered_z = tuple(int(z) for z in covered) if isinstance(covered, list) else () + missing_z = tuple(int(z) for z in missing) if isinstance(missing, list) else () + return CoverageInfo( + n_values=int(obj['n_values']), + z_min=int(obj['z_min']) if isinstance(obj.get('z_min'), int) else None, + z_max=int(obj['z_max']) if isinstance(obj.get('z_max'), int) else None, + has_placeholders=bool(obj.get('has_placeholders', False)), + covered_z=covered_z, + missing_z=missing_z, + ) + + +def get_dataset_info(ref: DatasetRef) -> DatasetInfo: + actual_set_id = _resolve_set_id(ref.quantity, ref.set_id) + actual_ref = DatasetRef(quantity=ref.quantity, set_id=actual_set_id) + + quantities = _get_quantities_mapping() + quantity_info = quantities.get(actual_ref.quantity) + if not isinstance(quantity_info, dict): + raise DatasetError(f'unknown quantity: {actual_ref.quantity!r}') + + units = quantity_info.get('units') if isinstance(quantity_info.get('units'), str) else None + domain = quantity_info.get('domain') if isinstance(quantity_info.get('domain'), str) else None + if domain is None: + raise DatasetError(f'missing domain for quantity: {actual_ref.quantity!r}') + + raw_entry = _datasets_for_quantity(actual_ref.quantity).get(actual_ref.set_id) + if not isinstance(raw_entry, dict): + raise DatasetError(f'unknown dataset: {actual_ref}') + + refs_raw = raw_entry.get('references', []) + references = tuple(_coerce_reference(item) for item in refs_raw) if isinstance(refs_raw, list) else () + aliases_raw = raw_entry.get('aliases', []) + aliases = tuple(item for item in aliases_raw if isinstance(item, str)) if isinstance(aliases_raw, list) else () + notes_raw = raw_entry.get('notes', []) + notes = tuple(item for item in notes_raw if isinstance(item, str)) if isinstance(notes_raw, list) else () + storage = raw_entry.get('storage') if isinstance(raw_entry.get('storage'), dict) else None + + return DatasetInfo( + ref=actual_ref, + domain=domain, + units=units, + name=raw_entry.get('name') if isinstance(raw_entry.get('name'), str) else actual_ref.set_id, + 
description=raw_entry.get('description') if isinstance(raw_entry.get('description'), str) else None, + semantic_class=raw_entry.get('semantic_class') if isinstance(raw_entry.get('semantic_class'), str) else None, + origin_class=raw_entry.get('origin_class') if isinstance(raw_entry.get('origin_class'), str) else None, + phase_context=raw_entry.get('phase_context') if isinstance(raw_entry.get('phase_context'), str) else None, + method_summary=raw_entry.get('method_summary') if isinstance(raw_entry.get('method_summary'), str) else None, + placeholder_value=( + float(raw_entry['placeholder_value']) + if raw_entry.get('placeholder_value') is not None + else None + ), + extraction_source=raw_entry.get('extraction_source') if isinstance(raw_entry.get('extraction_source'), str) else None, + aliases=aliases, + references=references, + notes=notes, + storage=storage, + coverage=_coerce_coverage(raw_entry.get('coverage')), + ) + + +@lru_cache(maxsize=None) +def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: + path = resources.files('atomref.data').joinpath(filename) + with path.open('r', encoding='utf-8', newline='') as handle: + reader = csv.DictReader(handle) + if reader.fieldnames is None or 'z' not in reader.fieldnames: + raise DatasetError(f'invalid CSV file: {filename!r}') + columns = [name for name in reader.fieldnames if name != 'z'] + values: dict[str, list[float | None]] = {name: [None] * 119 for name in columns} + for row in reader: + z_text = row.get('z') + if z_text is None: + continue + z = int(z_text) + for name in columns: + raw = row.get(name) + if raw is None: + values[name][z] = None + continue + raw = raw.strip() + values[name][z] = float(raw) if raw else None + return {name: tuple(vals) for name, vals in values.items()} + + +@lru_cache(maxsize=None) +def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: + info = get_dataset_info(ref) + if info.domain != 'element': + raise DatasetError(f'only element-domain datasets are supported in v0.1: {info.ref!r}') + if not isinstance(info.storage, Mapping): + raise DatasetError(f'missing storage metadata for dataset: {info.ref!r}') + + filename = info.storage.get('filename') + column = info.storage.get('column') + if not isinstance(filename, str) or not isinstance(column, str): + raise DatasetError(f'invalid storage metadata for dataset: {info.ref!r}') + + table = _load_csv_columns(filename) + if column not in table: + raise DatasetError(f'column {column!r} not found in {filename!r}') + + return ElementScalarSet(ref=info.ref, info=info, values_by_z=table[column]) + + +def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: + if isinstance(dataset, ElementScalarSet): + return dataset + return get_builtin_set(dataset) + + +def _is_placeholder_value(info: DatasetInfo, value: float) -> bool: + if info.placeholder_value is None: + return False + return abs(value - info.placeholder_value) < 1e-12 diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py new file mode 100644 index 0000000..d7f5d5e --- /dev/null +++ b/src/atomref/transfer.py @@ -0,0 +1,31 @@ +"""Transfer model configuration types.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from .registry import DatasetLike + + +@dataclass(frozen=True, slots=True) +class LinearFit: + coefficients: tuple[float, ...] 
+ intercept: float + n_points: int + r2: float + rmse: float + + +@dataclass(frozen=True, slots=True) +class SubstitutionTransfer: + source: DatasetLike + + +@dataclass(frozen=True, slots=True) +class LinearTransfer: + predictors: tuple[DatasetLike, ...] + min_points: int = 2 + exclude_placeholders: bool = True + + +TransferModel = SubstitutionTransfer | LinearTransfer diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..08328a4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / 'src' +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) diff --git a/tests/elements/test_elements.py b/tests/elements/test_elements.py new file mode 100644 index 0000000..161b420 --- /dev/null +++ b/tests/elements/test_elements.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import atomref as ar + + +def test_element_lookup_and_validation() -> None: + assert ar.is_valid_element_symbol('C') + assert ar.is_valid_element_symbol('cl') is False + assert ar.get_element('cl') is not None + assert ar.get_element('C').z == 6 + assert ar.get_element('Xx') is None + + +def test_iter_elements_is_sorted_and_complete() -> None: + elems = ar.iter_elements() + assert elems[0].symbol == 'H' + assert elems[-1].symbol == 'Og' + assert elems[0].z == 1 + assert elems[-1].z == 118 diff --git a/tests/meta/test_imports.py b/tests/meta/test_imports.py new file mode 100644 index 0000000..374996a --- /dev/null +++ b/tests/meta/test_imports.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import importlib + + +MODULES = [ + 'atomref', + 'atomref.elements', + 'atomref.registry', + 'atomref.transfer', + 'atomref.policy', + 'atomref.radii', +] + + +def test_imports() -> None: + for name in MODULES: + importlib.import_module(name) diff --git a/tests/meta/test_readme_sync.py b/tests/meta/test_readme_sync.py new file mode 100644 index 0000000..fe56ac2 --- /dev/null +++ b/tests/meta/test_readme_sync.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path +import subprocess +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +README = REPO_ROOT / 'README.md' +SCRIPT = REPO_ROOT / 'tools' / 'gen_readme.py' + + +def test_readme_is_in_sync(tmp_path: Path) -> None: + generated = tmp_path / 'README.generated.md' + subprocess.run( + [sys.executable, str(SCRIPT), '--output', str(generated)], + cwd=REPO_ROOT, + check=True, + ) + assert generated.read_text(encoding='utf-8') == README.read_text(encoding='utf-8') diff --git a/tests/radii/test_assessment.py b/tests/radii/test_assessment.py new file mode 100644 index 0000000..664d867 --- /dev/null +++ b/tests/radii/test_assessment.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import atomref as ar + + +def test_assess_vdw_default_linear_counts() -> None: + rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY) + assert rep.kind == 'van_der_waals' + assert rep.n_elements == 2 + assert rep.n_base == 1 + assert rep.n_transfer_linear == 1 + assert rep.n_missing == 0 + assert rep.fits + assert rep.fits[0].n_points == 90 + + +def test_assess_vdw_detail_reports_sources() -> None: + rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY, detail=True) + by_sym = {d.symbol: d for d in rep.per_element} + assert by_sym['O'].lookup.source == 'base' + assert by_sym['Pm'].lookup.source == 'transfer_linear' + + +def 
test_assess_covalent_sub_placeholder_count() -> None: + rep = ar.assess_radii_policy(['Es'], policy=ar.DEFAULT_COVALENT_POLICY) + assert rep.kind == 'covalent' + assert rep.n_elements == 1 + assert rep.n_transfer_substitution == 1 + assert rep.n_placeholders == 1 + assert rep.placeholder_symbols == ('Es',) + assert rep.n_missing == 0 + + +def test_assess_covalent_missing_in_both_sets() -> None: + rep = ar.assess_radii_policy(['Rg'], policy=ar.DEFAULT_COVALENT_POLICY) + assert rep.n_missing == 1 + assert rep.missing_symbols == ('Rg',) diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py new file mode 100644 index 0000000..c432d8e --- /dev/null +++ b/tests/radii/test_selection.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import pytest + +import atomref as ar + + +def test_get_covalent_radius_default_prefers_cordero() -> None: + assert ar.get_covalent_radius('C') == pytest.approx(0.76) + + +def test_get_covalent_radius_maps_deuterium_to_hydrogen() -> None: + assert ar.get_covalent_radius('D') == pytest.approx(0.31) + + +def test_get_vdw_radius_default_prefers_alvarez() -> None: + assert ar.get_vdw_radius('C') == pytest.approx(1.77) + + +def test_completion_is_used_for_missing_base_values() -> None: + m = ar.lookup_covalent_radius('Bk') + assert m.value is not None + assert m.source == 'transfer_substitution' + + m2 = ar.lookup_vdw_radius('Pm') + assert m2.value is not None + assert m2.source == 'transfer_linear' + assert m2.value == pytest.approx(2.897226539514835) + + +def test_linear_transfer_rejects_placeholder_values() -> None: + scheme = ar.RadiiPolicy( + kind='van_der_waals', + base_set='bondi1964', + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('van_der_waals_radius', 'csd_legacy_vdw'),) + ), + ), + ) + m = ar.lookup_vdw_radius('Be', policy=scheme) + assert m.value is None + assert m.source == 'missing' + assert any('placeholder' in s for s in m.notes) + + +def test_lookup_float_conversion() -> None: + m = ar.lookup_covalent_radius('C') + assert float(m) == pytest.approx(0.76) + + m_missing = ar.lookup_covalent_radius('Xx') + with pytest.raises(TypeError): + float(m_missing) diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py new file mode 100644 index 0000000..e8811d1 --- /dev/null +++ b/tests/registry/test_registry.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from importlib import resources + +import atomref as ar +from atomref.registry import get_builtin_set + + +def test_packaged_data_files_exist() -> None: + pkg = 'atomref.data' + assert resources.files(pkg).joinpath('periodic_table.csv').is_file() + assert resources.files(pkg).joinpath('covalent.csv').is_file() + assert resources.files(pkg).joinpath('van_der_waals.csv').is_file() + assert resources.files(pkg).joinpath('registry.json').is_file() + + +def test_registry_lists_vdw_sets_but_not_atomic_support_sets() -> None: + vdw_sets = ar.list_radii_sets('van_der_waals') + assert 'alvarez2013' in vdw_sets + assert 'rahm2016' not in vdw_sets + + +def test_rahm_is_registered_as_atomic_radius() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + assert info.ref.quantity == 'atomic_radius' + assert info.semantic_class == 'atomic_isodensity' + assert info.phase_context == 'isolated_atom' + + +def test_builtin_set_loading_works() -> None: + ds = get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ds.get('C') == 0.76 diff --git a/tests/test_smoke.py b/tests/test_smoke.py new file mode 100644 
index 0000000..6a96b08 --- /dev/null +++ b/tests/test_smoke.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import atomref as ar + + +def test_version_is_present() -> None: + assert isinstance(ar.__version__, str) + assert ar.__version__ + + +def test_basic_smoke_import_and_lookup() -> None: + assert ar.get_covalent_radius('C') == 0.76 + assert ar.get_vdw_radius('C') == 1.77 diff --git a/tools/gen_readme.py b/tools/gen_readme.py new file mode 100644 index 0000000..cad0335 --- /dev/null +++ b/tools/gen_readme.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SOURCE = REPO_ROOT / 'docs' / 'index.md' +README = REPO_ROOT / 'README.md' + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('--output', type=Path, default=README) + args = parser.parse_args() + args.output.write_text(SOURCE.read_text(encoding='utf-8'), encoding='utf-8') + + +if __name__ == '__main__': + main() From c7ee02ceba9c72a241dbfd6f7c06deb4a3a21219 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 15:33:20 +0300 Subject: [PATCH 02/15] Adds quantity introspection --- README.md | 9 +++++++ docs/datasets/atomic_radius.md | 2 ++ docs/datasets/index.md | 2 ++ docs/guide/quickstart.md | 10 +++++++ docs/index.md | 9 +++++++ src/atomref/__init__.py | 6 +++++ src/atomref/data/registry.json | 2 +- src/atomref/registry.py | 24 +++++++++++++++++ tests/radii/test_selection.py | 47 +++++++++++++++++++++++++++++++++ tests/registry/test_registry.py | 19 +++++++++++++ 10 files changed, 129 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d5b9154..efdbd73 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,15 @@ This follows the current `molcryst` pattern. - `atomic_radius` (support quantity; currently used for transfer from `rahm2016`) +You can inspect the packaged quantity layer directly: + +```python +import atomref as ar + +print(ar.list_quantities()) +print(ar.get_quantity_info("atomic_radius")) +``` + ## Relationship to the Delone Commons ecosystem `atomref` is intended to be reusable outside the surrounding ecosystem, but it diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md index cbbe61b..00a43cd 100644 --- a/docs/datasets/atomic_radius.md +++ b/docs/datasets/atomic_radius.md @@ -6,3 +6,5 @@ not best described as direct condensed-phase vdW radii. Built-in v0.1 support set: - `rahm2016` + +`rahm2016` is intentionally classified here as atomic support data rather than as a direct vdW target set. diff --git a/docs/datasets/index.md b/docs/datasets/index.md index a58d78b..e9e2565 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -8,3 +8,5 @@ The package distinguishes between: This is what keeps support-only datasets such as `rahm2016` usable without misclassifying them as direct condensed-phase vdW radii. + +For programmatic inspection, use `atomref.list_quantities()` and `atomref.get_quantity_info(...)`. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index 62de165..5cb1637 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -14,3 +14,13 @@ print(m.resolved_from) Use `get_*` when you only need the number, and `lookup_*` when you need provenance. 
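+
+As a minimal sketch, the two call styles compare as follows; the numbers and
+sources shown in the comments assume the default policies packaged with this
+version:
+
+```python
+import atomref as ar
+
+# get_* returns just the number (or None when the policy cannot resolve it)
+print(ar.get_vdw_radius("C"))   # 1.77 under the default Alvarez-based policy
+
+# lookup_* returns a LookupResult carrying provenance
+m = ar.lookup_vdw_radius("Pm")
+print(m.value)   # value inferred by linear transfer from the rahm2016 support set
+print(m.source)  # "transfer_linear"
+print(m.fit)     # the LinearFit used for the inference, or None
+```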
+ +You can also inspect the packaged quantity layer directly: + +```python +import atomref as ar + +print(ar.list_quantities()) +print(ar.get_quantity_info("atomic_radius")) +``` + diff --git a/docs/index.md b/docs/index.md index d5b9154..efdbd73 100644 --- a/docs/index.md +++ b/docs/index.md @@ -53,6 +53,15 @@ This follows the current `molcryst` pattern. - `atomic_radius` (support quantity; currently used for transfer from `rahm2016`) +You can inspect the packaged quantity layer directly: + +```python +import atomref as ar + +print(ar.list_quantities()) +print(ar.get_quantity_info("atomic_radius")) +``` + ## Relationship to the Delone Commons ecosystem `atomref` is intended to be reusable outside the surrounding ecosystem, but it diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index fd07068..a639619 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -20,9 +20,12 @@ DatasetInfo, DatasetRef, ElementScalarSet, + QuantityInfo, Reference, get_dataset_info, + get_quantity_info, list_dataset_ids, + list_quantities, ) from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer @@ -37,9 +40,12 @@ 'DatasetInfo', 'DatasetRef', 'ElementScalarSet', + 'QuantityInfo', 'Reference', 'get_dataset_info', + 'get_quantity_info', 'list_dataset_ids', + 'list_quantities', 'LinearFit', 'LinearTransfer', 'SubstitutionTransfer', diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json index 2577ab7..3d9bf6e 100644 --- a/src/atomref/data/registry.json +++ b/src/atomref/data/registry.json @@ -426,7 +426,7 @@ ], "notes": [ "The original work also reports cationic radii (+1) for the first 96 elements and selected anionic radii (−1) for some elements; these are not yet included in the current CSV.", - "Despite the fact that in this project this radii are classified as vdW radii for the purpose of simplicity, they should be treated as a correlational/transferable baseline rather than a direct condensed-phase vdW radius since they describe isolated atoms in vacuum." + "In atomref this dataset is classified as atomic support data, not as a direct condensed-phase van der Waals-radius set, because it describes isolated atoms in vacuum and is used here primarily as a transferable baseline." ] } } diff --git a/src/atomref/registry.py b/src/atomref/registry.py index 196dbc3..d61dd29 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -45,6 +45,14 @@ class CoverageInfo: missing_z: tuple[int, ...] 
= () +@dataclass(frozen=True, slots=True) +class QuantityInfo: + quantity: QuantityId + domain: DomainId + units: str | None = None + description: str | None = None + + @dataclass(frozen=True, slots=True) class DatasetInfo: ref: DatasetRef @@ -179,6 +187,22 @@ def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: return datasets +def list_quantities() -> tuple[str, ...]: + return tuple(_get_quantities_mapping().keys()) + + +def get_quantity_info(quantity: QuantityId) -> QuantityInfo: + raw = _get_quantities_mapping().get(quantity) + if not isinstance(raw, dict): + raise DatasetError(f'unknown quantity: {quantity!r}') + domain = raw.get('domain') if isinstance(raw.get('domain'), str) else None + if domain is None: + raise DatasetError(f'missing domain for quantity: {quantity!r}') + units = raw.get('units') if isinstance(raw.get('units'), str) else None + description = raw.get('description') if isinstance(raw.get('description'), str) else None + return QuantityInfo(quantity=quantity, domain=domain, units=units, description=description) + + def _canonicalize_alias_token(value: str) -> str: return ' '.join(value.strip().lower().split()) diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py index c432d8e..9eb16de 100644 --- a/tests/radii/test_selection.py +++ b/tests/radii/test_selection.py @@ -3,6 +3,7 @@ import pytest import atomref as ar +from atomref.errors import PolicyError def test_get_covalent_radius_default_prefers_cordero() -> None: @@ -51,3 +52,49 @@ def test_lookup_float_conversion() -> None: m_missing = ar.lookup_covalent_radius('Xx') with pytest.raises(TypeError): float(m_missing) + + +def test_override_precedes_base_value() -> None: + policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + overrides={'C': 9.99}, + ) + lookup = ar.lookup_covalent_radius('C', policy=policy) + assert lookup.source == 'override' + assert lookup.value == pytest.approx(9.99) + + +def test_fallback_is_used_only_after_transfers_fail() -> None: + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='bondi1964', + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('van_der_waals_radius', 'csd_legacy_vdw'),) + ), + ), + fallback=2.5, + ) + lookup = ar.lookup_vdw_radius('Be', policy=policy) + assert lookup.source == 'fallback' + assert lookup.value == pytest.approx(2.5) + assert any('placeholder' in note for note in lookup.notes) + + +def test_linear_transfer_rejects_multiple_predictors_in_v0_1() -> None: + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + transfers=( + ar.LinearTransfer( + predictors=( + ar.DatasetRef('atomic_radius', 'rahm2016'), + ar.DatasetRef('covalent_radius', 'cordero2008'), + ) + ), + ), + ) + with pytest.raises(PolicyError): + ar.lookup_vdw_radius('Pm', policy=policy) + diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py index e8811d1..b18327e 100644 --- a/tests/registry/test_registry.py +++ b/tests/registry/test_registry.py @@ -30,3 +30,22 @@ def test_rahm_is_registered_as_atomic_radius() -> None: def test_builtin_set_loading_works() -> None: ds = get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008')) assert ds.get('C') == 0.76 + + +def test_list_quantities_and_quantity_info() -> None: + quantities = ar.list_quantities() + assert quantities == ('covalent_radius', 'van_der_waals_radius', 'atomic_radius') + + info = ar.get_quantity_info('atomic_radius') + assert info.quantity == 'atomic_radius' + assert info.domain == 'element' + assert info.units 
== 'angstrom' + assert 'support' in (info.description or '') + + +def test_rahm_note_no_longer_claims_it_is_classified_as_vdw() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + joined = ' '.join(info.notes).lower() + assert 'classified as vdw' not in joined + assert 'atomic support data' in joined + From 5c9ff264c5ea46cb35693462c3c0431393c54dec Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 17:47:02 +0300 Subject: [PATCH 03/15] Adds usage roles --- README.md | 2 ++ docs/datasets/index.md | 5 +++++ docs/guide/policies.md | 8 ++++++++ docs/index.md | 2 ++ src/atomref/data/registry.json | 24 ++++++++++++++++-------- src/atomref/radii.py | 4 ++-- src/atomref/registry.py | 19 +++++++++++++++++-- tests/registry/test_registry.py | 19 +++++++++++++++++++ 8 files changed, 71 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index efdbd73..fc2a6fd 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,8 @@ import atomref as ar print(ar.list_quantities()) print(ar.get_quantity_info("atomic_radius")) +print(ar.list_dataset_ids("van_der_waals_radius", usage_role="target")) +print(ar.list_dataset_ids("atomic_radius", usage_role="support")) ``` ## Relationship to the Delone Commons ecosystem diff --git a/docs/datasets/index.md b/docs/datasets/index.md index e9e2565..1095225 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -10,3 +10,8 @@ This is what keeps support-only datasets such as `rahm2016` usable without misclassifying them as direct condensed-phase vdW radii. For programmatic inspection, use `atomref.list_quantities()` and `atomref.get_quantity_info(...)`. + +Dataset metadata also carries a package-level `usage_role`, which currently +distinguishes direct target sets from support-only sets used for substitution or +linear transfer. Use `atomref.list_dataset_ids(..., usage_role=...)` to inspect +that layer programmatically. diff --git a/docs/guide/policies.md b/docs/guide/policies.md index a7e9130..a5a5b1b 100644 --- a/docs/guide/policies.md +++ b/docs/guide/policies.md @@ -18,3 +18,11 @@ Built-in transfer models: `LinearTransfer` is intentionally limited to one predictor in v0.1, but the API already accepts a predictor tuple so later multi-predictor linear models do not require a redesign. + +## Target vs support sets + +`atomref` keeps the lookup behavior separate from the scientific classification +of a dataset. In addition, each built-in dataset now carries a package-level +`usage_role` such as `target` or `support`. This is how `rahm2016` can remain +available for linear transfer into `alvarez2013`-style vdW values without being +misrepresented as a direct condensed-phase vdW target set. diff --git a/docs/index.md b/docs/index.md index efdbd73..fc2a6fd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,6 +60,8 @@ import atomref as ar print(ar.list_quantities()) print(ar.get_quantity_info("atomic_radius")) +print(ar.list_dataset_ids("van_der_waals_radius", usage_role="target")) +print(ar.list_dataset_ids("atomic_radius", usage_role="support")) ``` ## Relationship to the Delone Commons ecosystem diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json index 3d9bf6e..a722e1c 100644 --- a/src/atomref/data/registry.json +++ b/src/atomref/data/registry.json @@ -62,7 +62,8 @@ ], "notes": [ "The source paper provides multiple radii per element for different atom types/environments; this package currently includes C(sp3) value for C and high-spin values for Mn/Fe/Co." 
- ] + ], + "usage_role": "target" }, "csd_legacy_cov": { "name": "CSD legacy covalent radii (bond perception)", @@ -102,7 +103,8 @@ "CSD bond assignment heuristic: a bond A-B may be inferred if distance d satisfies Rcov(A)+Rcov(B)-t <= d <= Rcov(A)+Rcov(B)+t, with typical t=0.4 Å. (See the CCDC spreadsheet notes.)", "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", "Elements not yet encountered in the CSD have Rcov = 1.50 Å." - ] + ], + "usage_role": "support" } }, "van_der_waals_radius": { @@ -188,7 +190,8 @@ "notes": [ "Coverage is limited (38 elements, including only a few transition metals and uranium).", "Because Bondi radii were not derived exclusively from crystal nonbonded contact statistics, they can differ slightly from later 'structural' vdW radii." - ] + ], + "usage_role": "target" }, "rowland_taylor1996": { "name": "Rowland & Taylor nonbonded contact radii", @@ -236,7 +239,8 @@ "notes": [ "Coverage is intentionally limited to common organic-crystal nonmetals (H, C, N, O, F, S, Cl, Br, I).", "Rowland & Taylor also report a normalized set (R_d) constrained to match the total of Bondi radii; this package uses the raw least-squares r_c values." - ] + ], + "usage_role": "target" }, "alvarez2013": { "name": "Alvarez van der Waals radii", @@ -281,7 +285,8 @@ ], "notes": [ "Obtained by statistical analysis of millions of interatomic distances in the Cambridge Structural Database (CSD), locating the vdW peak after the vdW gap." - ] + ], + "usage_role": "target" }, "chernyshov2020": { "name": "Chernyshov LoS van der Waals radii", @@ -330,7 +335,8 @@ "notes": [ "The source paper provides multiple radii per element for different atom types/environments; this package currently includes only the main/default R_max values used in Table 1.", "Primarily targeted at elements common in organic crystals (H, C, N, O, F, S, Cl, Se, Br, I)." - ] + ], + "usage_role": "target" }, "csd_legacy_vdw": { "name": "CSD legacy van der Waals radii (pre-2024.3)", @@ -381,7 +387,8 @@ "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).", "Radii that are not available in either Bondi or Rowland & Taylor versions were assigned RvdW of 2.00 Å.", "The CSD 2024.3 release updated the vdW radii used in CSD and Mercury to Alvarez-derived values (see CCDC blog post)." - ] + ], + "usage_role": "support" } }, "atomic_radius": { @@ -427,7 +434,8 @@ "notes": [ "The original work also reports cationic radii (+1) for the first 96 elements and selected anionic radii (−1) for some elements; these are not yet included in the current CSV.", "In atomref this dataset is classified as atomic support data, not as a direct condensed-phase van der Waals-radius set, because it describes isolated atoms in vacuum and is used here primarily as a transferable baseline." 
- ] + ], + "usage_role": "support" } } } diff --git a/src/atomref/radii.py b/src/atomref/radii.py index 61cebda..7ce73ed 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -112,8 +112,8 @@ def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: return tuple(sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0)) -def list_radii_sets(kind: RadiiKind) -> tuple[str, ...]: - return list_dataset_ids(_quantity_for_kind(kind)) +def list_radii_sets(kind: RadiiKind, *, usage_role: str | None = None) -> tuple[str, ...]: + return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: diff --git a/src/atomref/registry.py b/src/atomref/registry.py index d61dd29..458d202 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -60,6 +60,7 @@ class DatasetInfo: units: str | None name: str description: str | None = None + usage_role: str | None = None semantic_class: str | None = None origin_class: str | None = None phase_context: str | None = None @@ -88,6 +89,7 @@ def from_mapping( name: str, units: str | None, description: str | None = None, + usage_role: str = 'user', semantic_class: str = 'user', origin_class: str = 'user', phase_context: str | None = None, @@ -119,6 +121,7 @@ def from_mapping( units=units, name=name, description=description, + usage_role=usage_role, semantic_class=semantic_class, origin_class=origin_class, phase_context=phase_context, @@ -225,8 +228,19 @@ def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: raise DatasetError(f'unknown dataset id for {quantity!r}: {set_id!r}') -def list_dataset_ids(quantity: QuantityId) -> tuple[str, ...]: - return tuple(_datasets_for_quantity(quantity).keys()) +def list_dataset_ids(quantity: QuantityId, *, usage_role: str | None = None) -> tuple[str, ...]: + dataset_ids = tuple(_datasets_for_quantity(quantity).keys()) + if usage_role is None: + return dataset_ids + + filtered: list[str] = [] + wanted = usage_role.strip().lower() + for set_id in dataset_ids: + info = get_dataset_info(DatasetRef(quantity, set_id)) + role = (info.usage_role or '').strip().lower() + if role == wanted: + filtered.append(set_id) + return tuple(filtered) def _coerce_reference(obj: object) -> Reference: @@ -293,6 +307,7 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: units=units, name=raw_entry.get('name') if isinstance(raw_entry.get('name'), str) else actual_ref.set_id, description=raw_entry.get('description') if isinstance(raw_entry.get('description'), str) else None, + usage_role=raw_entry.get('usage_role') if isinstance(raw_entry.get('usage_role'), str) else None, semantic_class=raw_entry.get('semantic_class') if isinstance(raw_entry.get('semantic_class'), str) else None, origin_class=raw_entry.get('origin_class') if isinstance(raw_entry.get('origin_class'), str) else None, phase_context=raw_entry.get('phase_context') if isinstance(raw_entry.get('phase_context'), str) else None, diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py index b18327e..23b401e 100644 --- a/tests/registry/test_registry.py +++ b/tests/registry/test_registry.py @@ -49,3 +49,22 @@ def test_rahm_note_no_longer_claims_it_is_classified_as_vdw() -> None: assert 'classified as vdw' not in joined assert 'atomic support data' in joined + +def test_usage_role_is_exposed_on_dataset_info() -> None: + info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016')) + assert info.usage_role == 'support' + + +def 
test_list_dataset_ids_can_filter_by_usage_role() -> None: + assert ar.list_dataset_ids('atomic_radius', usage_role='support') == ('rahm2016',) + assert ar.list_dataset_ids('van_der_waals_radius', usage_role='target') == ( + 'bondi1964', + 'rowland_taylor1996', + 'alvarez2013', + 'chernyshov2020', + ) + + +def test_list_radii_sets_can_filter_by_usage_role() -> None: + assert ar.list_radii_sets('covalent', usage_role='support') == ('csd_legacy_cov',) + assert 'alvarez2013' in ar.list_radii_sets('van_der_waals', usage_role='target') From 8a74c00ea5a28d487e803dd00a19e66a38980034 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 18:37:06 +0300 Subject: [PATCH 04/15] Fixes formatting --- .github/workflows/ci.yml | 14 +++ src/atomref/__init__.py | 74 ++++++----- src/atomref/policy.py | 106 +++++++++------- src/atomref/radii.py | 93 ++++++++------ src/atomref/registry.py | 230 ++++++++++++++++++++++------------ tests/radii/test_selection.py | 63 +++++----- 6 files changed, 355 insertions(+), 225 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbc7a70..38602c6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,20 @@ on: pull_request: jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install lint dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[dev] + - name: Lint + run: flake8 src tests + test: runs-on: ubuntu-latest strategy: diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index a639619..1b0ef6b 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -1,5 +1,11 @@ from .__about__ import __version__ -from .elements import Element, canonicalize_element_symbol, get_element, iter_elements, is_valid_element_symbol +from .elements import ( + Element, + canonicalize_element_symbol, + get_element, + iter_elements, + is_valid_element_symbol, +) from .policy import LookupResult, ValuePolicy from .radii import ( DEFAULT_COVALENT_POLICY, @@ -30,37 +36,37 @@ from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer __all__ = [ - '__version__', - 'Element', - 'canonicalize_element_symbol', - 'get_element', - 'iter_elements', - 'is_valid_element_symbol', - 'CoverageInfo', - 'DatasetInfo', - 'DatasetRef', - 'ElementScalarSet', - 'QuantityInfo', - 'Reference', - 'get_dataset_info', - 'get_quantity_info', - 'list_dataset_ids', - 'list_quantities', - 'LinearFit', - 'LinearTransfer', - 'SubstitutionTransfer', - 'LookupResult', - 'ValuePolicy', - 'RadiiPolicy', - 'RadiiElementAssessment', - 'RadiiPolicyAssessment', - 'DEFAULT_COVALENT_POLICY', - 'DEFAULT_VDW_POLICY', - 'list_radii_sets', - 'get_radii_set_info', - 'lookup_covalent_radius', - 'get_covalent_radius', - 'lookup_vdw_radius', - 'get_vdw_radius', - 'assess_radii_policy', + "__version__", + "Element", + "canonicalize_element_symbol", + "get_element", + "iter_elements", + "is_valid_element_symbol", + "CoverageInfo", + "DatasetInfo", + "DatasetRef", + "ElementScalarSet", + "QuantityInfo", + "Reference", + "get_dataset_info", + "get_quantity_info", + "list_dataset_ids", + "list_quantities", + "LinearFit", + "LinearTransfer", + "SubstitutionTransfer", + "LookupResult", + "ValuePolicy", + "RadiiPolicy", + "RadiiElementAssessment", + "RadiiPolicyAssessment", + "DEFAULT_COVALENT_POLICY", + "DEFAULT_VDW_POLICY", + "list_radii_sets", + "get_radii_set_info", + "lookup_covalent_radius", + 
"get_covalent_radius", + "lookup_vdw_radius", + "get_vdw_radius", + "assess_radii_policy", ] diff --git a/src/atomref/policy.py b/src/atomref/policy.py index b7df87b..5b242e2 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -8,7 +8,7 @@ import math from typing import Generic, Literal, TypeVar -from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol +from .elements import canonicalize_element_symbol, is_valid_element_symbol from .errors import PolicyError from .registry import ( DatasetLike, @@ -20,16 +20,15 @@ ) from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel - -K = TypeVar('K') +K = TypeVar("K") LookupSource = Literal[ - 'override', - 'base', - 'transfer_substitution', - 'transfer_linear', - 'fallback', - 'missing', + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", + "missing", ] @@ -45,7 +44,7 @@ class LookupResult: def __float__(self) -> float: if self.value is None: - raise TypeError('reference value is missing') + raise TypeError("reference value is missing") return float(self.value) @@ -59,8 +58,8 @@ class ValuePolicy(Generic[K]): def _normalize_element_symbol(symbol: str | None) -> str | None: cand = canonicalize_element_symbol(symbol) - if cand in {'D', 'T'}: - cand = 'H' + if cand in {"D", "T"}: + cand = "H" if cand is None: return None if not is_valid_element_symbol(cand): @@ -72,7 +71,13 @@ def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: return resolve_dataset_like(policy.base).ref -def _fit_linear_transfer(base_set: ElementScalarSet, predictor_set: ElementScalarSet, *, min_points: int, exclude_placeholders: bool) -> LinearFit: +def _fit_linear_transfer( + base_set: ElementScalarSet, + predictor_set: ElementScalarSet, + *, + min_points: int, + exclude_placeholders: bool, +) -> LinearFit: xs: list[float] = [] ys: list[float] = [] @@ -94,13 +99,13 @@ def _fit_linear_transfer(base_set: ElementScalarSet, predictor_set: ElementScala n = len(xs) if n < min_points: - raise PolicyError('not enough overlapping elements to fit linear transfer') + raise PolicyError("not enough overlapping elements to fit linear transfer") x_mean = sum(xs) / n y_mean = sum(ys) / n sxx = sum((x - x_mean) ** 2 for x in xs) if sxx == 0: - raise PolicyError('cannot fit linear transfer: zero predictor variance') + raise PolicyError("cannot fit linear transfer: zero predictor variance") sxy = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, ys)) slope = sxy / sxx @@ -122,7 +127,12 @@ def _fit_linear_transfer(base_set: ElementScalarSet, predictor_set: ElementScala @lru_cache(maxsize=None) -def _fit_linear_transfer_cached(base_ref: DatasetRef, predictor_ref: DatasetRef, min_points: int, exclude_placeholders: bool) -> LinearFit: +def _fit_linear_transfer_cached( + base_ref: DatasetRef, + predictor_ref: DatasetRef, + min_points: int, + exclude_placeholders: bool, +) -> LinearFit: return _fit_linear_transfer( get_builtin_set(base_ref), get_builtin_set(predictor_ref), @@ -135,7 +145,7 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit if not isinstance(transfer, LinearTransfer): return None if len(transfer.predictors) != 1: - raise PolicyError('v0.1 LinearTransfer supports exactly one predictor dataset') + raise PolicyError("v0.1 LinearTransfer supports exactly one predictor dataset") predictor = transfer.predictors[0] if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef): @@ -150,51 +160,57 @@ def _fit_transfer_model(base: 
DatasetLike, transfer: TransferModel) -> LinearFit ) -def _apply_substitution_transfer(symbol: str, *, target: DatasetRef, transfer: SubstitutionTransfer) -> tuple[LookupResult | None, str | None]: +def _apply_substitution_transfer( + symbol: str, *, target: DatasetRef, transfer: SubstitutionTransfer +) -> tuple[LookupResult | None, str | None]: source_set = resolve_dataset_like(transfer.source) value = source_set.get(symbol) if value is None: - return None, f'no substitution value in {source_set.ref.set_id}' + return None, f"no substitution value in {source_set.ref.set_id}" value_f = float(value) return ( LookupResult( value=value_f, - source='transfer_substitution', + source="transfer_substitution", target=target, resolved_from=(source_set.ref,), is_placeholder=_is_placeholder_value(source_set.info, value_f), - notes=('missing in base set; substituted from transfer source',), + notes=("missing in base set; substituted from transfer source",), ), None, ) -def _apply_linear_transfer(symbol: str, *, base: DatasetLike, target: DatasetRef, transfer: LinearTransfer) -> tuple[LookupResult | None, str | None]: +def _apply_linear_transfer( + symbol: str, *, base: DatasetLike, target: DatasetRef, transfer: LinearTransfer +) -> tuple[LookupResult | None, str | None]: if len(transfer.predictors) != 1: - raise PolicyError('v0.1 LinearTransfer supports exactly one predictor dataset') + raise PolicyError("v0.1 LinearTransfer supports exactly one predictor dataset") predictor_set = resolve_dataset_like(transfer.predictors[0]) predictor_value = predictor_set.get(symbol) if predictor_value is None: - return None, f'no predictor value in {predictor_set.ref.set_id}' + return None, f"no predictor value in {predictor_set.ref.set_id}" predictor_f = float(predictor_value) - if transfer.exclude_placeholders and _is_placeholder_value(predictor_set.info, predictor_f): - return None, f'predictor value in {predictor_set.ref.set_id} is a placeholder' + if transfer.exclude_placeholders and _is_placeholder_value( + predictor_set.info, predictor_f + ): + return None, f"predictor value in {predictor_set.ref.set_id} is a placeholder" fit = _fit_transfer_model(base, transfer) if fit is None: - return None, 'no fit available for linear transfer' + return None, "no fit available for linear transfer" predicted = fit.coefficients[0] * predictor_f + fit.intercept return ( LookupResult( value=float(predicted), - source='transfer_linear', + source="transfer_linear", target=target, resolved_from=(predictor_set.ref,), is_placeholder=False, fit=fit, - notes=('missing in base set; inferred via linear transfer',), + notes=("missing in base set; inferred via linear transfer",), ), None, ) @@ -203,20 +219,20 @@ def _apply_linear_transfer(symbol: str, *, base: DatasetLike, target: DatasetRef def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: target = _resolve_target_ref(policy) base_set = resolve_dataset_like(policy.base) - if base_set.info.domain != 'element': - raise PolicyError('v0.1 resolver supports only element-domain datasets') + if base_set.info.domain != "element": + raise PolicyError("v0.1 resolver supports only element-domain datasets") sym = _normalize_element_symbol(symbol) if sym is None: - note = 'unknown element' if symbol is not None else 'missing element symbol' - return LookupResult(value=None, source='missing', target=target, notes=(note,)) + note = "unknown element" if symbol is not None else "missing element symbol" + return LookupResult(value=None, source="missing", target=target, 
notes=(note,)) if sym in policy.overrides: return LookupResult( value=float(policy.overrides[sym]), - source='override', + source="override", target=target, - notes=('value supplied by policy override',), + notes=("value supplied by policy override",), ) base_value = base_set.get(sym) @@ -224,21 +240,25 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes base_f = float(base_value) return LookupResult( value=base_f, - source='base', + source="base", target=target, resolved_from=(base_set.ref,), is_placeholder=_is_placeholder_value(base_set.info, base_f), notes=(), ) - transfer_notes: list[str] = ['missing in base set'] + transfer_notes: list[str] = ["missing in base set"] for transfer in policy.transfers: if isinstance(transfer, SubstitutionTransfer): - result, note = _apply_substitution_transfer(sym, target=target, transfer=transfer) + result, note = _apply_substitution_transfer( + sym, target=target, transfer=transfer + ) elif isinstance(transfer, LinearTransfer): - result, note = _apply_linear_transfer(sym, base=policy.base, target=target, transfer=transfer) + result, note = _apply_linear_transfer( + sym, base=policy.base, target=target, transfer=transfer + ) else: # pragma: no cover - closed union today - raise PolicyError(f'unsupported transfer model: {type(transfer)!r}') + raise PolicyError(f"unsupported transfer model: {type(transfer)!r}") if result is not None: return result @@ -248,14 +268,14 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes if policy.fallback is not None: return LookupResult( value=float(policy.fallback), - source='fallback', + source="fallback", target=target, - notes=tuple(transfer_notes + ['using fallback value']), + notes=tuple(transfer_notes + ["using fallback value"]), ) return LookupResult( value=None, - source='missing', + source="missing", target=target, notes=tuple(transfer_notes), ) diff --git a/src/atomref/radii.py b/src/atomref/radii.py index 7ce73ed..1095667 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -9,17 +9,22 @@ from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol from .errors import PolicyError from .policy import LookupResult, ValuePolicy, _fit_transfer_model, _resolve_value -from .registry import DatasetInfo, DatasetRef, ElementScalarSet, get_dataset_info, list_dataset_ids +from .registry import ( + DatasetInfo, + DatasetRef, + ElementScalarSet, + get_dataset_info, + list_dataset_ids, +) from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel - -RadiiKind = Literal['covalent', 'van_der_waals'] +RadiiKind = Literal["covalent", "van_der_waals"] RadiiSet = ElementScalarSet _KIND_TO_QUANTITY = { - 'covalent': 'covalent_radius', - 'van_der_waals': 'van_der_waals_radius', + "covalent": "covalent_radius", + "van_der_waals": "van_der_waals_radius", } @@ -35,9 +40,11 @@ def as_value_policy(self) -> ValuePolicy[str]: quantity = _quantity_for_kind(self.kind) if isinstance(self.base_set, ElementScalarSet): if self.base_set.ref.quantity != quantity: - raise PolicyError( - f'base_set quantity {self.base_set.ref.quantity!r} is incompatible with radii kind {self.kind!r}' + msg = ( + f"base_set quantity {self.base_set.ref.quantity!r} " + f"is incompatible with radii kind {self.kind!r}" ) + raise PolicyError(msg) base = self.base_set else: base = DatasetRef(quantity, self.base_set) @@ -46,7 +53,7 @@ def as_value_policy(self) -> ValuePolicy[str]: for key, value in self.overrides.items(): sym = 
_normalize_radii_symbol(key) if sym is None or not is_valid_element_symbol(sym): - raise PolicyError(f'invalid override element symbol: {key!r}') + raise PolicyError(f"invalid override element symbol: {key!r}") normalized_overrides[sym] = float(value) return ValuePolicy( @@ -90,13 +97,13 @@ def _quantity_for_kind(kind: RadiiKind) -> str: try: return _KIND_TO_QUANTITY[kind] except KeyError as exc: - raise PolicyError(f'unknown radii kind: {kind!r}') from exc + raise PolicyError(f"unknown radii kind: {kind!r}") from exc def _normalize_radii_symbol(symbol: str | None) -> str | None: cand = canonicalize_element_symbol(symbol) - if cand in {'D', 'T'}: - cand = 'H' + if cand in {"D", "T"}: + cand = "H" return cand @@ -105,14 +112,18 @@ def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: for token in elements: sym = _normalize_radii_symbol(token) if sym is None: - raise ValueError('missing element symbol') + raise ValueError("missing element symbol") if not is_valid_element_symbol(sym): - raise ValueError(f'invalid element symbol: {sym!r}') + raise ValueError(f"invalid element symbol: {sym!r}") symbols.add(sym) - return tuple(sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0)) + return tuple( + sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0) + ) -def list_radii_sets(kind: RadiiKind, *, usage_role: str | None = None) -> tuple[str, ...]: +def list_radii_sets( + kind: RadiiKind, *, usage_role: str | None = None +) -> tuple[str, ...]: return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) @@ -122,34 +133,44 @@ def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: if policy.kind != expected: - raise PolicyError(f'expected a {expected!r} radii policy, got {policy.kind!r}') + raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: return _resolve_value(symbol, policy=policy.as_value_policy()) -def lookup_covalent_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> LookupResult: +def lookup_covalent_radius( + symbol: str | None, *, policy: RadiiPolicy | None = None +) -> LookupResult: active = DEFAULT_COVALENT_POLICY if policy is None else policy - _validate_policy_kind(active, expected='covalent') + _validate_policy_kind(active, expected="covalent") return _lookup_radius(symbol, policy=active) -def get_covalent_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> float | None: +def get_covalent_radius( + symbol: str | None, *, policy: RadiiPolicy | None = None +) -> float | None: return lookup_covalent_radius(symbol, policy=policy).value -def lookup_vdw_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> LookupResult: +def lookup_vdw_radius( + symbol: str | None, *, policy: RadiiPolicy | None = None +) -> LookupResult: active = DEFAULT_VDW_POLICY if policy is None else policy - _validate_policy_kind(active, expected='van_der_waals') + _validate_policy_kind(active, expected="van_der_waals") return _lookup_radius(symbol, policy=active) -def get_vdw_radius(symbol: str | None, *, policy: RadiiPolicy | None = None) -> float | None: +def get_vdw_radius( + symbol: str | None, *, policy: RadiiPolicy | None = None +) -> float | None: return lookup_vdw_radius(symbol, policy=policy).value -def assess_radii_policy(elements: Iterable[str], *, policy: RadiiPolicy, detail: bool = 
False) -> RadiiPolicyAssessment: +def assess_radii_policy( + elements: Iterable[str], *, policy: RadiiPolicy, detail: bool = False +) -> RadiiPolicyAssessment: elems = _normalize_assessment_elements(elements) value_policy = policy.as_value_policy() @@ -167,17 +188,17 @@ def assess_radii_policy(elements: Iterable[str], *, policy: RadiiPolicy, detail: for symbol in elems: lookup = _resolve_value(symbol, policy=value_policy) - if lookup.source == 'override': + if lookup.source == "override": n_override += 1 - elif lookup.source == 'base': + elif lookup.source == "base": n_base += 1 - elif lookup.source == 'transfer_substitution': + elif lookup.source == "transfer_substitution": n_transfer_substitution += 1 - elif lookup.source == 'transfer_linear': + elif lookup.source == "transfer_linear": n_transfer_linear += 1 - elif lookup.source == 'fallback': + elif lookup.source == "fallback": n_fallback += 1 - elif lookup.source == 'missing': + elif lookup.source == "missing": n_missing += 1 missing_symbols.append(symbol) @@ -221,13 +242,15 @@ def assess_radii_policy(elements: Iterable[str], *, policy: RadiiPolicy, detail: DEFAULT_COVALENT_POLICY = RadiiPolicy( - kind='covalent', - base_set='cordero2008', - transfers=(SubstitutionTransfer(source=DatasetRef('covalent_radius', 'csd_legacy_cov')),), + kind="covalent", + base_set="cordero2008", + transfers=( + SubstitutionTransfer(source=DatasetRef("covalent_radius", "csd_legacy_cov")), + ), ) DEFAULT_VDW_POLICY = RadiiPolicy( - kind='van_der_waals', - base_set='alvarez2013', - transfers=(LinearTransfer(predictors=(DatasetRef('atomic_radius', 'rahm2016'),)),), + kind="van_der_waals", + base_set="alvarez2013", + transfers=(LinearTransfer(predictors=(DatasetRef("atomic_radius", "rahm2016"),)),), ) diff --git a/src/atomref/registry.py b/src/atomref/registry.py index 458d202..f84b14b 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -12,7 +12,6 @@ from .elements import canonicalize_element_symbol, get_element, iter_elements from .errors import DatasetError - QuantityId = str DomainId = str @@ -89,14 +88,14 @@ def from_mapping( name: str, units: str | None, description: str | None = None, - usage_role: str = 'user', - semantic_class: str = 'user', - origin_class: str = 'user', + usage_role: str = "user", + semantic_class: str = "user", + origin_class: str = "user", phase_context: str | None = None, references: Iterable[Reference] = (), notes: Iterable[str] = (), placeholder_value: float | None = None, - ) -> 'ElementScalarSet': + ) -> "ElementScalarSet": n_z = max(e.z for e in iter_elements()) values_by_z: list[float | None] = [None] * (n_z + 1) @@ -104,10 +103,12 @@ def from_mapping( sym = _normalize_element_domain_symbol(key) elem = get_element(sym) if elem is None: - raise DatasetError(f'invalid element symbol in custom set: {key!r}') + raise DatasetError(f"invalid element symbol in custom set: {key!r}") values_by_z[elem.z] = None if value is None else float(value) - covered_z = tuple(z for z, value in enumerate(values_by_z) if z > 0 and value is not None) + covered_z = tuple( + z for z, value in enumerate(values_by_z) if z > 0 and value is not None + ) has_placeholders = False if placeholder_value is not None: has_placeholders = any( @@ -117,7 +118,7 @@ def from_mapping( info = DatasetInfo( ref=ref, - domain='element', + domain="element", units=units, name=name, description=description, @@ -154,39 +155,39 @@ def get(self, symbol: str | None) -> float | None: def _normalize_element_domain_symbol(symbol: str | None) -> str | None: cand = 
canonicalize_element_symbol(symbol) - if cand in {'D', 'T'}: - return 'H' + if cand in {"D", "T"}: + return "H" return cand @lru_cache(maxsize=1) def _load_registry_json() -> dict[str, object]: - path = resources.files('atomref.data').joinpath('registry.json') - with path.open('r', encoding='utf-8') as handle: + path = resources.files("atomref.data").joinpath("registry.json") + with path.open("r", encoding="utf-8") as handle: data = json.load(handle) if not isinstance(data, dict): - raise DatasetError('invalid registry.json: expected JSON object') + raise DatasetError("invalid registry.json: expected JSON object") return data def _get_quantities_mapping() -> Mapping[str, object]: - quantities = _load_registry_json().get('quantities') + quantities = _load_registry_json().get("quantities") if not isinstance(quantities, dict): - raise DatasetError('invalid registry.json: missing quantities mapping') + raise DatasetError("invalid registry.json: missing quantities mapping") return quantities def _get_datasets_mapping() -> Mapping[str, object]: - datasets = _load_registry_json().get('datasets') + datasets = _load_registry_json().get("datasets") if not isinstance(datasets, dict): - raise DatasetError('invalid registry.json: missing datasets mapping') + raise DatasetError("invalid registry.json: missing datasets mapping") return datasets def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: datasets = _get_datasets_mapping().get(quantity) if not isinstance(datasets, dict): - raise DatasetError(f'unknown quantity: {quantity!r}') + raise DatasetError(f"unknown quantity: {quantity!r}") return datasets @@ -197,17 +198,21 @@ def list_quantities() -> tuple[str, ...]: def get_quantity_info(quantity: QuantityId) -> QuantityInfo: raw = _get_quantities_mapping().get(quantity) if not isinstance(raw, dict): - raise DatasetError(f'unknown quantity: {quantity!r}') - domain = raw.get('domain') if isinstance(raw.get('domain'), str) else None + raise DatasetError(f"unknown quantity: {quantity!r}") + domain = raw.get("domain") if isinstance(raw.get("domain"), str) else None if domain is None: - raise DatasetError(f'missing domain for quantity: {quantity!r}') - units = raw.get('units') if isinstance(raw.get('units'), str) else None - description = raw.get('description') if isinstance(raw.get('description'), str) else None - return QuantityInfo(quantity=quantity, domain=domain, units=units, description=description) + raise DatasetError(f"missing domain for quantity: {quantity!r}") + units = raw.get("units") if isinstance(raw.get("units"), str) else None + description = ( + raw.get("description") if isinstance(raw.get("description"), str) else None + ) + return QuantityInfo( + quantity=quantity, domain=domain, units=units, description=description + ) def _canonicalize_alias_token(value: str) -> str: - return ' '.join(value.strip().lower().split()) + return " ".join(value.strip().lower().split()) def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: @@ -220,15 +225,20 @@ def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: if _canonicalize_alias_token(actual_id) == wanted: return actual_id if isinstance(raw_entry, dict): - aliases = raw_entry.get('aliases', ()) + aliases = raw_entry.get("aliases", ()) if isinstance(aliases, list): for alias in aliases: - if isinstance(alias, str) and _canonicalize_alias_token(alias) == wanted: + if ( + isinstance(alias, str) + and _canonicalize_alias_token(alias) == wanted + ): return actual_id - raise DatasetError(f'unknown dataset id for 
{quantity!r}: {set_id!r}') + raise DatasetError(f"unknown dataset id for {quantity!r}: {set_id!r}") -def list_dataset_ids(quantity: QuantityId, *, usage_role: str | None = None) -> tuple[str, ...]: +def list_dataset_ids( + quantity: QuantityId, *, usage_role: str | None = None +) -> tuple[str, ...]: dataset_ids = tuple(_datasets_for_quantity(quantity).keys()) if usage_role is None: return dataset_ids @@ -237,7 +247,7 @@ def list_dataset_ids(quantity: QuantityId, *, usage_role: str | None = None) -> wanted = usage_role.strip().lower() for set_id in dataset_ids: info = get_dataset_info(DatasetRef(quantity, set_id)) - role = (info.usage_role or '').strip().lower() + role = (info.usage_role or "").strip().lower() if role == wanted: filtered.append(set_id) return tuple(filtered) @@ -245,31 +255,33 @@ def list_dataset_ids(quantity: QuantityId, *, usage_role: str | None = None) -> def _coerce_reference(obj: object) -> Reference: if not isinstance(obj, dict): - raise DatasetError('invalid reference entry in registry.json') + raise DatasetError("invalid reference entry in registry.json") return Reference( - authors=obj.get('authors') if isinstance(obj.get('authors'), str) else None, - year=obj.get('year') if isinstance(obj.get('year'), int) else None, - title=obj.get('title') if isinstance(obj.get('title'), str) else None, - venue=obj.get('venue') if isinstance(obj.get('venue'), str) else None, - doi=obj.get('doi') if isinstance(obj.get('doi'), str) else None, - url=obj.get('url') if isinstance(obj.get('url'), str) else None, - publisher=obj.get('publisher') if isinstance(obj.get('publisher'), str) else None, - note=obj.get('note') if isinstance(obj.get('note'), str) else None, + authors=obj.get("authors") if isinstance(obj.get("authors"), str) else None, + year=obj.get("year") if isinstance(obj.get("year"), int) else None, + title=obj.get("title") if isinstance(obj.get("title"), str) else None, + venue=obj.get("venue") if isinstance(obj.get("venue"), str) else None, + doi=obj.get("doi") if isinstance(obj.get("doi"), str) else None, + url=obj.get("url") if isinstance(obj.get("url"), str) else None, + publisher=( + obj.get("publisher") if isinstance(obj.get("publisher"), str) else None + ), + note=obj.get("note") if isinstance(obj.get("note"), str) else None, ) def _coerce_coverage(obj: object) -> CoverageInfo | None: if not isinstance(obj, dict): return None - covered = obj.get('covered_z') - missing = obj.get('missing_z') + covered = obj.get("covered_z") + missing = obj.get("missing_z") covered_z = tuple(int(z) for z in covered) if isinstance(covered, list) else () missing_z = tuple(int(z) for z in missing) if isinstance(missing, list) else () return CoverageInfo( - n_values=int(obj['n_values']), - z_min=int(obj['z_min']) if isinstance(obj.get('z_min'), int) else None, - z_max=int(obj['z_max']) if isinstance(obj.get('z_max'), int) else None, - has_placeholders=bool(obj.get('has_placeholders', False)), + n_values=int(obj["n_values"]), + z_min=int(obj["z_min"]) if isinstance(obj.get("z_min"), int) else None, + z_max=int(obj["z_max"]) if isinstance(obj.get("z_max"), int) else None, + has_placeholders=bool(obj.get("has_placeholders", False)), covered_z=covered_z, missing_z=missing_z, ) @@ -282,61 +294,115 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: quantities = _get_quantities_mapping() quantity_info = quantities.get(actual_ref.quantity) if not isinstance(quantity_info, dict): - raise DatasetError(f'unknown quantity: {actual_ref.quantity!r}') + raise DatasetError(f"unknown quantity: 
{actual_ref.quantity!r}") - units = quantity_info.get('units') if isinstance(quantity_info.get('units'), str) else None - domain = quantity_info.get('domain') if isinstance(quantity_info.get('domain'), str) else None + units = ( + quantity_info.get("units") + if isinstance(quantity_info.get("units"), str) + else None + ) + domain = ( + quantity_info.get("domain") + if isinstance(quantity_info.get("domain"), str) + else None + ) if domain is None: - raise DatasetError(f'missing domain for quantity: {actual_ref.quantity!r}') + raise DatasetError(f"missing domain for quantity: {actual_ref.quantity!r}") raw_entry = _datasets_for_quantity(actual_ref.quantity).get(actual_ref.set_id) if not isinstance(raw_entry, dict): - raise DatasetError(f'unknown dataset: {actual_ref}') + raise DatasetError(f"unknown dataset: {actual_ref}") - refs_raw = raw_entry.get('references', []) - references = tuple(_coerce_reference(item) for item in refs_raw) if isinstance(refs_raw, list) else () - aliases_raw = raw_entry.get('aliases', []) - aliases = tuple(item for item in aliases_raw if isinstance(item, str)) if isinstance(aliases_raw, list) else () - notes_raw = raw_entry.get('notes', []) - notes = tuple(item for item in notes_raw if isinstance(item, str)) if isinstance(notes_raw, list) else () - storage = raw_entry.get('storage') if isinstance(raw_entry.get('storage'), dict) else None + refs_raw = raw_entry.get("references", []) + references = ( + tuple(_coerce_reference(item) for item in refs_raw) + if isinstance(refs_raw, list) + else () + ) + aliases_raw = raw_entry.get("aliases", []) + aliases = ( + tuple(item for item in aliases_raw if isinstance(item, str)) + if isinstance(aliases_raw, list) + else () + ) + notes_raw = raw_entry.get("notes", []) + notes = ( + tuple(item for item in notes_raw if isinstance(item, str)) + if isinstance(notes_raw, list) + else () + ) + storage = ( + raw_entry.get("storage") if isinstance(raw_entry.get("storage"), dict) else None + ) return DatasetInfo( ref=actual_ref, domain=domain, units=units, - name=raw_entry.get('name') if isinstance(raw_entry.get('name'), str) else actual_ref.set_id, - description=raw_entry.get('description') if isinstance(raw_entry.get('description'), str) else None, - usage_role=raw_entry.get('usage_role') if isinstance(raw_entry.get('usage_role'), str) else None, - semantic_class=raw_entry.get('semantic_class') if isinstance(raw_entry.get('semantic_class'), str) else None, - origin_class=raw_entry.get('origin_class') if isinstance(raw_entry.get('origin_class'), str) else None, - phase_context=raw_entry.get('phase_context') if isinstance(raw_entry.get('phase_context'), str) else None, - method_summary=raw_entry.get('method_summary') if isinstance(raw_entry.get('method_summary'), str) else None, + name=( + raw_entry.get("name") + if isinstance(raw_entry.get("name"), str) + else actual_ref.set_id + ), + description=( + raw_entry.get("description") + if isinstance(raw_entry.get("description"), str) + else None + ), + usage_role=( + raw_entry.get("usage_role") + if isinstance(raw_entry.get("usage_role"), str) + else None + ), + semantic_class=( + raw_entry.get("semantic_class") + if isinstance(raw_entry.get("semantic_class"), str) + else None + ), + origin_class=( + raw_entry.get("origin_class") + if isinstance(raw_entry.get("origin_class"), str) + else None + ), + phase_context=( + raw_entry.get("phase_context") + if isinstance(raw_entry.get("phase_context"), str) + else None + ), + method_summary=( + raw_entry.get("method_summary") + if 
isinstance(raw_entry.get("method_summary"), str) + else None + ), placeholder_value=( - float(raw_entry['placeholder_value']) - if raw_entry.get('placeholder_value') is not None + float(raw_entry["placeholder_value"]) + if raw_entry.get("placeholder_value") is not None + else None + ), + extraction_source=( + raw_entry.get("extraction_source") + if isinstance(raw_entry.get("extraction_source"), str) else None ), - extraction_source=raw_entry.get('extraction_source') if isinstance(raw_entry.get('extraction_source'), str) else None, aliases=aliases, references=references, notes=notes, storage=storage, - coverage=_coerce_coverage(raw_entry.get('coverage')), + coverage=_coerce_coverage(raw_entry.get("coverage")), ) @lru_cache(maxsize=None) def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: - path = resources.files('atomref.data').joinpath(filename) - with path.open('r', encoding='utf-8', newline='') as handle: + path = resources.files("atomref.data").joinpath(filename) + with path.open("r", encoding="utf-8", newline="") as handle: reader = csv.DictReader(handle) - if reader.fieldnames is None or 'z' not in reader.fieldnames: - raise DatasetError(f'invalid CSV file: {filename!r}') - columns = [name for name in reader.fieldnames if name != 'z'] + if reader.fieldnames is None or "z" not in reader.fieldnames: + raise DatasetError(f"invalid CSV file: {filename!r}") + columns = [name for name in reader.fieldnames if name != "z"] values: dict[str, list[float | None]] = {name: [None] * 119 for name in columns} for row in reader: - z_text = row.get('z') + z_text = row.get("z") if z_text is None: continue z = int(z_text) @@ -353,19 +419,21 @@ def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: @lru_cache(maxsize=None) def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: info = get_dataset_info(ref) - if info.domain != 'element': - raise DatasetError(f'only element-domain datasets are supported in v0.1: {info.ref!r}') + if info.domain != "element": + raise DatasetError( + f"only element-domain datasets are supported in v0.1: {info.ref!r}" + ) if not isinstance(info.storage, Mapping): - raise DatasetError(f'missing storage metadata for dataset: {info.ref!r}') + raise DatasetError(f"missing storage metadata for dataset: {info.ref!r}") - filename = info.storage.get('filename') - column = info.storage.get('column') + filename = info.storage.get("filename") + column = info.storage.get("column") if not isinstance(filename, str) or not isinstance(column, str): - raise DatasetError(f'invalid storage metadata for dataset: {info.ref!r}') + raise DatasetError(f"invalid storage metadata for dataset: {info.ref!r}") table = _load_csv_columns(filename) if column not in table: - raise DatasetError(f'column {column!r} not found in {filename!r}') + raise DatasetError(f"column {column!r} not found in {filename!r}") return ElementScalarSet(ref=info.ref, info=info, values_by_z=table[column]) diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py index 9eb16de..e84a4f0 100644 --- a/tests/radii/test_selection.py +++ b/tests/radii/test_selection.py @@ -7,94 +7,93 @@ def test_get_covalent_radius_default_prefers_cordero() -> None: - assert ar.get_covalent_radius('C') == pytest.approx(0.76) + assert ar.get_covalent_radius("C") == pytest.approx(0.76) def test_get_covalent_radius_maps_deuterium_to_hydrogen() -> None: - assert ar.get_covalent_radius('D') == pytest.approx(0.31) + assert ar.get_covalent_radius("D") == pytest.approx(0.31) def 
test_get_vdw_radius_default_prefers_alvarez() -> None: - assert ar.get_vdw_radius('C') == pytest.approx(1.77) + assert ar.get_vdw_radius("C") == pytest.approx(1.77) def test_completion_is_used_for_missing_base_values() -> None: - m = ar.lookup_covalent_radius('Bk') + m = ar.lookup_covalent_radius("Bk") assert m.value is not None - assert m.source == 'transfer_substitution' + assert m.source == "transfer_substitution" - m2 = ar.lookup_vdw_radius('Pm') + m2 = ar.lookup_vdw_radius("Pm") assert m2.value is not None - assert m2.source == 'transfer_linear' + assert m2.source == "transfer_linear" assert m2.value == pytest.approx(2.897226539514835) def test_linear_transfer_rejects_placeholder_values() -> None: scheme = ar.RadiiPolicy( - kind='van_der_waals', - base_set='bondi1964', + kind="van_der_waals", + base_set="bondi1964", transfers=( ar.LinearTransfer( - predictors=(ar.DatasetRef('van_der_waals_radius', 'csd_legacy_vdw'),) + predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),) ), ), ) - m = ar.lookup_vdw_radius('Be', policy=scheme) + m = ar.lookup_vdw_radius("Be", policy=scheme) assert m.value is None - assert m.source == 'missing' - assert any('placeholder' in s for s in m.notes) + assert m.source == "missing" + assert any("placeholder" in s for s in m.notes) def test_lookup_float_conversion() -> None: - m = ar.lookup_covalent_radius('C') + m = ar.lookup_covalent_radius("C") assert float(m) == pytest.approx(0.76) - m_missing = ar.lookup_covalent_radius('Xx') + m_missing = ar.lookup_covalent_radius("Xx") with pytest.raises(TypeError): float(m_missing) def test_override_precedes_base_value() -> None: policy = ar.RadiiPolicy( - kind='covalent', - base_set='cordero2008', - overrides={'C': 9.99}, + kind="covalent", + base_set="cordero2008", + overrides={"C": 9.99}, ) - lookup = ar.lookup_covalent_radius('C', policy=policy) - assert lookup.source == 'override' + lookup = ar.lookup_covalent_radius("C", policy=policy) + assert lookup.source == "override" assert lookup.value == pytest.approx(9.99) def test_fallback_is_used_only_after_transfers_fail() -> None: policy = ar.RadiiPolicy( - kind='van_der_waals', - base_set='bondi1964', + kind="van_der_waals", + base_set="bondi1964", transfers=( ar.LinearTransfer( - predictors=(ar.DatasetRef('van_der_waals_radius', 'csd_legacy_vdw'),) + predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),) ), ), fallback=2.5, ) - lookup = ar.lookup_vdw_radius('Be', policy=policy) - assert lookup.source == 'fallback' + lookup = ar.lookup_vdw_radius("Be", policy=policy) + assert lookup.source == "fallback" assert lookup.value == pytest.approx(2.5) - assert any('placeholder' in note for note in lookup.notes) + assert any("placeholder" in note for note in lookup.notes) def test_linear_transfer_rejects_multiple_predictors_in_v0_1() -> None: policy = ar.RadiiPolicy( - kind='van_der_waals', - base_set='alvarez2013', + kind="van_der_waals", + base_set="alvarez2013", transfers=( ar.LinearTransfer( predictors=( - ar.DatasetRef('atomic_radius', 'rahm2016'), - ar.DatasetRef('covalent_radius', 'cordero2008'), + ar.DatasetRef("atomic_radius", "rahm2016"), + ar.DatasetRef("covalent_radius", "cordero2008"), ) ), ), ) with pytest.raises(PolicyError): - ar.lookup_vdw_radius('Pm', policy=policy) - + ar.lookup_vdw_radius("Pm", policy=policy) From 85cd280452b086e7cc8a563f0d27c45603e4b32f Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 19:09:48 +0300 Subject: [PATCH 05/15] Tightens CI --- .github/workflows/ci.yml | 45 ++++++++++++++++- 
.github/workflows/docs.yml | 2 +- tests/meta/test_package_data.py | 26 ++++++++++ tests/meta/test_public_api.py | 34 +++++++++++++ tools/check_dist.py | 88 +++++++++++++++++++++++++++++++++ 5 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 tests/meta/test_package_data.py create mode 100644 tests/meta/test_public_api.py create mode 100644 tools/check_dist.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 38602c6..942ee01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,21 @@ jobs: python -m pip install --upgrade pip python -m pip install .[dev] - name: Lint - run: flake8 src tests + run: flake8 src tests tools + + docs-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install docs extras + run: | + python -m pip install --upgrade pip + python -m pip install .[docs] + - name: Build docs + run: mkdocs build --strict test: runs-on: ubuntu-latest @@ -35,3 +49,32 @@ jobs: python -m pip install .[test] - name: Test run: pytest + + build-dist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build twine + - name: Build distributions + run: python -m build + - name: Validate metadata + run: python -m twine check dist/* + - name: Check packaged files + run: python tools/check_dist.py dist + - name: Install built wheel and smoke-test it + run: | + python -m pip install --force-reinstall --no-deps dist/*.whl + python - <<'PY' + import atomref as ar + + assert ar.get_covalent_radius('C') == 0.76 + assert ar.get_vdw_radius('C') == 1.77 + assert 'atomic_radius' in ar.list_quantities() + assert 'rahm2016' in ar.list_dataset_ids('atomic_radius', usage_role='support') + PY diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 590aad5..70396d7 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -2,7 +2,7 @@ name: Docs on: push: - branches: [main] + branches: [main, master] workflow_dispatch: jobs: diff --git a/tests/meta/test_package_data.py b/tests/meta/test_package_data.py new file mode 100644 index 0000000..e5c393c --- /dev/null +++ b/tests/meta/test_package_data.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from importlib import resources +import json + + +def test_packaged_data_files_are_available() -> None: + data_root = resources.files('atomref.data') + for name in ( + 'periodic_table.csv', + 'covalent.csv', + 'van_der_waals.csv', + 'registry.json', + ): + assert data_root.joinpath(name).is_file(), name + + +def test_packaged_registry_keeps_atomic_support_classification() -> None: + data_root = resources.files('atomref.data') + raw = json.loads(data_root.joinpath('registry.json').read_text(encoding='utf-8')) + + assert 'atomic_radius' in raw['datasets'] + rahm = raw['datasets']['atomic_radius']['rahm2016'] + assert rahm['usage_role'] == 'support' + assert rahm['semantic_class'] == 'atomic_isodensity' + assert rahm['phase_context'] == 'isolated_atom' diff --git a/tests/meta/test_public_api.py b/tests/meta/test_public_api.py new file mode 100644 index 0000000..b64f77d --- /dev/null +++ b/tests/meta/test_public_api.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import atomref as ar + + +REQUIRED_PUBLIC_NAMES = { + 'Element', + 'DatasetRef', + 'DatasetInfo', + 
'ElementScalarSet', + 'QuantityInfo', + 'LookupResult', + 'RadiiPolicy', + 'DEFAULT_COVALENT_POLICY', + 'DEFAULT_VDW_POLICY', + 'LinearTransfer', + 'SubstitutionTransfer', + 'get_covalent_radius', + 'lookup_covalent_radius', + 'get_vdw_radius', + 'lookup_vdw_radius', + 'list_quantities', + 'list_dataset_ids', + 'list_radii_sets', +} + + +def test___all___exports_existing_objects() -> None: + for name in ar.__all__: + assert hasattr(ar, name), name + + +def test_core_public_api_names_are_exported() -> None: + assert REQUIRED_PUBLIC_NAMES.issubset(set(ar.__all__)) diff --git a/tools/check_dist.py b/tools/check_dist.py new file mode 100644 index 0000000..3eb4c66 --- /dev/null +++ b/tools/check_dist.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +import tarfile +import zipfile + + +REQUIRED_WHEEL_MEMBERS = { + 'atomref/data/periodic_table.csv', + 'atomref/data/covalent.csv', + 'atomref/data/van_der_waals.csv', + 'atomref/data/registry.json', + 'atomref/py.typed', +} + +REQUIRED_SDIST_SUFFIXES = { + 'src/atomref/data/periodic_table.csv', + 'src/atomref/data/covalent.csv', + 'src/atomref/data/van_der_waals.csv', + 'src/atomref/data/registry.json', + 'src/atomref/py.typed', + 'README.md', + 'LICENSE', + 'pyproject.toml', +} + + +class DistCheckError(RuntimeError): + """Raised when a built distribution is missing required members.""" + + +def _assert_members_present( + actual: set[str], required: set[str], *, label: str +) -> None: + missing = sorted(required - actual) + if missing: + joined = ', '.join(missing) + raise DistCheckError(f'{label} is missing required members: {joined}') + + +def _members_matching_suffixes(actual: set[str], suffixes: set[str]) -> set[str]: + matched: set[str] = set() + for suffix in suffixes: + if any(name.endswith(suffix) for name in actual): + matched.add(suffix) + return matched + + +def check_wheel(path: Path) -> None: + with zipfile.ZipFile(path) as zf: + names = set(zf.namelist()) + matched = { + member + for member in REQUIRED_WHEEL_MEMBERS + if any(name.endswith(member) for name in names) + } + _assert_members_present(matched, REQUIRED_WHEEL_MEMBERS, label=path.name) + + +def check_sdist(path: Path) -> None: + with tarfile.open(path, 'r:gz') as tf: + names = {member.name for member in tf.getmembers()} + matched = _members_matching_suffixes(names, REQUIRED_SDIST_SUFFIXES) + _assert_members_present(matched, REQUIRED_SDIST_SUFFIXES, label=path.name) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('dist_dir', type=Path, nargs='?', default=Path('dist')) + args = parser.parse_args() + + dist_dir = args.dist_dir + wheels = sorted(dist_dir.glob('*.whl')) + sdists = sorted(dist_dir.glob('*.tar.gz')) + if not wheels: + raise DistCheckError(f'no wheel files found in {dist_dir}') + if not sdists: + raise DistCheckError(f'no source distributions found in {dist_dir}') + + for wheel in wheels: + check_wheel(wheel) + for sdist in sdists: + check_sdist(sdist) + + +if __name__ == '__main__': + main() From 5bca61d0e325f2c113ec058401993f0568b656bf Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 19:41:14 +0300 Subject: [PATCH 06/15] Tightens data validation --- .github/workflows/ci.yml | 2 + README.md | 3 + docs/dev/data_curation.md | 19 +++ docs/index.md | 3 + tests/meta/test_registry_integrity.py | 80 ++++++++++++ tools/check_registry.py | 172 ++++++++++++++++++++++++++ 6 files changed, 279 insertions(+) create mode 100644 tests/meta/test_registry_integrity.py create 
mode 100644 tools/check_registry.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 942ee01..3512c2a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,8 @@ jobs: python -m pip install .[dev] - name: Lint run: flake8 src tests tools + - name: Validate packaged registry + run: python tools/check_registry.py docs-check: runs-on: ubuntu-latest diff --git a/README.md b/README.md index fc2a6fd..5514004 100644 --- a/README.md +++ b/README.md @@ -75,3 +75,6 @@ fits naturally beneath: Those packages should consume atomic reference data from `atomref` rather than re-curating such datasets independently. + +For data-curation changes, validate the packaged registry against the bundled +CSV tables with `python tools/check_registry.py`. diff --git a/docs/dev/data_curation.md b/docs/dev/data_curation.md index 02f406b..689ae24 100644 --- a/docs/dev/data_curation.md +++ b/docs/dev/data_curation.md @@ -5,3 +5,22 @@ metadata and provenance live in `src/atomref/data/registry.json`. Placeholder values are modeled as dataset metadata, not as hard-coded Python constants. + +The registry distinguishes several orthogonal concerns: + +- `quantity` — the operational lookup target, such as `covalent_radius` or + `van_der_waals_radius` +- `semantic_class` — what the dataset scientifically represents +- `usage_role` — whether the dataset is intended as a direct target set or as + support data for transfer +- `phase_context` — the physical context of the underlying values + +This matters for support-only datasets such as `atomic_radius:rahm2016`, which +is packaged as atomic support data and then used by the default van der Waals +policy through linear transfer. + +To check that metadata and packaged tables stay synchronized, run: + +```bash +python tools/check_registry.py +``` diff --git a/docs/index.md b/docs/index.md index fc2a6fd..5514004 100644 --- a/docs/index.md +++ b/docs/index.md @@ -75,3 +75,6 @@ fits naturally beneath: Those packages should consume atomic reference data from `atomref` rather than re-curating such datasets independently. + +For data-curation changes, validate the packaged registry against the bundled +CSV tables with `python tools/check_registry.py`. 
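The classification split described in `docs/dev/data_curation.md` above is also visible through the public API, so downstream code does not have to read `registry.json` directly. A minimal sketch; the commented values mirror the `rahm2016` metadata asserted in the packaged-data tests:

```python
import atomref as ar

# Operational side: the quantity that lookups are keyed on.
print(ar.get_quantity_info("atomic_radius"))

# Scientific side: how the support-only rahm2016 set is classified.
rahm = ar.get_dataset_info(ar.DatasetRef("atomic_radius", "rahm2016"))
print(rahm.usage_role)      # 'support'
print(rahm.semantic_class)  # 'atomic_isodensity'
print(rahm.phase_context)   # 'isolated_atom'
```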
diff --git a/tests/meta/test_registry_integrity.py b/tests/meta/test_registry_integrity.py new file mode 100644 index 0000000..853df5c --- /dev/null +++ b/tests/meta/test_registry_integrity.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from collections import defaultdict +from dataclasses import asdict + +import atomref as ar +from atomref.registry import get_builtin_set + +_ALLOWED_USAGE_ROLES = {"target", "support"} + + +def _canonical_token(value: str) -> str: + return " ".join(value.strip().lower().split()) + + +def test_dataset_aliases_are_unique_within_each_quantity() -> None: + for quantity in ar.list_quantities(): + seen: dict[str, str] = {} + for set_id in ar.list_dataset_ids(quantity): + info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)) + for token in (set_id, *info.aliases): + key = _canonical_token(token) + previous = seen.get(key) + assert previous in (None, set_id) + seen[key] = set_id + + +def test_every_built_in_dataset_loads_and_matches_coverage_metadata() -> None: + for quantity in ar.list_quantities(): + quantity_info = ar.get_quantity_info(quantity) + for set_id in ar.list_dataset_ids(quantity): + ref = ar.DatasetRef(quantity, set_id) + info = ar.get_dataset_info(ref) + dataset = get_builtin_set(ref) + + assert info.domain == quantity_info.domain + assert info.units == quantity_info.units + assert info.usage_role in _ALLOWED_USAGE_ROLES + assert info.references + assert info.coverage is not None + + max_z = ( + info.coverage.z_max + if info.coverage.z_max is not None + else len(dataset.values_by_z) - 1 + ) + covered_z = tuple( + z + for z, value in enumerate(dataset.values_by_z) + if z > 0 and value is not None and z <= max_z + ) + covered_set = set(covered_z) + missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set) + has_placeholders = info.placeholder_value is not None and any( + value is not None and abs(value - info.placeholder_value) < 1e-12 + for value in dataset.values_by_z[1 : max_z + 1] + ) + + coverage = asdict(info.coverage) + assert coverage["n_values"] == len(covered_z) + assert coverage["z_min"] == (min(covered_z) if covered_z else None) + assert coverage["z_max"] == (max(covered_z) if covered_z else None) + assert coverage["has_placeholders"] is has_placeholders + if coverage["covered_z"]: + assert tuple(coverage["covered_z"]) == covered_z + if coverage["missing_z"]: + assert tuple(coverage["missing_z"]) == missing_z + + +def test_non_atomic_quantities_have_at_least_one_target_dataset() -> None: + by_role: dict[str, list[str]] = defaultdict(list) + for quantity in ar.list_quantities(): + for set_id in ar.list_dataset_ids(quantity): + role = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)).usage_role + assert role is not None + by_role[role].append(quantity) + + for quantity in ar.list_quantities(): + if quantity != "atomic_radius": + assert quantity in by_role["target"] diff --git a/tools/check_registry.py b/tools/check_registry.py new file mode 100644 index 0000000..a57f49f --- /dev/null +++ b/tools/check_registry.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +"""Validate packaged registry metadata against bundled CSV tables.""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import asdict +from pathlib import Path +import sys +from typing import Iterable + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +import atomref as ar +from atomref.registry import get_builtin_set + 
+_ALLOWED_USAGE_ROLES = {"target", "support"} + + +def _canonical_token(value: str) -> str: + return " ".join(value.strip().lower().split()) + + +def _iter_dataset_refs() -> Iterable[ar.DatasetRef]: + for quantity in ar.list_quantities(): + for set_id in ar.list_dataset_ids(quantity): + yield ar.DatasetRef(quantity, set_id) + + +def _validate_alias_collisions(errors: list[str]) -> None: + for quantity in ar.list_quantities(): + seen: dict[str, str] = {} + for set_id in ar.list_dataset_ids(quantity): + info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)) + for token in (set_id, *info.aliases): + key = _canonical_token(token) + previous = seen.get(key) + if previous is not None and previous != set_id: + msg = ( + f"alias collision in {quantity!r}: {token!r} resolves to both " + f"{previous!r} and {set_id!r}" + ) + errors.append(msg) + else: + seen[key] = set_id + + +def _validate_dataset_metadata(errors: list[str]) -> None: + quantities = set(ar.list_quantities()) + by_role: dict[str, list[str]] = defaultdict(list) + + for ref in _iter_dataset_refs(): + quantity_info = ar.get_quantity_info(ref.quantity) + info = ar.get_dataset_info(ref) + dataset = get_builtin_set(ref) + + if info.ref != ref: + errors.append(f"dataset ref mismatch: requested {ref!r}, got {info.ref!r}") + + if info.domain != quantity_info.domain: + msg = ( + f"domain mismatch for {ref!r}: quantity={quantity_info.domain!r}, " + f"dataset={info.domain!r}" + ) + errors.append(msg) + + if info.units != quantity_info.units: + msg = ( + f"units mismatch for {ref!r}: quantity={quantity_info.units!r}, " + f"dataset={info.units!r}" + ) + errors.append(msg) + + if info.usage_role not in _ALLOWED_USAGE_ROLES: + errors.append(f"invalid usage_role for {ref!r}: {info.usage_role!r}") + else: + by_role[info.usage_role].append(ref.quantity) + + if not info.references: + errors.append(f"missing references for {ref!r}") + + if info.storage is None: + errors.append(f"missing storage metadata for {ref!r}") + else: + filename = info.storage.get("filename") + column = info.storage.get("column") + fmt = info.storage.get("format") + if not isinstance(filename, str) or not filename: + errors.append(f"invalid storage filename for {ref!r}: {filename!r}") + if not isinstance(column, str) or not column: + errors.append(f"invalid storage column for {ref!r}: {column!r}") + if fmt != "dense_by_z_csv": + errors.append(f"unsupported storage format for {ref!r}: {fmt!r}") + + coverage = info.coverage + if coverage is None: + errors.append(f"missing coverage metadata for {ref!r}") + max_z = len(dataset.values_by_z) - 1 + else: + max_z = ( + coverage.z_max + if coverage.z_max is not None + else len(dataset.values_by_z) - 1 + ) + + covered_z = tuple( + z + for z, value in enumerate(dataset.values_by_z) + if z > 0 and value is not None and z <= max_z + ) + covered_set = set(covered_z) + missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set) + has_placeholders = info.placeholder_value is not None and any( + value is not None and abs(value - info.placeholder_value) < 1e-12 + for value in dataset.values_by_z[1 : max_z + 1] + ) + + if coverage is not None: + expected = { + "n_values": len(covered_z), + "z_min": min(covered_z) if covered_z else None, + "z_max": max(covered_z) if covered_z else None, + "has_placeholders": has_placeholders, + } + actual = asdict(coverage) + for key, value in expected.items(): + if actual[key] != value: + msg = ( + f"coverage mismatch for {ref!r}: {key} is {actual[key]!r}, " + f"expected {value!r}" + ) + 
errors.append(msg) + if actual["covered_z"] and tuple(actual["covered_z"]) != covered_z: + msg = ( + f"coverage mismatch for {ref!r}: covered_z is " + f"{actual['covered_z']!r}, expected {covered_z!r}" + ) + errors.append(msg) + if actual["missing_z"] and tuple(actual["missing_z"]) != missing_z: + msg = ( + f"coverage mismatch for {ref!r}: missing_z is " + f"{actual['missing_z']!r}, expected {missing_z!r}" + ) + errors.append(msg) + + if ref.quantity not in quantities: + errors.append(f"dataset refers to unknown quantity: {ref!r}") + + for quantity in quantities: + if quantity not in by_role.get("target", []) and quantity != "atomic_radius": + errors.append(f"quantity {quantity!r} has no target datasets") + + +def main() -> int: + errors: list[str] = [] + _validate_alias_collisions(errors) + _validate_dataset_metadata(errors) + + if errors: + for error in errors: + print(f"ERROR: {error}") + return 1 + + print("Registry validation passed.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From b84372116c240ee5788b4875c2296be398c12ac9 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 20:27:27 +0300 Subject: [PATCH 07/15] Adds data listing functionality --- README.md | 4 ++-- docs/datasets/index.md | 2 +- docs/guide/quickstart.md | 2 ++ docs/index.md | 4 ++-- src/atomref/__init__.py | 4 ++++ src/atomref/radii.py | 7 +++++++ src/atomref/registry.py | 9 +++++++++ tests/meta/test_public_api.py | 2 ++ tests/registry/test_registry.py | 12 ++++++++++++ tools/check_registry.py | 28 +++++++++++++++++++--------- 10 files changed, 60 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5514004..d9f0d6c 100644 --- a/README.md +++ b/README.md @@ -60,8 +60,8 @@ import atomref as ar print(ar.list_quantities()) print(ar.get_quantity_info("atomic_radius")) -print(ar.list_dataset_ids("van_der_waals_radius", usage_role="target")) -print(ar.list_dataset_ids("atomic_radius", usage_role="support")) +print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) +print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` ## Relationship to the Delone Commons ecosystem diff --git a/docs/datasets/index.md b/docs/datasets/index.md index 1095225..cbd132e 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -9,7 +9,7 @@ The package distinguishes between: This is what keeps support-only datasets such as `rahm2016` usable without misclassifying them as direct condensed-phase vdW radii. -For programmatic inspection, use `atomref.list_quantities()` and `atomref.get_quantity_info(...)`. +For programmatic inspection, use `atomref.list_quantities()`, `atomref.get_quantity_info(...)`, and `atomref.list_dataset_infos(...)`. 
Dataset metadata also carries a package-level `usage_role`, which currently distinguishes direct target sets from support-only sets used for substitution or diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index 5cb1637..eb55fd3 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -22,5 +22,7 @@ import atomref as ar print(ar.list_quantities()) print(ar.get_quantity_info("atomic_radius")) +print(ar.list_dataset_infos("covalent_radius")) +print(ar.list_radii_set_infos("van_der_waals", usage_role="target")) ``` diff --git a/docs/index.md b/docs/index.md index 5514004..d9f0d6c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,8 +60,8 @@ import atomref as ar print(ar.list_quantities()) print(ar.get_quantity_info("atomic_radius")) -print(ar.list_dataset_ids("van_der_waals_radius", usage_role="target")) -print(ar.list_dataset_ids("atomic_radius", usage_role="support")) +print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) +print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` ## Relationship to the Delone Commons ecosystem diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index 1b0ef6b..d08e1ed 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -17,6 +17,7 @@ get_covalent_radius, get_radii_set_info, get_vdw_radius, + list_radii_set_infos, list_radii_sets, lookup_covalent_radius, lookup_vdw_radius, @@ -31,6 +32,7 @@ get_dataset_info, get_quantity_info, list_dataset_ids, + list_dataset_infos, list_quantities, ) from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer @@ -51,6 +53,7 @@ "get_dataset_info", "get_quantity_info", "list_dataset_ids", + "list_dataset_infos", "list_quantities", "LinearFit", "LinearTransfer", @@ -63,6 +66,7 @@ "DEFAULT_COVALENT_POLICY", "DEFAULT_VDW_POLICY", "list_radii_sets", + "list_radii_set_infos", "get_radii_set_info", "lookup_covalent_radius", "get_covalent_radius", diff --git a/src/atomref/radii.py b/src/atomref/radii.py index 1095667..a5ede9e 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -15,6 +15,7 @@ ElementScalarSet, get_dataset_info, list_dataset_ids, + list_dataset_infos, ) from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel @@ -127,6 +128,12 @@ def list_radii_sets( return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) +def list_radii_set_infos( + kind: RadiiKind, *, usage_role: str | None = None +) -> tuple[DatasetInfo, ...]: + return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role) + + def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) diff --git a/src/atomref/registry.py b/src/atomref/registry.py index f84b14b..c465786 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -253,6 +253,15 @@ def list_dataset_ids( return tuple(filtered) +def list_dataset_infos( + quantity: QuantityId, *, usage_role: str | None = None +) -> tuple[DatasetInfo, ...]: + return tuple( + get_dataset_info(DatasetRef(quantity, set_id)) + for set_id in list_dataset_ids(quantity, usage_role=usage_role) + ) + + def _coerce_reference(obj: object) -> Reference: if not isinstance(obj, dict): raise DatasetError("invalid reference entry in registry.json") diff --git a/tests/meta/test_public_api.py b/tests/meta/test_public_api.py index b64f77d..a6cb329 100644 --- a/tests/meta/test_public_api.py +++ b/tests/meta/test_public_api.py @@ -21,7 +21,9 @@ 'lookup_vdw_radius', 
'list_quantities', 'list_dataset_ids', + 'list_dataset_infos', 'list_radii_sets', + 'list_radii_set_infos', } diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py index 23b401e..3cfaec5 100644 --- a/tests/registry/test_registry.py +++ b/tests/registry/test_registry.py @@ -68,3 +68,15 @@ def test_list_dataset_ids_can_filter_by_usage_role() -> None: def test_list_radii_sets_can_filter_by_usage_role() -> None: assert ar.list_radii_sets('covalent', usage_role='support') == ('csd_legacy_cov',) assert 'alvarez2013' in ar.list_radii_sets('van_der_waals', usage_role='target') + + +def test_list_dataset_infos_can_filter_by_usage_role() -> None: + infos = ar.list_dataset_infos('atomic_radius', usage_role='support') + assert tuple(info.ref.set_id for info in infos) == ('rahm2016',) + assert all(info.usage_role == 'support' for info in infos) + + +def test_list_radii_set_infos_can_filter_by_usage_role() -> None: + infos = ar.list_radii_set_infos('van_der_waals', usage_role='target') + assert 'alvarez2013' in {info.ref.set_id for info in infos} + assert all(info.ref.quantity == 'van_der_waals_radius' for info in infos) diff --git a/tools/check_registry.py b/tools/check_registry.py index a57f49f..02b1e14 100644 --- a/tools/check_registry.py +++ b/tools/check_registry.py @@ -5,6 +5,7 @@ from collections import defaultdict from dataclasses import asdict +from importlib import import_module from pathlib import Path import sys from typing import Iterable @@ -14,27 +15,35 @@ if str(SRC) not in sys.path: sys.path.insert(0, str(SRC)) -import atomref as ar -from atomref.registry import get_builtin_set - _ALLOWED_USAGE_ROLES = {"target", "support"} +def _load_atomref_module(): + return import_module("atomref") + + +def _get_builtin_set(ref): + registry = import_module("atomref.registry") + return registry.get_builtin_set(ref) + + def _canonical_token(value: str) -> str: return " ".join(value.strip().lower().split()) -def _iter_dataset_refs() -> Iterable[ar.DatasetRef]: +def _iter_dataset_refs() -> Iterable[object]: + ar = _load_atomref_module() for quantity in ar.list_quantities(): - for set_id in ar.list_dataset_ids(quantity): - yield ar.DatasetRef(quantity, set_id) + for info in ar.list_dataset_infos(quantity): + yield info.ref def _validate_alias_collisions(errors: list[str]) -> None: + ar = _load_atomref_module() for quantity in ar.list_quantities(): seen: dict[str, str] = {} - for set_id in ar.list_dataset_ids(quantity): - info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)) + for info in ar.list_dataset_infos(quantity): + set_id = info.ref.set_id for token in (set_id, *info.aliases): key = _canonical_token(token) previous = seen.get(key) @@ -49,13 +58,14 @@ def _validate_alias_collisions(errors: list[str]) -> None: def _validate_dataset_metadata(errors: list[str]) -> None: + ar = _load_atomref_module() quantities = set(ar.list_quantities()) by_role: dict[str, list[str]] = defaultdict(list) for ref in _iter_dataset_refs(): quantity_info = ar.get_quantity_info(ref.quantity) info = ar.get_dataset_info(ref) - dataset = get_builtin_set(ref) + dataset = _get_builtin_set(ref) if info.ref != ref: errors.append(f"dataset ref mismatch: requested {ref!r}, got {info.ref!r}") From 9cf2b71e17de851e96df5d47943d1b30ecd88983 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 21:54:39 +0300 Subject: [PATCH 08/15] Improves docs --- .github/workflows/ci.yml | 2 + README.md | 12 +++ docs/guide/notebooks.md | 17 ++++ docs/guide/quickstart.md | 6 ++ docs/index.md | 12 +++ 
mkdocs.yml | 1 + notebooks/01-quickstart.ipynb | 77 ++++++++++++++++ notebooks/02-policies-and-assessment.ipynb | 96 ++++++++++++++++++++ notebooks/03-custom-sets-and-discovery.ipynb | 78 ++++++++++++++++ pyproject.toml | 1 + tests/meta/test_notebooks.py | 24 +++++ tools/check_dist.py | 4 + tools/check_notebooks.py | 95 +++++++++++++++++++ 13 files changed, 425 insertions(+) create mode 100644 docs/guide/notebooks.md create mode 100644 notebooks/01-quickstart.ipynb create mode 100644 notebooks/02-policies-and-assessment.ipynb create mode 100644 notebooks/03-custom-sets-and-discovery.ipynb create mode 100644 tests/meta/test_notebooks.py create mode 100644 tools/check_notebooks.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3512c2a..12d3421 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,8 @@ jobs: run: flake8 src tests tools - name: Validate packaged registry run: python tools/check_registry.py + - name: Validate notebooks + run: python tools/check_notebooks.py docs-check: runs-on: ubuntu-latest diff --git a/README.md b/README.md index d9f0d6c..b94dd31 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,18 @@ print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` +## Notebooks + +Hands-on notebooks live in the repository and mirror the main v0.1 workflows: + +- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) + +Open them locally in Jupyter or browse them on GitHub for worked examples of +lookup, transfer-backed policies, dataset discovery, and custom element-scalar +sets. + ## Relationship to the Delone Commons ecosystem `atomref` is intended to be reusable outside the surrounding ecosystem, but it diff --git a/docs/guide/notebooks.md b/docs/guide/notebooks.md new file mode 100644 index 0000000..9d39376 --- /dev/null +++ b/docs/guide/notebooks.md @@ -0,0 +1,17 @@ +# Notebook gallery + +`atomref` ships example Jupyter notebooks that mirror the main v0.1 user +workflows. They live in the repository under `notebooks/` and can be opened +locally with JupyterLab, VS Code, or any other notebook frontend. + +Available notebooks: + +- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) + – basic imports, element helpers, `get_*` vs `lookup_*`, quantity discovery. +- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) + – transfer policies, substitution vs linear transfer, policy assessment. +- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) + – custom user-defined sets, catalog inspection, metadata discovery. + +The notebooks are plain JSON files without heavy execution metadata so they stay +diff-friendly in version control. 
diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index eb55fd3..5017079 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -26,3 +26,9 @@ print(ar.list_dataset_infos("covalent_radius")) print(ar.list_radii_set_infos("van_der_waals", usage_role="target")) ``` +Need runnable versions of these examples? See the notebooks page and the +matching notebook files in the repository: + +- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) diff --git a/docs/index.md b/docs/index.md index d9f0d6c..b94dd31 100644 --- a/docs/index.md +++ b/docs/index.md @@ -64,6 +64,18 @@ print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` +## Notebooks + +Hands-on notebooks live in the repository and mirror the main v0.1 workflows: + +- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) + +Open them locally in Jupyter or browse them on GitHub for worked examples of +lookup, transfer-backed policies, dataset discovery, and custom element-scalar +sets. + ## Relationship to the Delone Commons ecosystem `atomref` is intended to be reusable outside the surrounding ecosystem, but it diff --git a/mkdocs.yml b/mkdocs.yml index 8b5060c..2a97e1a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,7 @@ nav: - Guide: - Install: guide/install.md - Quickstart: guide/quickstart.md + - Notebooks: guide/notebooks.md - Policies: guide/policies.md - Custom sets: guide/custom_sets.md - Non-goals: guide/non_goals.md diff --git a/notebooks/01-quickstart.ipynb b/notebooks/01-quickstart.ipynb new file mode 100644 index 0000000..2c09cc0 --- /dev/null +++ b/notebooks/01-quickstart.ipynb @@ -0,0 +1,77 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# atomref quickstart\n", + "\n", + "This notebook covers the basic public API: element helpers, direct `get_*` calls, provenance-carrying `lookup_*` calls, and quantity / dataset discovery.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n", + "\n", + "print(ar.get_element(\"Cl\"))\n", + "print(ar.list_quantities())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r_c = ar.get_covalent_radius(\"C\")\n", + "r_vdw = ar.get_vdw_radius(\"O\")\n", + "print(r_c)\n", + "print(r_vdw)\n", + "assert r_c == 0.76\n", + "assert r_vdw == 1.50\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lookup = ar.lookup_vdw_radius(\"Pm\")\n", + "print(lookup)\n", + "print(lookup.value)\n", + "print(lookup.source)\n", + "print(lookup.resolved_from)\n", + "assert lookup.source == \"transfer_linear\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "print(ar.get_quantity_info(\"atomic_radius\"))\n", + "for info in ar.list_dataset_infos(\"van_der_waals_radius\", usage_role=\"target\"):\n", + " print(info.ref.set_id, info.semantic_class, info.origin_class)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/02-policies-and-assessment.ipynb b/notebooks/02-policies-and-assessment.ipynb new file mode 100644 index 0000000..7db7e45 --- /dev/null +++ b/notebooks/02-policies-and-assessment.ipynb @@ -0,0 +1,96 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Policies and assessment\n", + "\n", + "This notebook shows how `atomref` resolves missing values through ordered transfer steps and how to inspect policy-level behavior.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "covalent_policy = ar.RadiiPolicy(\n", + " kind=\"covalent\",\n", + " base_set=\"cordero2008\",\n", + " transfers=(\n", + " ar.SubstitutionTransfer(\n", + " source=ar.DatasetRef(\"covalent_radius\", \"csd_legacy_cov\")\n", + " ),\n", + " ),\n", + ")\n", + "\n", + "lookup_bk = ar.lookup_covalent_radius(\"Bk\", policy=covalent_policy)\n", + "print(lookup_bk)\n", + "assert lookup_bk.source == \"transfer_substitution\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vdw_policy = ar.RadiiPolicy(\n", + " kind=\"van_der_waals\",\n", + " base_set=\"alvarez2013\",\n", + " transfers=(\n", + " ar.LinearTransfer(\n", + " predictors=(ar.DatasetRef(\"atomic_radius\", \"rahm2016\"),)\n", + " ),\n", + " ),\n", + ")\n", + "\n", + "lookup_pm = ar.lookup_vdw_radius(\"Pm\", policy=vdw_policy)\n", + "print(lookup_pm.fit)\n", + "print(lookup_pm.value)\n", + "assert lookup_pm.source == \"transfer_linear\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assessment = ar.assess_radii_policy(\n", + " [\"C\", \"Xe\", \"Pm\", \"Bk\"],\n", + " policy=vdw_policy,\n", + " detail=True,\n", + ")\n", + "\n", + "print(assessment)\n", + "print(assessment.n_base, assessment.n_transfer_linear)\n", + "for item in assessment.per_element:\n", + " print(item.symbol, item.lookup.source)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/03-custom-sets-and-discovery.ipynb b/notebooks/03-custom-sets-and-discovery.ipynb new file mode 100644 index 0000000..827c91f --- /dev/null +++ b/notebooks/03-custom-sets-and-discovery.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom sets and dataset discovery\n", + "\n", + "This notebook shows how to define a small user-provided set, plug it into a policy, and inspect the packaged dataset catalog.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_cov = ar.ElementScalarSet.from_mapping(\n", + " ref=ar.DatasetRef(\"covalent_radius\", \"demo_user_cov\"),\n", + " values={\"C\": 0.77, \"O\": 0.67},\n", + " name=\"Demo user covalent set\",\n", + " units=\"angstrom\",\n", + " description=\"Example custom set for notebook usage.\",\n", + " notes=(\"Notebook example\",),\n", + ")\n", + "\n", + "policy = ar.RadiiPolicy(\n", + " kind=\"covalent\",\n", + " base_set=custom_cov,\n", + " transfers=(\n", + " ar.SubstitutionTransfer(\n", + " source=ar.DatasetRef(\"covalent_radius\", \"cordero2008\")\n", + " ),\n", + " ),\n", + ")\n", + "\n", + "for symbol in (\"C\", \"O\", \"N\"):\n", + " print(symbol, ar.lookup_covalent_radius(symbol, policy=policy))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for info in ar.list_radii_set_infos(\"van_der_waals\", usage_role=\"target\"):\n", + " print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context)\n", + "\n", + "rahm = ar.get_dataset_info(ar.DatasetRef(\"atomic_radius\", \"rahm2016\"))\n", + "print(rahm.name)\n", + "print(rahm.semantic_class, rahm.phase_context, rahm.usage_role)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index ea2b569..065faab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ include = [ "/tests", "/docs", "/tools", + "/notebooks", "/mkdocs.yml", "/README.md", "/CHANGELOG.md", diff --git a/tests/meta/test_notebooks.py b/tests/meta/test_notebooks.py new file mode 100644 index 0000000..f49775f --- /dev/null +++ b/tests/meta/test_notebooks.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from pathlib import Path +import subprocess +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +SCRIPT = REPO_ROOT / "tools" / "check_notebooks.py" +NOTEBOOKS = REPO_ROOT / "notebooks" + + +def test_notebook_files_exist() -> None: + expected = { + "01-quickstart.ipynb", + "02-policies-and-assessment.ipynb", + "03-custom-sets-and-discovery.ipynb", + } + actual = {path.name for path in NOTEBOOKS.glob("*.ipynb")} + assert expected.issubset(actual) + + +def test_notebooks_validate_and_execute() -> None: + subprocess.run([sys.executable, str(SCRIPT)], cwd=REPO_ROOT, check=True) diff --git a/tools/check_dist.py b/tools/check_dist.py index 3eb4c66..b9d80b5 100644 --- a/tools/check_dist.py +++ b/tools/check_dist.py @@ -23,6 +23,10 @@ 'README.md', 'LICENSE', 'pyproject.toml', + 'notebooks/01-quickstart.ipynb', + 'notebooks/02-policies-and-assessment.ipynb', + 'notebooks/03-custom-sets-and-discovery.ipynb', + 'tools/check_notebooks.py', } diff --git a/tools/check_notebooks.py b/tools/check_notebooks.py new file mode 100644 index 0000000..830d742 --- /dev/null +++ b/tools/check_notebooks.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +"""Validate notebook JSON structure and execute notebook code cells.""" + +from __future__ import annotations + +from contextlib import redirect_stdout +import io +import json +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +NOTEBOOKS = REPO_ROOT / "notebooks" +REQUIRED_NOTEBOOKS = ( + "01-quickstart.ipynb", + 
"02-policies-and-assessment.ipynb", + "03-custom-sets-and-discovery.ipynb", +) + + +class NotebookCheckError(RuntimeError): + """Raised when a notebook is malformed or fails to execute.""" + + +def iter_notebooks() -> tuple[Path, ...]: + return tuple(NOTEBOOKS / name for name in REQUIRED_NOTEBOOKS) + + +def load_notebook(path: Path) -> dict[str, object]: + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise NotebookCheckError(f"{path.name}: expected top-level JSON object") + return data + + +def iter_code_cells(data: dict[str, object], *, path: Path) -> tuple[str, ...]: + cells = data.get("cells") + if not isinstance(cells, list): + raise NotebookCheckError(f"{path.name}: missing notebook cell list") + + code: list[str] = [] + for index, cell in enumerate(cells): + if not isinstance(cell, dict): + raise NotebookCheckError(f"{path.name}: cell {index} is not an object") + cell_type = cell.get("cell_type") + if cell_type != "code": + continue + source = cell.get("source", []) + if isinstance(source, str): + text = source + elif isinstance(source, list) and all(isinstance(line, str) for line in source): + text = "".join(source) + else: + raise NotebookCheckError( + f"{path.name}: cell {index} has invalid code source" + ) + code.append(text) + if not code: + raise NotebookCheckError(f"{path.name}: contains no code cells") + return tuple(code) + + +def execute_notebook(path: Path) -> None: + if not path.exists(): + raise NotebookCheckError(f"missing notebook: {path}") + data = load_notebook(path) + namespace = {"__name__": "__main__"} + for index, source in enumerate(iter_code_cells(data, path=path), start=1): + if not source.strip(): + continue + try: + code = compile(source, f"{path.name}::cell{index}", "exec") + with redirect_stdout(io.StringIO()): + exec(code, namespace, namespace) + except Exception as exc: # noqa: BLE001 + raise NotebookCheckError( + f"{path.name}: execution failed in code cell {index}: {exc}" + ) from exc + + +def main() -> int: + notebooks = iter_notebooks() + for notebook in notebooks: + execute_notebook(notebook) + print(f"Validated {len(notebooks)} notebook(s).") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From e1f19d6a0eacce7f686e229a7b221c4c89a39365 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sat, 14 Mar 2026 23:11:14 +0300 Subject: [PATCH 09/15] Adds public API --- CHANGELOG.md | 2 ++ README.md | 12 ++++++++++++ docs/guide/quickstart.md | 12 ++++++++++++ docs/index.md | 12 ++++++++++++ src/atomref/__init__.py | 4 ++++ src/atomref/radii.py | 5 +++++ tests/meta/test_public_api.py | 2 ++ tests/registry/test_registry.py | 13 +++++++++++++ 8 files changed, 62 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index faca26a..34daba8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,3 +7,5 @@ - Added registry design separating operational quantity from scientific classification. - Added radii policies with substitution and linear transfer models. + +- Added public packaged-set retrieval helpers: `get_builtin_set()` and `get_radii_set()`. 
diff --git a/README.md b/README.md index b94dd31..7d7253c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,18 @@ print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` +You can also retrieve the packaged set object directly: + +```python +import atomref as ar + +vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +print(vdw.get("O")) + +raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +print(raw.get("Pm")) +``` + ## Notebooks Hands-on notebooks live in the repository and mirror the main v0.1 workflows: diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index 5017079..d23e99f 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -26,6 +26,18 @@ print(ar.list_dataset_infos("covalent_radius")) print(ar.list_radii_set_infos("van_der_waals", usage_role="target")) ``` +You can also retrieve the packaged set object directly: + +```python +import atomref as ar + +vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +print(vdw.get("O")) + +raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +print(raw.get("Pm")) +``` + Need runnable versions of these examples? See the notebooks page and the matching notebook files in the repository: diff --git a/docs/index.md b/docs/index.md index b94dd31..7d7253c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -64,6 +64,18 @@ print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) print(ar.list_dataset_infos("atomic_radius", usage_role="support")) ``` +You can also retrieve the packaged set object directly: + +```python +import atomref as ar + +vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +print(vdw.get("O")) + +raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +print(raw.get("Pm")) +``` + ## Notebooks Hands-on notebooks live in the repository and mirror the main v0.1 workflows: diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index d08e1ed..815b42c 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -15,6 +15,7 @@ RadiiPolicyAssessment, assess_radii_policy, get_covalent_radius, + get_radii_set, get_radii_set_info, get_vdw_radius, list_radii_set_infos, @@ -29,6 +30,7 @@ ElementScalarSet, QuantityInfo, Reference, + get_builtin_set, get_dataset_info, get_quantity_info, list_dataset_ids, @@ -50,6 +52,7 @@ "ElementScalarSet", "QuantityInfo", "Reference", + "get_builtin_set", "get_dataset_info", "get_quantity_info", "list_dataset_ids", @@ -67,6 +70,7 @@ "DEFAULT_VDW_POLICY", "list_radii_sets", "list_radii_set_infos", + "get_radii_set", "get_radii_set_info", "lookup_covalent_radius", "get_covalent_radius", diff --git a/src/atomref/radii.py b/src/atomref/radii.py index a5ede9e..cda8a89 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -13,6 +13,7 @@ DatasetInfo, DatasetRef, ElementScalarSet, + get_builtin_set, get_dataset_info, list_dataset_ids, list_dataset_infos, @@ -138,6 +139,10 @@ def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) +def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet: + return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id)) + + def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: if policy.kind != expected: raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") diff --git a/tests/meta/test_public_api.py 
b/tests/meta/test_public_api.py index a6cb329..8f191bf 100644 --- a/tests/meta/test_public_api.py +++ b/tests/meta/test_public_api.py @@ -15,6 +15,8 @@ 'DEFAULT_VDW_POLICY', 'LinearTransfer', 'SubstitutionTransfer', + 'get_builtin_set', + 'get_radii_set', 'get_covalent_radius', 'lookup_covalent_radius', 'get_vdw_radius', diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py index 3cfaec5..48afbae 100644 --- a/tests/registry/test_registry.py +++ b/tests/registry/test_registry.py @@ -80,3 +80,16 @@ def test_list_radii_set_infos_can_filter_by_usage_role() -> None: infos = ar.list_radii_set_infos('van_der_waals', usage_role='target') assert 'alvarez2013' in {info.ref.set_id for info in infos} assert all(info.ref.quantity == 'van_der_waals_radius' for info in infos) + + +def test_public_builtin_set_helper_is_exported() -> None: + ds = ar.get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ds.info.ref.quantity == 'covalent_radius' + assert ds.get('C') == 0.76 + + +def test_public_radii_set_helper_returns_packaged_radii_set() -> None: + ds = ar.get_radii_set('van_der_waals', 'alvarez2013') + assert ds.info.ref.quantity == 'van_der_waals_radius' + assert ds.info.ref.set_id == 'alvarez2013' + assert ds.get('O') == 1.5 From b64972838970af2951d0ef5eb272ea93172f5e9e Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 08:44:16 +0300 Subject: [PATCH 10/15] Improves docs --- .github/workflows/ci.yml | 8 + .github/workflows/docs.yml | 4 + CHANGELOG.md | 10 +- README.md | 165 +++++++++------- docs/api/atomref.md | 3 + docs/api/elements.md | 7 + docs/api/index.md | 28 ++- docs/api/policy.md | 9 + docs/api/radii.md | 8 + docs/api/registry.md | 9 + docs/api/transfer.md | 9 + docs/datasets/atomic_radius.md | 22 ++- docs/datasets/covalent_radius.md | 37 +++- docs/datasets/index.md | 35 +++- docs/datasets/van_der_waals_radius.md | 60 +++++- docs/guide/custom_sets.md | 19 +- docs/guide/install.md | 19 +- docs/guide/non_goals.md | 26 ++- docs/guide/notebooks.md | 28 ++- docs/guide/policies.md | 112 +++++++++-- docs/guide/quickstart.md | 78 ++++---- docs/index.md | 161 ++++++++------- docs/notebooks/01-quickstart.md | 72 +++++++ docs/notebooks/02-policies-and-assessment.md | 73 +++++++ .../notebooks/03-custom-sets-and-discovery.md | 56 ++++++ mkdocs.yml | 11 +- notebooks/01-quickstart.ipynb | 164 ++++++++------- notebooks/02-policies-and-assessment.ipynb | 187 +++++++++--------- notebooks/03-custom-sets-and-discovery.ipynb | 3 +- src/atomref/__init__.py | 2 + src/atomref/elements.py | 35 ++-- src/atomref/errors.py | 9 +- src/atomref/policy.py | 57 +++++- src/atomref/radii.py | 71 ++++++- src/atomref/registry.py | 72 ++++++- src/atomref/transfer.py | 19 +- tests/meta/test_notebooks.py | 21 +- tests/meta/test_text_generation_tools.py | 34 ++++ tools/README.md | 27 +++ tools/check_dist.py | 74 ++++--- tools/check_notebooks.py | 10 + tools/export_notebooks.py | 146 ++++++++++++++ tools/gen_readme.py | 55 +++++- 43 files changed, 1576 insertions(+), 479 deletions(-) create mode 100644 docs/api/elements.md create mode 100644 docs/api/policy.md create mode 100644 docs/api/radii.md create mode 100644 docs/api/registry.md create mode 100644 docs/api/transfer.md create mode 100644 docs/notebooks/01-quickstart.md create mode 100644 docs/notebooks/02-policies-and-assessment.md create mode 100644 docs/notebooks/03-custom-sets-and-discovery.md create mode 100644 tests/meta/test_text_generation_tools.py create mode 100644 tools/README.md create mode 100644 
tools/export_notebooks.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12d3421..6f00ac0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,10 @@ jobs: run: python tools/check_registry.py - name: Validate notebooks run: python tools/check_notebooks.py + - name: Check notebook exports + run: python tools/export_notebooks.py --check + - name: Check README sync + run: python tools/gen_readme.py --check docs-check: runs-on: ubuntu-latest @@ -34,6 +38,10 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install .[docs] + - name: Export notebooks and README + run: | + python tools/export_notebooks.py --check + python tools/gen_readme.py --check - name: Build docs run: mkdocs build --strict diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 70396d7..418ce0d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,5 +17,9 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install .[docs] + - name: Check generated files + run: | + python tools/export_notebooks.py --check + python tools/gen_readme.py --check - name: Build docs run: mkdocs build --strict diff --git a/CHANGELOG.md b/CHANGELOG.md index 34daba8..dcfa24a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,5 +7,11 @@ - Added registry design separating operational quantity from scientific classification. - Added radii policies with substitution and linear transfer models. - -- Added public packaged-set retrieval helpers: `get_builtin_set()` and `get_radii_set()`. +- Added public packaged-set retrieval helpers: `get_builtin_set()` and + `get_radii_set()`. +- Added runnable notebooks together with generated Markdown notebook pages in + the docs. +- Expanded the docs with dataset guidance, module-level API pages, and a tools + overview. +- Added docstrings across the main importable modules, including important + internal helpers used across modules. diff --git a/README.md b/README.md index 7d7253c..9d5eb79 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,133 @@ # atomref -`atomref` is a small pure-Python package for curated atomic reference data and -policy-based lookup in geometry and structure-analysis code. +[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml) +[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml) +[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/) +[![Python Versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/) +[![License](https://img.shields.io/pypi/l/atomref.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE) -It is **not** a periodic-table encyclopedia. The package is meant to sit under -higher-level scientific software and provide: +`atomref` is a small pure-Python package for **curated atomic reference data** +and **provenance-aware lookup policies** used by geometry and +structure-analysis algorithms. + +It is not meant to be yet another periodic-table encyclopedia. The package is +for code that needs stable atomic reference values with explicit provenance, +clear fallback behavior, and honest handling of incomplete preferred datasets. 
+ +What you get in v0.1: - stable element metadata, -- named radii sets, -- explicit dataset provenance, +- curated named radii sets, +- dataset provenance and coverage metadata, - deterministic lookup policies, -- transfer from broader-support datasets into narrower target sets. - -For v0.1 the public scope is intentionally radii-first. +- substitution and linear transfer from support datasets into target datasets, +- user-defined custom element-indexed scalar sets. ## Why this exists -Many geometry algorithms need a complete reference table, but the scientifically -preferred dataset is often incomplete. `atomref` makes that situation explicit: -choose a target dataset, add one or more transfer steps, and keep provenance on -what was returned. +Scientific software often wants a complete lookup table, but the best dataset +for the job is rarely complete. `atomref` makes that situation explicit. +Instead of hiding ad hoc defaults inside algorithm code, you choose a target +set, describe how missing values may be restored, and keep provenance on what +was actually returned. -The default examples mirror the current `molcryst` behavior: +The default v0.1 behavior is intentionally simple and practical: -- covalent radii: use `cordero2008`, substitute from `csd_legacy_cov` -- van der Waals radii: use `alvarez2013`, linearly transfer from - `atomic_radius:rahm2016` +- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target + set, with missing values substituted from the **legacy CSD covalent radii** + (`csd_legacy_cov`). +- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target + set, with missing values restored from the **Rahm isodensity atomic radii** + (`rahm2016`) through a fitted linear transfer. ## Quick example -```python -import atomref as ar - -r_c = ar.get_covalent_radius("C") -r_vdw = ar.get_vdw_radius("O") - -lookup = ar.lookup_vdw_radius("Pm") -print(lookup.value, lookup.source, lookup.resolved_from) +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) ``` -## Public API split: `get_*` vs `lookup_*` +`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also +records where the value came from and whether a transfer model was involved. -- `get_*` returns only the selected numeric value, or `None`. -- `lookup_*` returns the provenance-carrying `LookupResult` object. +You can inspect the packaged quantity and dataset catalog directly: -This follows the current `molcryst` pattern. 
+```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius') +>>> ar.get_quantity_info("atomic_radius") +QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] +``` -## Current built-in quantities +You can also load a packaged set directly: -- `covalent_radius` -- `van_der_waals_radius` -- `atomic_radius` (support quantity; currently used for transfer from - `rahm2016`) +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +>>> raw.get("Pm") +2.83 +``` -You can inspect the packaged quantity layer directly: +## Notebook walkthroughs -```python -import atomref as ar +The repository ships example notebooks for the main v0.1 workflows. In the +documentation they are also available as rendered Markdown pages, so users can +read them without opening Jupyter first. -print(ar.list_quantities()) -print(ar.get_quantity_info("atomic_radius")) -print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) -print(ar.list_dataset_infos("atomic_radius", usage_role="support")) -``` +- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/) +- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/) +- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/) +- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/) -You can also retrieve the packaged set object directly: +## Relationship to Delone Commons -```python -import atomref as ar +`atomref` is designed as a standalone package, but within Delone Commons it is +primarily intended to support chemistry-aware packages such as: -vdw = ar.get_radii_set("van_der_waals", "alvarez2013") -print(vdw.get("O")) +- `molcryst`, for covalent-bond detection and contact analysis, +- future `chemvoro`, for chemistry-aware contact and hydrogen workflows. -raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) -print(raw.get("Pm")) -``` +By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical +packages and are not direct consumers of `atomref`. -## Notebooks +## Data curation and developer tools -Hands-on notebooks live in the repository and mirror the main v0.1 workflows: +The repository also ships small maintenance tools. The most important ones are: -- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) -- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) -- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) +- `python tools/check_registry.py` — validate curated registry metadata against + packaged CSV tables, +- `python tools/check_notebooks.py` — execute notebook code cells, +- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for + the docs, +- `python tools/gen_readme.py` — regenerate `README.md` from this page. 
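+
+Both generator scripts also take a `--check` flag, which the CI workflows use
+to verify that the exported notebook pages and the README are up to date
+without rewriting them:
+
+```bash
+python tools/export_notebooks.py --check
+python tools/gen_readme.py --check
+```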
-Open them locally in Jupyter or browse them on GitHub for worked examples of -lookup, transfer-backed policies, dataset discovery, and custom element-scalar -sets. +See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) +for a short description of each script. -## Relationship to the Delone Commons ecosystem +--- -`atomref` is intended to be reusable outside the surrounding ecosystem, but it -fits naturally beneath: +This README is generated from `docs/index.md`. -- `molcryst` -- `pyvoro2` -- `pbcgraph` +To regenerate it: -Those packages should consume atomic reference data from `atomref` rather than -re-curating such datasets independently. +```bash +python tools/gen_readme.py +``` -For data-curation changes, validate the packaged registry against the bundled -CSV tables with `python tools/check_registry.py`. +Edit the documentation sources instead of editing `README.md` directly. diff --git a/docs/api/atomref.md b/docs/api/atomref.md index dcbc5e0..3536e34 100644 --- a/docs/api/atomref.md +++ b/docs/api/atomref.md @@ -1,3 +1,6 @@ # atomref +The top-level package re-exports the main user-facing API so that most code can +simply do `import atomref as ar`. + ::: atomref diff --git a/docs/api/elements.md b/docs/api/elements.md new file mode 100644 index 0000000..c4275a0 --- /dev/null +++ b/docs/api/elements.md @@ -0,0 +1,7 @@ +# atomref.elements + +Element identity is intentionally minimal in v0.1: atomic number, symbol, and +name. The module also contains the canonicalization helpers used throughout the +package. + +::: atomref.elements diff --git a/docs/api/index.md b/docs/api/index.md index da15dbf..e69e719 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -1,4 +1,28 @@ # API -The top-level package exports the main radii helpers together with the registry, -policy, and transfer data structures. +The public API is small on purpose. + +Most users will spend most of their time in the top-level package namespace and +in the radii helpers. The lower-level modules are still documented because they +expose the actual data model behind the package. + +## Common tasks + +- get a single value: use `get_covalent_radius(...)` or `get_vdw_radius(...)` +- inspect provenance: use `lookup_covalent_radius(...)` or + `lookup_vdw_radius(...)` +- browse packaged datasets: use `list_quantities()`, `get_quantity_info(...)`, + `list_dataset_infos(...)`, or `list_radii_set_infos(...)` +- load a packaged set directly: use `get_builtin_set(...)` or `get_radii_set(...)` +- define a custom set: use `ElementScalarSet.from_mapping(...)` +- define transfer-backed lookup behavior: use `RadiiPolicy`, + `SubstitutionTransfer`, and `LinearTransfer` + +## Module reference + +- [Top-level package](atomref.md) +- [Elements](elements.md) +- [Registry and packaged datasets](registry.md) +- [Transfer models](transfer.md) +- [Generic policy core](policy.md) +- [Radii API](radii.md) diff --git a/docs/api/policy.md b/docs/api/policy.md new file mode 100644 index 0000000..99d51d9 --- /dev/null +++ b/docs/api/policy.md @@ -0,0 +1,9 @@ +# atomref.policy + +This module contains the generic resolver that sits below the radii-specific +API. + +It is useful when you want to understand exactly how overrides, base datasets, +transfers, fallbacks, and missing values are ordered and reported. 
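+
+A minimal sketch of that ordering, using the public radii API and the packaged
+v0.1 datasets (the override value below is arbitrary and only for
+illustration):
+
+```python
+import atomref as ar
+
+policy = ar.RadiiPolicy(
+    kind="covalent",
+    base_set="cordero2008",
+    transfers=(
+        ar.SubstitutionTransfer(
+            source=ar.DatasetRef("covalent_radius", "csd_legacy_cov")
+        ),
+    ),
+    overrides={"C": 0.70},  # arbitrary override, wins over every dataset
+)
+
+# Carbon resolves from the override, oxygen from the Cordero base set, and
+# berkelium (missing in cordero2008) from the substitution transfer.
+for symbol in ("C", "O", "Bk"):
+    lookup = ar.lookup_covalent_radius(symbol, policy=policy)
+    print(symbol, lookup.source, lookup.value, lookup.resolved_from)
+```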
+ +::: atomref.policy diff --git a/docs/api/radii.md b/docs/api/radii.md new file mode 100644 index 0000000..05617a4 --- /dev/null +++ b/docs/api/radii.md @@ -0,0 +1,8 @@ +# atomref.radii + +This is the main user-facing module in v0.1. + +It provides radii policies, packaged radii-set discovery, lookup helpers, and +policy-assessment reports. + +::: atomref.radii diff --git a/docs/api/registry.md b/docs/api/registry.md new file mode 100644 index 0000000..4f664e6 --- /dev/null +++ b/docs/api/registry.md @@ -0,0 +1,9 @@ +# atomref.registry + +This module contains the packaged data model. + +If you want to understand how `atomref` classifies datasets, how aliases are +resolved, or how built-in CSV tables are turned into typed in-memory objects, +this is the key module to read. + +::: atomref.registry diff --git a/docs/api/transfer.md b/docs/api/transfer.md new file mode 100644 index 0000000..eab5672 --- /dev/null +++ b/docs/api/transfer.md @@ -0,0 +1,9 @@ +# atomref.transfer + +Transfer models describe how missing target values may be restored from other +datasets. + +In v0.1 the core built-in models are direct substitution and one-predictor +linear transfer. + +::: atomref.transfer diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md index 00a43cd..1704980 100644 --- a/docs/datasets/atomic_radius.md +++ b/docs/datasets/atomic_radius.md @@ -1,10 +1,22 @@ # Atomic radius -This quantity currently exists to hold transferable support datasets that are -not best described as direct condensed-phase vdW radii. +The `atomic_radius` quantity exists in v0.1 to hold support datasets that are +scientifically useful but should not be presented as direct condensed-phase vdW +radii. -Built-in v0.1 support set: +## Rahm isodensity atomic radii (`rahm2016`) -- `rahm2016` +This is currently the only built-in atomic-radius dataset. -`rahm2016` is intentionally classified here as atomic support data rather than as a direct vdW target set. +- **What it is:** radii for isolated neutral atoms defined by the + ρ = 0.001 e/bohr³ electron-density isosurface. +- **Source idea:** a consistent theory-based atomic size measure derived from + computed electron densities. +- **Coverage:** broad, but not complete for the full periodic table. +- **Why it matters here:** it correlates well with structural vdW radii and is a + useful support baseline when a condensed-phase target set is incomplete. +- **How `atomref` uses it:** support-only dataset for linear transfer into + target vdW values such as `alvarez2013`. + +This is an important example of the package philosophy: a dataset can be very +useful algorithmically without being mislabeled as something it is not. diff --git a/docs/datasets/covalent_radius.md b/docs/datasets/covalent_radius.md index f298635..d2e2251 100644 --- a/docs/datasets/covalent_radius.md +++ b/docs/datasets/covalent_radius.md @@ -1,6 +1,37 @@ # Covalent radius -Built-in v0.1 sets: +The covalent-radius quantity in v0.1 is aimed at bond-detection and related +geometry workflows. It currently ships one preferred target dataset and one +legacy support dataset. -- `cordero2008` -- `csd_legacy_cov` +## Cordero covalent radii (`cordero2008`) + +This is the main covalent-radius target set in `atomref` v0.1. + +- **What it is:** a broad covalent-radius compilation based mainly on + crystallographic bond distances. +- **Why it matters:** it is a modern, widely used reference set for element-wise + covalent radii. 
+- **Coverage:** broad coverage across the periodic table, but not complete for + every element. +- **How `atomref` uses it:** direct target dataset for covalent-radius lookup. + +If you want one covalent set to start with, this is usually the right first +choice. + +## Legacy CSD covalent radii (`csd_legacy_cov`) + +This set reflects the older covalent radii historically used in CSD software for +bond perception. + +- **What it is:** a practical, legacy-oriented bond-assignment table. +- **Why it matters:** it has long been used in chemistry software and contains + placeholder conventions that are still relevant for compatibility work. +- **Coverage:** broad practical coverage, with explicit placeholder values for + elements not covered by the historical table. +- **How `atomref` uses it:** support dataset for substitution when the preferred + Cordero target set is missing a value. + +Because it contains legacy placeholders, it is not the preferred scientific +starting point. It is mainly useful as a support layer and for compatibility +with older workflows. diff --git a/docs/datasets/index.md b/docs/datasets/index.md index cbd132e..20d4c3e 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -1,17 +1,34 @@ # Datasets -The package distinguishes between: +`atomref` does not treat all datasets as interchangeable lookup tables. +Instead, the package records several layers of classification: - **quantity** — the operational property being requested, - **semantic class** — what the dataset scientifically represents, -- **origin / phase context** — how and where it was derived. +- **origin class** — how the values were obtained, +- **phase context** — what physical context they describe, +- **usage role** — whether the package treats the dataset as a direct target set + or as support data for transfer. -This is what keeps support-only datasets such as `rahm2016` usable without -misclassifying them as direct condensed-phase vdW radii. +This is what allows a dataset such as **Rahm isodensity atomic radii** +(`rahm2016`) to be useful in van der Waals workflows without pretending that it +is itself a condensed-phase structural vdW-radius set. -For programmatic inspection, use `atomref.list_quantities()`, `atomref.get_quantity_info(...)`, and `atomref.list_dataset_infos(...)`. +## Programmatic inspection -Dataset metadata also carries a package-level `usage_role`, which currently -distinguishes direct target sets from support-only sets used for substitution or -linear transfer. Use `atomref.list_dataset_ids(..., usage_role=...)` to inspect -that layer programmatically. +The most useful catalog helpers are: + +- `atomref.list_quantities()` +- `atomref.get_quantity_info(...)` +- `atomref.list_dataset_infos(...)` +- `atomref.list_radii_set_infos(...)` + +If you only need dataset ids, use `list_dataset_ids(...)` or `list_radii_sets(...)`. +If you want the packaged values themselves, use `get_builtin_set(...)` or +`get_radii_set(...)`. 
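+
+A short sketch of walking the catalog (output omitted; the exact ids reflect
+the packaged v0.1 registry):
+
+```python
+import atomref as ar
+
+# From quantities down to the dataset ids that act as direct target sets.
+for quantity in ar.list_quantities():
+    info = ar.get_quantity_info(quantity)
+    print(quantity, info.units, ar.list_dataset_ids(quantity, usage_role="target"))
+
+# Load one packaged set and read a single value from it.
+alvarez = ar.get_builtin_set(ar.DatasetRef("van_der_waals_radius", "alvarez2013"))
+print(alvarez.info.usage_role, alvarez.get("O"))
+```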
+ +## Built-in quantity families in v0.1 + +- [Covalent radius](covalent_radius.md) +- [van der Waals radius](van_der_waals_radius.md) +- [Atomic radius](atomic_radius.md) diff --git a/docs/datasets/van_der_waals_radius.md b/docs/datasets/van_der_waals_radius.md index d757bab..c678639 100644 --- a/docs/datasets/van_der_waals_radius.md +++ b/docs/datasets/van_der_waals_radius.md @@ -1,11 +1,57 @@ # van der Waals radius -Built-in v0.1 target sets: +The van der Waals quantity in v0.1 intentionally includes several target sets +with different scientific backgrounds. This lets users choose between a classic +historical compilation, structural contact-derived sets, and compatibility-only +legacy tables. -- `bondi1964` -- `rowland_taylor1996` -- `alvarez2013` -- `chernyshov2020` -- `csd_legacy_vdw` +## Bondi van der Waals radii (`bondi1964`) -Support-only sets may live under other quantities. +A classic historical reference set compiled from mixed experimental sources. + +- **What it is:** the traditional Bondi vdW table used throughout chemistry. +- **Coverage:** limited, especially for transition metals and heavier elements. +- **Why you might use it:** historical consistency or comparison with older + literature and software defaults. + +## Rowland & Taylor nonbonded-contact radii (`rowland_taylor1996`) + +A small but influential structural set derived from organic-crystal nonbonded +contacts. + +- **What it is:** a condensed-phase structural vdW set focused on common organic + elements. +- **Coverage:** intentionally narrow. +- **Why you might use it:** organic-crystal contact analysis and comparisons to + classic contact-distance literature. + +## Alvarez van der Waals radii (`alvarez2013`) + +This is the main van der Waals target set in `atomref` v0.1. + +- **What it is:** a broad structural vdW set derived from statistical analysis + of many interatomic distances in the Cambridge Structural Database. +- **Coverage:** broad, but still incomplete for some elements. +- **Why you might use it:** it is a strong default for general condensed-phase + geometry and contact work. +- **How `atomref` uses it:** direct target set for vdW lookup, with missing + values restored from support data when requested by policy. + +## Chernyshov line-of-sight vdW radii (`chernyshov2020`) + +A reduced element-wise view of a more atom-type-aware structural analysis. + +- **What it is:** vdW radii inferred from line-of-sight contact classification. +- **Coverage:** focused on elements common in molecular crystals. +- **Why you might use it:** you want a contact-derived set informed by the LoS + idea while still using a simple element-wise API. + +## Legacy CSD van der Waals radii (`csd_legacy_vdw`) + +A compatibility-oriented table used historically in CSD tools. + +- **What it is:** an older practical vdW table with placeholder conventions. +- **Coverage:** broad practical coverage, but not a modern scientific target + set. +- **How `atomref` uses it:** support-only data for legacy compatibility and + future migration work. diff --git a/docs/guide/custom_sets.md b/docs/guide/custom_sets.md index bfc55cb..ed4d664 100644 --- a/docs/guide/custom_sets.md +++ b/docs/guide/custom_sets.md @@ -1,8 +1,10 @@ # Custom sets -Custom element-indexed scalar datasets can be built with -`ElementScalarSet.from_mapping(...)` and then used directly in a `RadiiPolicy` -or a transfer model. +`atomref` is not limited to the packaged tables. 
You can build a small +user-defined element-indexed scalar dataset and use it as a base dataset or as a +support dataset inside a transfer-backed policy. + +The simplest entry point is `ElementScalarSet.from_mapping(...)`. ```python from atomref import DatasetRef, ElementScalarSet, RadiiPolicy @@ -16,3 +18,14 @@ custom = ElementScalarSet.from_mapping( policy = RadiiPolicy(kind="covalent", base_set=custom) ``` + +This is useful when you want to: + +- test an alternative reference table, +- pin a small project-specific dataset without creating a full package fork, +- combine a user dataset with built-in support data through substitution or + linear transfer. + +In v0.1 custom sets are element-domain scalar datasets, which keeps the data +model small and stable. Later versions may add more specialized domains, but +custom element-wise sets are already enough for many geometry workflows. diff --git a/docs/guide/install.md b/docs/guide/install.md index 2e2ae65..00a4f22 100644 --- a/docs/guide/install.md +++ b/docs/guide/install.md @@ -1,8 +1,23 @@ # Install +For normal use, install the runtime package: + ```bash pip install atomref ``` -The runtime package is pure Python and has no required runtime dependencies -outside the standard library. +`atomref` is pure Python and has no required runtime dependencies outside the +standard library. + +For local development, documentation work, and tests, install the editable +package together with the main extras: + +```bash +pip install -e ".[test,docs,dev]" +``` + +Those extras currently cover: + +- `test` — pytest and test-only compatibility helpers, +- `docs` — MkDocs and API documentation tooling, +- `dev` — flake8, build, and release metadata checks. diff --git a/docs/guide/non_goals.md b/docs/guide/non_goals.md index 57bca94..b38aa68 100644 --- a/docs/guide/non_goals.md +++ b/docs/guide/non_goals.md @@ -1,11 +1,23 @@ # Non-goals -`atomref` does not aim to handle: +`atomref` is intentionally narrow. -- file parsing, -- crystallographic symmetry, -- structure inference, -- Voronoi or power tessellation, -- chemistry-specific plane-position logic. +It is **not** trying to be: -Those concerns belong in higher-level packages. +- a general periodic-table encyclopedia, +- a home for arbitrary atomic or chemical properties, +- a structure parser, +- a crystallographic symmetry package, +- a structure-inference engine, +- a Voronoi / tessellation library, +- an environment-specific chemistry model, +- a machine-learning framework for extrapolating unseen chemistry. + +The package is about **curated reference data and explicit lookup policies**. +That includes provenance, transfer from broader support datasets, and stable API +surfaces that higher-level scientific code can rely on. + +Future versions may widen the range of supported *reference-data families* — for +example X–H distances or radial atomic reference functions — but the package +should still remain a small reference-data layer rather than a full chemistry +platform. diff --git a/docs/guide/notebooks.md b/docs/guide/notebooks.md index 9d39376..cdd1721 100644 --- a/docs/guide/notebooks.md +++ b/docs/guide/notebooks.md @@ -1,17 +1,25 @@ # Notebook gallery -`atomref` ships example Jupyter notebooks that mirror the main v0.1 user -workflows. They live in the repository under `notebooks/` and can be opened -locally with JupyterLab, VS Code, or any other notebook frontend. +`atomref` ships example Jupyter notebooks that cover the main v0.1 workflows. 
+Each notebook is available in two forms: -Available notebooks: +- the original `.ipynb` file in the repository, +- a rendered Markdown copy included in these docs. + +That way users can either run the notebooks locally or read them directly on the +documentation site. + +## Available notebooks + +- [Quickstart notebook](../notebooks/01-quickstart.md) — basic imports, + `get_*` vs `lookup_*`, quantity discovery, and packaged-set access. +- [Policies and assessment notebook](../notebooks/02-policies-and-assessment.md) + — overrides, transfer-backed policies, and policy summaries. +- [Custom sets and discovery notebook](../notebooks/03-custom-sets-and-discovery.md) + — user-defined sets, catalog inspection, and metadata exploration. + +The original notebook files are also in the repository: - [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) - – basic imports, element helpers, `get_*` vs `lookup_*`, quantity discovery. - [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) - – transfer policies, substitution vs linear transfer, policy assessment. - [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) - – custom user-defined sets, catalog inspection, metadata discovery. - -The notebooks are plain JSON files without heavy execution metadata so they stay -diff-friendly in version control. diff --git a/docs/guide/policies.md b/docs/guide/policies.md index a5a5b1b..62663ae 100644 --- a/docs/guide/policies.md +++ b/docs/guide/policies.md @@ -1,28 +1,102 @@ # Policies -A policy is the ordered rule set for selecting a value. +A policy tells `atomref` how to answer the question “what value should I use for +this element?” -Resolution order in v0.1: +That may sound simple, but in practice scientific datasets are often +incomplete. A policy makes the decision process explicit instead of hiding it in +algorithm code. -1. override -2. base dataset -3. transfers in order -4. fallback -5. missing +## Resolution order -Built-in transfer models: +In v0.1 every lookup follows the same ordered path: -- `SubstitutionTransfer` -- `LinearTransfer` +1. **Override** +2. **Base dataset** +3. **Transfer models**, in the order you listed them +4. **Fallback** +5. **Missing** -`LinearTransfer` is intentionally limited to one predictor in v0.1, but the API -already accepts a predictor tuple so later multi-predictor linear models do not -require a redesign. +Each step has a specific meaning. -## Target vs support sets +### Override -`atomref` keeps the lookup behavior separate from the scientific classification -of a dataset. In addition, each built-in dataset now carries a package-level -`usage_role` such as `target` or `support`. This is how `rahm2016` can remain -available for linear transfer into `alvarez2013`-style vdW values without being -misrepresented as a direct condensed-phase vdW target set. +An override is a value you provide directly for a specific element. It wins over +everything else and is useful when you want to pin one or two elements without +changing the whole dataset. + +### Base dataset + +The base dataset is the preferred source. For example, the default covalent +policy starts from the **Cordero covalent radii** (`cordero2008`), and the +default vdW policy starts from the **Alvarez van der Waals radii** +(`alvarez2013`). 
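+
+For an element that the base dataset covers, resolution stops at this step. A
+small example with the default vdW policy (the numbers in the comments assume
+the packaged `alvarez2013` table):
+
+```python
+import atomref as ar
+
+lookup = ar.lookup_vdw_radius("C")
+print(lookup.source)         # 'base'
+print(lookup.value)          # 1.77 from alvarez2013
+print(lookup.resolved_from)  # the alvarez2013 dataset reference
+```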
+ +### Transfer + +A transfer model is used only when the base dataset has no value for the +requested element. + +Built-in transfer models in v0.1 are: + +- `SubstitutionTransfer` — take a value directly from another dataset, +- `LinearTransfer` — infer a target-equivalent value from a support dataset + through a fitted linear model. + +`LinearTransfer` already accepts a tuple of predictors in the API, but the v0.1 +runtime intentionally supports exactly one predictor dataset. That keeps the +implementation simple now while leaving room for later multi-predictor linear +models. + +### Fallback + +A fallback is a constant last-resort value. It is useful when an algorithm must +receive *some* number even if both the base dataset and transfer sources are +missing a value. + +### Missing + +If nothing above can produce a value and no fallback was configured, the result +is simply missing. In that case `get_*` returns `None`, while `lookup_*` +returns a `LookupResult` with `source="missing"` and explanatory notes. + +## Target datasets and support datasets + +`atomref` separates **what a dataset is used for** from **what it scientifically +represents**. + +That is why the package stores: + +- the operational **quantity**, +- the scientific **semantic class**, +- the package-level **usage role**. + +This distinction matters for datasets such as **Rahm isodensity atomic radii** +(`rahm2016`). They are useful support data for restoring missing van der Waals +radii, but they are not the same thing as a condensed-phase structural vdW +radius set. In `atomref`, that difference is recorded in the metadata instead of +being hidden. + +## Example + +```python +import atomref as ar + +policy = ar.RadiiPolicy( + kind="van_der_waals", + base_set="alvarez2013", + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef("atomic_radius", "rahm2016"),), + ), + ), + overrides={"Xe": 2.10}, +) +``` + +With that policy: + +- xenon uses the explicit override, +- elements present in `alvarez2013` use the base vdW value, +- missing elements may be restored from `rahm2016`, +- anything still unresolved remains missing unless you also set a fallback. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index d23e99f..3649653 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -1,46 +1,56 @@ # Quickstart -```python -import atomref as ar - -print(ar.get_covalent_radius("C")) -print(ar.get_vdw_radius("O")) - -m = ar.lookup_vdw_radius("Pm") -print(m.value) -print(m.source) -print(m.resolved_from) +The two most important user-facing ideas in `atomref` are: + +- `get_*` returns only the selected number, +- `lookup_*` returns the number **and** provenance metadata. + +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) ``` -Use `get_*` when you only need the number, and `lookup_*` when you need -provenance. - -You can also inspect the packaged quantity layer directly: +Use `get_*` when you only need the value. Use `lookup_*` when you want to know +whether the result came from the preferred dataset, a support dataset, a policy +override, or a fallback. 
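+
+When nothing can produce a value, the two call styles differ only in how much
+they report: `get_*` returns `None`, while `lookup_*` returns a `LookupResult`
+whose `source` is `"missing"`. A small sketch (which elements actually end up
+missing depends on the policy and the packaged tables, so treat the symbol
+below as a placeholder):
+
+```python
+import atomref as ar
+
+# get_* gives only the number, or None when nothing could be resolved.
+value = ar.get_vdw_radius("Og")
+
+# lookup_* still returns a LookupResult with the source and explanatory notes.
+lookup = ar.lookup_vdw_radius("Og")
+print(value, lookup.source, lookup.notes)
+```
+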
-```python -import atomref as ar +You can inspect the packaged quantity layer directly: -print(ar.list_quantities()) -print(ar.get_quantity_info("atomic_radius")) -print(ar.list_dataset_infos("covalent_radius")) -print(ar.list_radii_set_infos("van_der_waals", usage_role="target")) +```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius') +>>> ar.get_quantity_info("atomic_radius") +QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +>>> [info.ref.set_id for info in ar.list_radii_set_infos("van_der_waals", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] ``` -You can also retrieve the packaged set object directly: - -```python -import atomref as ar - -vdw = ar.get_radii_set("van_der_waals", "alvarez2013") -print(vdw.get("O")) +And you can load a packaged set object directly: -raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) -print(raw.get("Pm")) +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +>>> raw.get("Pm") +2.83 ``` -Need runnable versions of these examples? See the notebooks page and the -matching notebook files in the repository: +For longer, runnable examples see: -- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) -- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) -- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) +- the [notebook overview](notebooks.md), +- the [quickstart notebook page](../notebooks/01-quickstart.md), +- the [policies notebook page](../notebooks/02-policies-and-assessment.md), +- the [custom sets notebook page](../notebooks/03-custom-sets-and-discovery.md). diff --git a/docs/index.md b/docs/index.md index 7d7253c..c59777e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,104 +1,121 @@ # atomref -`atomref` is a small pure-Python package for curated atomic reference data and -policy-based lookup in geometry and structure-analysis code. +[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml) +[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml) +[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/) +[![Python Versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/) +[![License](https://img.shields.io/pypi/l/atomref.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE) -It is **not** a periodic-table encyclopedia. The package is meant to sit under -higher-level scientific software and provide: +`atomref` is a small pure-Python package for **curated atomic reference data** +and **provenance-aware lookup policies** used by geometry and +structure-analysis algorithms. + +It is not meant to be yet another periodic-table encyclopedia. 
The package is +for code that needs stable atomic reference values with explicit provenance, +clear fallback behavior, and honest handling of incomplete preferred datasets. + +What you get in v0.1: - stable element metadata, -- named radii sets, -- explicit dataset provenance, +- curated named radii sets, +- dataset provenance and coverage metadata, - deterministic lookup policies, -- transfer from broader-support datasets into narrower target sets. - -For v0.1 the public scope is intentionally radii-first. +- substitution and linear transfer from support datasets into target datasets, +- user-defined custom element-indexed scalar sets. ## Why this exists -Many geometry algorithms need a complete reference table, but the scientifically -preferred dataset is often incomplete. `atomref` makes that situation explicit: -choose a target dataset, add one or more transfer steps, and keep provenance on -what was returned. +Scientific software often wants a complete lookup table, but the best dataset +for the job is rarely complete. `atomref` makes that situation explicit. +Instead of hiding ad hoc defaults inside algorithm code, you choose a target +set, describe how missing values may be restored, and keep provenance on what +was actually returned. -The default examples mirror the current `molcryst` behavior: +The default v0.1 behavior is intentionally simple and practical: -- covalent radii: use `cordero2008`, substitute from `csd_legacy_cov` -- van der Waals radii: use `alvarez2013`, linearly transfer from - `atomic_radius:rahm2016` +- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target + set, with missing values substituted from the **legacy CSD covalent radii** + (`csd_legacy_cov`). +- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target + set, with missing values restored from the **Rahm isodensity atomic radii** + (`rahm2016`) through a fitted linear transfer. ## Quick example -```python -import atomref as ar - -r_c = ar.get_covalent_radius("C") -r_vdw = ar.get_vdw_radius("O") - -lookup = ar.lookup_vdw_radius("Pm") -print(lookup.value, lookup.source, lookup.resolved_from) +```pycon +>>> import atomref as ar +>>> ar.get_covalent_radius("C") +0.76 +>>> ar.get_vdw_radius("O") +1.5 +>>> lookup = ar.lookup_vdw_radius("Pm") +>>> lookup.value +2.8972265395148358 +>>> lookup.source +'transfer_linear' +>>> lookup.resolved_from +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) ``` -## Public API split: `get_*` vs `lookup_*` - -- `get_*` returns only the selected numeric value, or `None`. -- `lookup_*` returns the provenance-carrying `LookupResult` object. +`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also +records where the value came from and whether a transfer model was involved. -This follows the current `molcryst` pattern. 
+You can inspect the packaged quantity and dataset catalog directly: -## Current built-in quantities - -- `covalent_radius` -- `van_der_waals_radius` -- `atomic_radius` (support quantity; currently used for transfer from - `rahm2016`) - -You can inspect the packaged quantity layer directly: - -```python -import atomref as ar - -print(ar.list_quantities()) -print(ar.get_quantity_info("atomic_radius")) -print(ar.list_dataset_infos("van_der_waals_radius", usage_role="target")) -print(ar.list_dataset_infos("atomic_radius", usage_role="support")) +```pycon +>>> import atomref as ar +>>> ar.list_quantities() +('covalent_radius', 'van_der_waals_radius', 'atomic_radius') +>>> ar.get_quantity_info("atomic_radius") +QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] +['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] ``` -You can also retrieve the packaged set object directly: +You can also load a packaged set directly: -```python -import atomref as ar +```pycon +>>> import atomref as ar +>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") +>>> vdw.get("O") +1.5 +>>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) +>>> raw.get("Pm") +2.83 +``` -vdw = ar.get_radii_set("van_der_waals", "alvarez2013") -print(vdw.get("O")) +## Notebook walkthroughs -raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) -print(raw.get("Pm")) -``` +The repository ships example notebooks for the main v0.1 workflows. In the +documentation they are also available as rendered Markdown pages, so users can +read them without opening Jupyter first. -## Notebooks +- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/) +- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/) +- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/) +- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/) -Hands-on notebooks live in the repository and mirror the main v0.1 workflows: +## Relationship to Delone Commons -- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) -- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) -- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) +`atomref` is designed as a standalone package, but within Delone Commons it is +primarily intended to support chemistry-aware packages such as: -Open them locally in Jupyter or browse them on GitHub for worked examples of -lookup, transfer-backed policies, dataset discovery, and custom element-scalar -sets. +- `molcryst`, for covalent-bond detection and contact analysis, +- future `chemvoro`, for chemistry-aware contact and hydrogen workflows. -## Relationship to the Delone Commons ecosystem +By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical +packages and are not direct consumers of `atomref`. 
-`atomref` is intended to be reusable outside the surrounding ecosystem, but it -fits naturally beneath: +## Data curation and developer tools -- `molcryst` -- `pyvoro2` -- `pbcgraph` +The repository also ships small maintenance tools. The most important ones are: -Those packages should consume atomic reference data from `atomref` rather than -re-curating such datasets independently. +- `python tools/check_registry.py` — validate curated registry metadata against + packaged CSV tables, +- `python tools/check_notebooks.py` — execute notebook code cells, +- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for + the docs, +- `python tools/gen_readme.py` — regenerate `README.md` from this page. -For data-curation changes, validate the packaged registry against the bundled -CSV tables with `python tools/check_registry.py`. +See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) +for a short description of each script. diff --git a/docs/notebooks/01-quickstart.md b/docs/notebooks/01-quickstart.md new file mode 100644 index 0000000..3a9f22b --- /dev/null +++ b/docs/notebooks/01-quickstart.md @@ -0,0 +1,72 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) +# atomref quickstart + +This notebook covers the main public API in v0.1: element helpers, direct +`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset +discovery. +```python +import atomref as ar + +print(ar.get_element('Cl')) +print(ar.list_quantities()) +``` +**Output** +```text +Element(z=17, symbol='Cl', name='Chlorine') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius') +``` +```python +r_c = ar.get_covalent_radius('C') +r_vdw = ar.get_vdw_radius('O') +print(r_c) +print(r_vdw) +assert r_c == 0.76 +assert r_vdw == 1.50 +``` +**Output** +```text +0.76 +1.5 +``` +```python +lookup = ar.lookup_vdw_radius('Pm') +print(f"{lookup.value:.12f}") +print(lookup.source) +print(lookup.resolved_from) +assert lookup.source == 'transfer_linear' +``` +**Output** +```text +2.897226539515 +transfer_linear +(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) +``` +```python +quantity = ar.get_quantity_info('atomic_radius') +print(quantity.quantity, quantity.domain, quantity.units) + +for info in ar.list_dataset_infos('van_der_waals_radius', usage_role='target'): + print(info.ref.set_id, info.name, info.usage_role) +``` +**Output** +```text +atomic_radius element angstrom +bondi1964 Bondi van der Waals radii target +rowland_taylor1996 Rowland & Taylor nonbonded contact radii target +alvarez2013 Alvarez van der Waals radii target +chernyshov2020 Chernyshov LoS van der Waals radii target +``` +```python +vdw = ar.get_radii_set('van_der_waals', 'alvarez2013') +print(vdw.get('O')) + +support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016')) +print(support.get('Pm')) +``` +**Output** +```text +1.5 +2.83 +``` diff --git a/docs/notebooks/02-policies-and-assessment.md b/docs/notebooks/02-policies-and-assessment.md new file mode 100644 index 0000000..4f6baf6 --- /dev/null +++ b/docs/notebooks/02-policies-and-assessment.md @@ -0,0 +1,73 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb) +# Policies and assessment + +This notebook shows how `atomref` resolves missing values through ordered +policy steps and how to inspect policy-level behavior. 
+```python +import atomref as ar +``` +```python +covalent_policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + transfers=( + ar.SubstitutionTransfer( + source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov') + ), + ), +) +lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy) +print(lookup.source) +print(f"{lookup.value:.12f}") +print(lookup.resolved_from) +``` +**Output** +```text +transfer_substitution +1.540000000000 +(DatasetRef(quantity='covalent_radius', set_id='csd_legacy_cov'),) +``` +```python +vdw_policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),) + ), + ), +) +lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy) +print(f"{lookup.value:.12f}") +print(lookup.source) +print( + f"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}" +) +``` +**Output** +```text +2.897226539515 +transfer_linear +slope=1.135336645553 intercept=-0.315776167399 n=90 +``` +```python +assessment = ar.assess_radii_policy( + ['C', 'Xe', 'Pm', 'Bk'], + policy=vdw_policy, + detail=True, +) +print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing) +for row in assessment.per_element: + value = 'None' if row.lookup.value is None else f"{row.lookup.value:.12f}" + print(row.symbol, row.lookup.source, value) +``` +**Output** +```text +3 1 0 +C base 1.770000000000 +Xe base 2.060000000000 +Pm transfer_linear 2.897226539515 +Bk base 3.400000000000 +``` diff --git a/docs/notebooks/03-custom-sets-and-discovery.md b/docs/notebooks/03-custom-sets-and-discovery.md new file mode 100644 index 0000000..51dc5e2 --- /dev/null +++ b/docs/notebooks/03-custom-sets-and-discovery.md @@ -0,0 +1,56 @@ + + +[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb) +# Custom sets and dataset discovery + +This notebook shows how to define a small user-provided set, plug it into a +policy, and inspect the packaged dataset catalog. 
+```python +import atomref as ar +``` +```python +custom_cov = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef("covalent_radius", "demo_user_cov"), + values={"C": 0.77, "O": 0.67}, + name="Demo user covalent set", + units="angstrom", + description="Example custom set for notebook usage.", + notes=("Notebook example",), +) + +policy = ar.RadiiPolicy( + kind="covalent", + base_set=custom_cov, + transfers=( + ar.SubstitutionTransfer( + source=ar.DatasetRef("covalent_radius", "cordero2008") + ), + ), +) + +for symbol in ("C", "O", "N"): + print(symbol, ar.lookup_covalent_radius(symbol, policy=policy)) +``` +**Output** +```text +C LookupResult(value=0.77, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=()) +O LookupResult(value=0.67, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=()) +N LookupResult(value=0.71, source='transfer_substitution', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='cordero2008'),), is_placeholder=False, fit=None, notes=('missing in base set; substituted from transfer source',)) +``` +```python +for info in ar.list_radii_set_infos("van_der_waals", usage_role="target"): + print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context) + +rahm = ar.get_dataset_info(ar.DatasetRef("atomic_radius", "rahm2016")) +print(rahm.name) +print(rahm.semantic_class, rahm.phase_context, rahm.usage_role) +``` +**Output** +```text +bondi1964 vdw_compiled compiled_experimental mixed_or_legacy +rowland_taylor1996 vdw_structural structural condensed_phase +alvarez2013 vdw_structural structural condensed_phase +chernyshov2020 vdw_structural_typed_reduced structural condensed_phase +Rahm isodensity atomic radii (ρ=0.001 e/bohr³) +atomic_isodensity isolated_atom support +``` diff --git a/mkdocs.yml b/mkdocs.yml index 2a97e1a..c3e560c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,7 +20,6 @@ nav: - Guide: - Install: guide/install.md - Quickstart: guide/quickstart.md - - Notebooks: guide/notebooks.md - Policies: guide/policies.md - Custom sets: guide/custom_sets.md - Non-goals: guide/non_goals.md @@ -29,6 +28,11 @@ nav: - Covalent radius: datasets/covalent_radius.md - van der Waals radius: datasets/van_der_waals_radius.md - Atomic radius: datasets/atomic_radius.md + - Notebooks: + - Overview: guide/notebooks.md + - Quickstart notebook: notebooks/01-quickstart.md + - Policies and assessment notebook: notebooks/02-policies-and-assessment.md + - Custom sets and discovery notebook: notebooks/03-custom-sets-and-discovery.md - Development: - Architecture: dev/architecture.md - Data curation: dev/data_curation.md @@ -36,3 +40,8 @@ nav: - API: - Overview: api/index.md - atomref: api/atomref.md + - atomref.elements: api/elements.md + - atomref.registry: api/registry.md + - atomref.transfer: api/transfer.md + - atomref.policy: api/policy.md + - atomref.radii: api/radii.md diff --git a/notebooks/01-quickstart.ipynb b/notebooks/01-quickstart.ipynb index 2c09cc0..6d6d16f 100644 --- a/notebooks/01-quickstart.ipynb +++ b/notebooks/01-quickstart.ipynb @@ -1,77 +1,93 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# atomref quickstart\n", - "\n", - "This 
notebook covers the basic public API: element helpers, direct `get_*` calls, provenance-carrying `lookup_*` calls, and quantity / dataset discovery.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import atomref as ar\n", - "\n", - "print(ar.get_element(\"Cl\"))\n", - "print(ar.list_quantities())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "r_c = ar.get_covalent_radius(\"C\")\n", - "r_vdw = ar.get_vdw_radius(\"O\")\n", - "print(r_c)\n", - "print(r_vdw)\n", - "assert r_c == 0.76\n", - "assert r_vdw == 1.50\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lookup = ar.lookup_vdw_radius(\"Pm\")\n", - "print(lookup)\n", - "print(lookup.value)\n", - "print(lookup.source)\n", - "print(lookup.resolved_from)\n", - "assert lookup.source == \"transfer_linear\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(ar.get_quantity_info(\"atomic_radius\"))\n", - "for info in ar.list_dataset_infos(\"van_der_waals_radius\", usage_role=\"target\"):\n", - " print(info.ref.set_id, info.semantic_class, info.origin_class)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# atomref quickstart\n", + "\n", + "This notebook covers the main public API in v0.1: element helpers, direct\n", + "`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset\n", + "discovery.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n", + "\n", + "print(ar.get_element('Cl'))\n", + "print(ar.list_quantities())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r_c = ar.get_covalent_radius('C')\n", + "r_vdw = ar.get_vdw_radius('O')\n", + "print(r_c)\n", + "print(r_vdw)\n", + "assert r_c == 0.76\n", + "assert r_vdw == 1.50\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lookup = ar.lookup_vdw_radius('Pm')\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.source)\n", + "print(lookup.resolved_from)\n", + "assert lookup.source == 'transfer_linear'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quantity = ar.get_quantity_info('atomic_radius')\n", + "print(quantity.quantity, quantity.domain, quantity.units)\n", + "\n", + "for info in ar.list_dataset_infos('van_der_waals_radius', usage_role='target'):\n", + " print(info.ref.set_id, info.name, info.usage_role)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vdw = ar.get_radii_set('van_der_waals', 'alvarez2013')\n", + "print(vdw.get('O'))\n", + "\n", + "support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016'))\n", + "print(support.get('Pm'))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 } diff --git a/notebooks/02-policies-and-assessment.ipynb b/notebooks/02-policies-and-assessment.ipynb index 7db7e45..dfe2678 100644 --- a/notebooks/02-policies-and-assessment.ipynb +++ b/notebooks/02-policies-and-assessment.ipynb @@ -1,96 +1,97 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Policies and assessment\n", - "\n", - "This notebook shows how `atomref` resolves missing values through ordered transfer steps and how to inspect policy-level behavior.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import atomref as ar\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "covalent_policy = ar.RadiiPolicy(\n", - " kind=\"covalent\",\n", - " base_set=\"cordero2008\",\n", - " transfers=(\n", - " ar.SubstitutionTransfer(\n", - " source=ar.DatasetRef(\"covalent_radius\", \"csd_legacy_cov\")\n", - " ),\n", - " ),\n", - ")\n", - "\n", - "lookup_bk = ar.lookup_covalent_radius(\"Bk\", policy=covalent_policy)\n", - "print(lookup_bk)\n", - "assert lookup_bk.source == \"transfer_substitution\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vdw_policy = ar.RadiiPolicy(\n", - " kind=\"van_der_waals\",\n", - " base_set=\"alvarez2013\",\n", - " transfers=(\n", - " ar.LinearTransfer(\n", - " predictors=(ar.DatasetRef(\"atomic_radius\", \"rahm2016\"),)\n", - " ),\n", - " ),\n", - ")\n", - "\n", - "lookup_pm = ar.lookup_vdw_radius(\"Pm\", policy=vdw_policy)\n", - "print(lookup_pm.fit)\n", - "print(lookup_pm.value)\n", - "assert lookup_pm.source == \"transfer_linear\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assessment = ar.assess_radii_policy(\n", - " [\"C\", \"Xe\", \"Pm\", \"Bk\"],\n", - " policy=vdw_policy,\n", - " detail=True,\n", - ")\n", - "\n", - "print(assessment)\n", - "print(assessment.n_base, assessment.n_transfer_linear)\n", - "for item in assessment.per_element:\n", - " print(item.symbol, item.lookup.source)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Policies and assessment\n", + "\n", + "This notebook shows how `atomref` resolves missing values through ordered\n", + "policy steps and how to inspect policy-level behavior.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atomref as ar\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "covalent_policy = ar.RadiiPolicy(\n", + " kind='covalent',\n", + " base_set='cordero2008',\n", + " transfers=(\n", + " ar.SubstitutionTransfer(\n", + " source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov')\n", + " ),\n", + " ),\n", + ")\n", + "lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy)\n", + "print(lookup.source)\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.resolved_from)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vdw_policy = ar.RadiiPolicy(\n", + " kind='van_der_waals',\n", + " base_set='alvarez2013',\n", 
+ " transfers=(\n", + " ar.LinearTransfer(\n", + " predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),)\n", + " ),\n", + " ),\n", + ")\n", + "lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy)\n", + "print(f\"{lookup.value:.12f}\")\n", + "print(lookup.source)\n", + "print(\n", + " f\"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assessment = ar.assess_radii_policy(\n", + " ['C', 'Xe', 'Pm', 'Bk'],\n", + " policy=vdw_policy,\n", + " detail=True,\n", + ")\n", + "print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing)\n", + "for row in assessment.per_element:\n", + " value = 'None' if row.lookup.value is None else f\"{row.lookup.value:.12f}\"\n", + " print(row.symbol, row.lookup.source, value)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/notebooks/03-custom-sets-and-discovery.ipynb b/notebooks/03-custom-sets-and-discovery.ipynb index 827c91f..58f9d92 100644 --- a/notebooks/03-custom-sets-and-discovery.ipynb +++ b/notebooks/03-custom-sets-and-discovery.ipynb @@ -6,7 +6,8 @@ "source": [ "# Custom sets and dataset discovery\n", "\n", - "This notebook shows how to define a small user-provided set, plug it into a policy, and inspect the packaged dataset catalog.\n" + "This notebook shows how to define a small user-provided set, plug it into a\n", + "policy, and inspect the packaged dataset catalog.\n" ] }, { diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index 815b42c..6104c9d 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -1,3 +1,5 @@ +"""Public package exports for :mod:`atomref`.""" + from .__about__ import __version__ from .elements import ( Element, diff --git a/src/atomref/elements.py b/src/atomref/elements.py index 42f0598..5245b80 100644 --- a/src/atomref/elements.py +++ b/src/atomref/elements.py @@ -1,4 +1,4 @@ -"""Periodic table access for stable element identity.""" +"""Periodic-table access for stable element identity.""" from __future__ import annotations @@ -9,13 +9,13 @@ from importlib import resources -_MISSING_TOKENS = {'', '?', '.'} -_LEADING_ALPHA_RE = re.compile(r'([A-Za-z]{1,3})') +_MISSING_TOKENS = {"", "?", "."} +_LEADING_ALPHA_RE = re.compile(r"([A-Za-z]{1,3})") @dataclass(frozen=True, slots=True) class Element: - """Chemical element identity.""" + """Chemical element identity keyed by atomic number and symbol.""" z: int symbol: str @@ -23,6 +23,8 @@ class Element: def _normalize_element_token(token: str | None) -> str | None: + """Strip quotes and obvious missing-value markers from a token.""" + if token is None: return None @@ -43,7 +45,12 @@ def _normalize_element_token(token: str | None) -> str | None: def canonicalize_element_symbol(token: str | None) -> str | None: - """Canonicalize a free-form element token.""" + """Canonicalize a free-form token to a conventional element symbol. + + The function accepts strings such as ``"cl"``, ``" Cl "`` or + ``"Cl12"`` and returns ``"Cl"`` when a leading element-like token can be + identified. Missing-value markers and non-element strings return ``None``. 
+ """ raw = _normalize_element_token(token) if raw is None: @@ -59,25 +66,29 @@ def canonicalize_element_symbol(token: str | None) -> str | None: @lru_cache(maxsize=1) def _load_elements_by_symbol() -> dict[str, Element]: - table_path = resources.files('atomref.data').joinpath('periodic_table.csv') - with table_path.open('r', encoding='utf-8', newline='') as handle: + """Load the packaged periodic table into a symbol-keyed mapping.""" + + table_path = resources.files("atomref.data").joinpath("periodic_table.csv") + with table_path.open("r", encoding="utf-8", newline="") as handle: reader = csv.DictReader(handle) out: dict[str, Element] = {} for row in reader: - z = int(row['z']) - symbol = row['symbol'] - name = row['name'] + z = int(row["z"]) + symbol = row["symbol"] + name = row["name"] out[symbol] = Element(z=z, symbol=symbol, name=name) return out @lru_cache(maxsize=1) def _elements_in_z_order() -> tuple[Element, ...]: + """Return packaged elements sorted by increasing atomic number.""" + return tuple(sorted(_load_elements_by_symbol().values(), key=lambda e: e.z)) def is_valid_element_symbol(symbol: str | None) -> bool: - """Return ``True`` if ``symbol`` is a known element symbol.""" + """Return ``True`` if ``symbol`` is a known packaged element symbol.""" if symbol is None: return False @@ -85,7 +96,7 @@ def is_valid_element_symbol(symbol: str | None) -> bool: def get_element(symbol: str | None) -> Element | None: - """Look up element identity by symbol or free-form token.""" + """Look up packaged element identity from a symbol-like token.""" sym = canonicalize_element_symbol(symbol) if sym is None: diff --git a/src/atomref/errors.py b/src/atomref/errors.py index 1922cf5..d31660a 100644 --- a/src/atomref/errors.py +++ b/src/atomref/errors.py @@ -1,9 +1,12 @@ +"""Package-local exceptions used across :mod:`atomref`.""" + + class AtomrefError(Exception): - """Base package error.""" + """Base class for package-defined errors.""" class DatasetError(AtomrefError): - """Packaged dataset or registry error.""" + """Raised when packaged data or registry metadata are invalid.""" class MissingValueError(AtomrefError): @@ -11,4 +14,4 @@ class MissingValueError(AtomrefError): class PolicyError(AtomrefError): - """Raised for invalid policy configuration.""" + """Raised for invalid policy configuration or transfer resolution.""" diff --git a/src/atomref/policy.py b/src/atomref/policy.py index 5b242e2..36741fe 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -34,6 +34,12 @@ @dataclass(frozen=True, slots=True) class LookupResult: + """Result of resolving one value through a policy. + + ``value`` carries the final scalar value when one could be produced, while + ``source`` and the remaining metadata explain how that value was obtained. + """ + value: float | None source: LookupSource target: DatasetRef @@ -43,6 +49,8 @@ class LookupResult: notes: tuple[str, ...] = () def __float__(self) -> float: + """Coerce the resolved value to ``float`` or raise if it is missing.""" + if self.value is None: raise TypeError("reference value is missing") return float(self.value) @@ -50,6 +58,8 @@ def __float__(self) -> float: @dataclass(frozen=True, slots=True) class ValuePolicy(Generic[K]): + """Ordered rule set for resolving element-domain scalar values.""" + base: DatasetLike transfers: tuple[TransferModel, ...] 
= () overrides: Mapping[K, float] = field(default_factory=dict) @@ -57,6 +67,11 @@ class ValuePolicy(Generic[K]): def _normalize_element_symbol(symbol: str | None) -> str | None: + """Normalize user input to a packaged element symbol. + + The current resolver treats ``D`` and ``T`` as hydrogen aliases. + """ + cand = canonicalize_element_symbol(symbol) if cand in {"D", "T"}: cand = "H" @@ -68,6 +83,8 @@ def _normalize_element_symbol(symbol: str | None) -> str | None: def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: + """Return the target dataset reference implied by a policy base.""" + return resolve_dataset_like(policy.base).ref @@ -78,6 +95,8 @@ def _fit_linear_transfer( min_points: int, exclude_placeholders: bool, ) -> LinearFit: + """Fit a one-predictor linear transfer model between two datasets.""" + xs: list[float] = [] ys: list[float] = [] @@ -133,6 +152,8 @@ def _fit_linear_transfer_cached( min_points: int, exclude_placeholders: bool, ) -> LinearFit: + """Cache fits between two packaged datasets for repeated reuse.""" + return _fit_linear_transfer( get_builtin_set(base_ref), get_builtin_set(predictor_ref), @@ -142,6 +163,8 @@ def _fit_linear_transfer_cached( def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None: + """Return the fit object for a transfer model when it needs one.""" + if not isinstance(transfer, LinearTransfer): return None if len(transfer.predictors) != 1: @@ -150,7 +173,10 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit predictor = transfer.predictors[0] if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef): return _fit_linear_transfer_cached( - base, predictor, transfer.min_points, transfer.exclude_placeholders + base, + predictor, + transfer.min_points, + transfer.exclude_placeholders, ) return _fit_linear_transfer( resolve_dataset_like(base), @@ -161,8 +187,13 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit def _apply_substitution_transfer( - symbol: str, *, target: DatasetRef, transfer: SubstitutionTransfer + symbol: str, + *, + target: DatasetRef, + transfer: SubstitutionTransfer, ) -> tuple[LookupResult | None, str | None]: + """Try to resolve ``symbol`` by direct substitution from another dataset.""" + source_set = resolve_dataset_like(transfer.source) value = source_set.get(symbol) if value is None: @@ -182,8 +213,14 @@ def _apply_substitution_transfer( def _apply_linear_transfer( - symbol: str, *, base: DatasetLike, target: DatasetRef, transfer: LinearTransfer + symbol: str, + *, + base: DatasetLike, + target: DatasetRef, + transfer: LinearTransfer, ) -> tuple[LookupResult | None, str | None]: + """Try to resolve ``symbol`` through linear transfer from predictor data.""" + if len(transfer.predictors) != 1: raise PolicyError("v0.1 LinearTransfer supports exactly one predictor dataset") @@ -194,7 +231,8 @@ def _apply_linear_transfer( predictor_f = float(predictor_value) if transfer.exclude_placeholders and _is_placeholder_value( - predictor_set.info, predictor_f + predictor_set.info, + predictor_f, ): return None, f"predictor value in {predictor_set.ref.set_id} is a placeholder" @@ -217,6 +255,8 @@ def _apply_linear_transfer( def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: + """Resolve a value through override, base, transfer, and fallback steps.""" + target = _resolve_target_ref(policy) base_set = resolve_dataset_like(policy.base) if base_set.info.domain != "element": @@ -251,11 
+291,16 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes for transfer in policy.transfers: if isinstance(transfer, SubstitutionTransfer): result, note = _apply_substitution_transfer( - sym, target=target, transfer=transfer + sym, + target=target, + transfer=transfer, ) elif isinstance(transfer, LinearTransfer): result, note = _apply_linear_transfer( - sym, base=policy.base, target=target, transfer=transfer + sym, + base=policy.base, + target=target, + transfer=transfer, ) else: # pragma: no cover - closed union today raise PolicyError(f"unsupported transfer model: {type(transfer)!r}") diff --git a/src/atomref/radii.py b/src/atomref/radii.py index cda8a89..01f13f4 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -32,6 +32,12 @@ @dataclass(frozen=True, slots=True) class RadiiPolicy: + """Policy wrapper specialized for radii lookup. + + ``kind`` determines the target quantity, while the remaining fields mirror + the generic :class:`atomref.policy.ValuePolicy` interface. + """ + kind: RadiiKind base_set: str | RadiiSet transfers: tuple[TransferModel, ...] = () @@ -39,6 +45,8 @@ class RadiiPolicy: fallback: float | None = None def as_value_policy(self) -> ValuePolicy[str]: + """Convert the radii policy into the generic scalar-value policy.""" + quantity = _quantity_for_kind(self.kind) if isinstance(self.base_set, ElementScalarSet): if self.base_set.ref.quantity != quantity: @@ -68,12 +76,16 @@ def as_value_policy(self) -> ValuePolicy[str]: @dataclass(frozen=True, slots=True) class RadiiElementAssessment: + """Per-element row in a radii policy assessment report.""" + symbol: str lookup: LookupResult @dataclass(frozen=True, slots=True) class RadiiPolicyAssessment: + """Summary of how a radii policy behaved over a set of elements.""" + kind: RadiiKind policy: RadiiPolicy elements: tuple[str, ...] 
@@ -96,6 +108,8 @@ class RadiiPolicyAssessment: def _quantity_for_kind(kind: RadiiKind) -> str: + """Translate public radii kind names into registry quantity ids.""" + try: return _KIND_TO_QUANTITY[kind] except KeyError as exc: @@ -103,6 +117,8 @@ def _quantity_for_kind(kind: RadiiKind) -> str: def _normalize_radii_symbol(symbol: str | None) -> str | None: + """Normalize symbols accepted by the radii convenience layer.""" + cand = canonicalize_element_symbol(symbol) if cand in {"D", "T"}: cand = "H" @@ -110,6 +126,8 @@ def _normalize_radii_symbol(symbol: str | None) -> str | None: def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: + """Normalize, validate, deduplicate, and sort assessment element labels.""" + symbols: set[str] = set() for token in elements: sym = _normalize_radii_symbol(token) @@ -124,65 +142,102 @@ def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: def list_radii_sets( - kind: RadiiKind, *, usage_role: str | None = None + kind: RadiiKind, + *, + usage_role: str | None = None, ) -> tuple[str, ...]: + """List packaged radii-set ids for one radii kind.""" + return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) def list_radii_set_infos( - kind: RadiiKind, *, usage_role: str | None = None + kind: RadiiKind, + *, + usage_role: str | None = None, ) -> tuple[DatasetInfo, ...]: + """Return packaged metadata objects for radii sets of one kind.""" + return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role) def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: + """Return metadata for one packaged radii set.""" + return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet: + """Load one packaged radii set as an :class:`ElementScalarSet`.""" + return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id)) def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: + """Raise when a policy is used with the wrong public radii helper.""" + if policy.kind != expected: raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: + """Shared implementation for radii lookup helpers.""" + return _resolve_value(symbol, policy=policy.as_value_policy()) def lookup_covalent_radius( - symbol: str | None, *, policy: RadiiPolicy | None = None + symbol: str | None, + *, + policy: RadiiPolicy | None = None, ) -> LookupResult: + """Resolve a covalent radius together with provenance information.""" + active = DEFAULT_COVALENT_POLICY if policy is None else policy _validate_policy_kind(active, expected="covalent") return _lookup_radius(symbol, policy=active) def get_covalent_radius( - symbol: str | None, *, policy: RadiiPolicy | None = None + symbol: str | None, + *, + policy: RadiiPolicy | None = None, ) -> float | None: + """Return only the selected covalent-radius value, without provenance.""" + return lookup_covalent_radius(symbol, policy=policy).value def lookup_vdw_radius( - symbol: str | None, *, policy: RadiiPolicy | None = None + symbol: str | None, + *, + policy: RadiiPolicy | None = None, ) -> LookupResult: + """Resolve a van der Waals radius together with provenance information.""" + active = DEFAULT_VDW_POLICY if policy is None else policy _validate_policy_kind(active, expected="van_der_waals") return _lookup_radius(symbol, policy=active) def get_vdw_radius( - symbol: str | None, *, policy: 
RadiiPolicy | None = None + symbol: str | None, + *, + policy: RadiiPolicy | None = None, ) -> float | None: + """Return only the selected van der Waals radius, without provenance.""" + return lookup_vdw_radius(symbol, policy=policy).value def assess_radii_policy( - elements: Iterable[str], *, policy: RadiiPolicy, detail: bool = False + elements: Iterable[str], + *, + policy: RadiiPolicy, + detail: bool = False, ) -> RadiiPolicyAssessment: + """Assess how a radii policy resolves values over a set of elements.""" + elems = _normalize_assessment_elements(elements) value_policy = policy.as_value_policy() @@ -260,9 +315,11 @@ def assess_radii_policy( SubstitutionTransfer(source=DatasetRef("covalent_radius", "csd_legacy_cov")), ), ) +"""Default covalent-radii policy used by the convenience helpers.""" DEFAULT_VDW_POLICY = RadiiPolicy( kind="van_der_waals", base_set="alvarez2013", transfers=(LinearTransfer(predictors=(DatasetRef("atomic_radius", "rahm2016"),)),), ) +"""Default vdW-radii policy used by the convenience helpers.""" diff --git a/src/atomref/registry.py b/src/atomref/registry.py index c465786..594e98e 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -5,9 +5,9 @@ from collections.abc import Iterable, Mapping from dataclasses import dataclass import csv -import json from functools import lru_cache from importlib import resources +import json from .elements import canonicalize_element_symbol, get_element, iter_elements from .errors import DatasetError @@ -18,12 +18,20 @@ @dataclass(frozen=True, slots=True) class DatasetRef: + """Stable reference to a packaged dataset. + + The ``quantity`` identifies the operational property family, while + ``set_id`` names a specific curated dataset within that family. + """ + quantity: QuantityId set_id: str @dataclass(frozen=True, slots=True) class Reference: + """Bibliographic record attached to packaged dataset metadata.""" + authors: str | None = None year: int | None = None title: str | None = None @@ -36,6 +44,8 @@ class Reference: @dataclass(frozen=True, slots=True) class CoverageInfo: + """Coverage summary for an element-indexed scalar dataset.""" + n_values: int z_min: int | None = None z_max: int | None = None @@ -46,6 +56,8 @@ class CoverageInfo: @dataclass(frozen=True, slots=True) class QuantityInfo: + """Metadata shared by all datasets that belong to one quantity.""" + quantity: QuantityId domain: DomainId units: str | None = None @@ -54,6 +66,13 @@ class QuantityInfo: @dataclass(frozen=True, slots=True) class DatasetInfo: + """Curated metadata for one packaged dataset. + + This object keeps operational classification such as ``ref.quantity`` and + ``usage_role`` separate from scientific classification such as + ``semantic_class`` and ``phase_context``. + """ + ref: DatasetRef domain: DomainId units: str | None @@ -75,6 +94,8 @@ class DatasetInfo: @dataclass(frozen=True, slots=True) class ElementScalarSet: + """Element-indexed scalar dataset stored densely by atomic number.""" + ref: DatasetRef info: DatasetInfo values_by_z: tuple[float | None, ...] 
@@ -96,6 +117,8 @@ def from_mapping( notes: Iterable[str] = (), placeholder_value: float | None = None, ) -> "ElementScalarSet": + """Build a custom element-domain dataset from a symbol-keyed mapping.""" + n_z = max(e.z for e in iter_elements()) values_by_z: list[float | None] = [None] * (n_z + 1) @@ -143,6 +166,8 @@ def from_mapping( return cls(ref=ref, info=info, values_by_z=tuple(values_by_z)) def get(self, symbol: str | None) -> float | None: + """Return the scalar value for ``symbol`` or ``None`` if absent.""" + sym = _normalize_element_domain_symbol(symbol) elem = get_element(sym) if elem is None: @@ -154,6 +179,8 @@ def get(self, symbol: str | None) -> float | None: def _normalize_element_domain_symbol(symbol: str | None) -> str | None: + """Normalize element-domain symbols and fold D/T onto hydrogen.""" + cand = canonicalize_element_symbol(symbol) if cand in {"D", "T"}: return "H" @@ -162,6 +189,8 @@ def _normalize_element_domain_symbol(symbol: str | None) -> str | None: @lru_cache(maxsize=1) def _load_registry_json() -> dict[str, object]: + """Load the packaged registry JSON as a validated top-level mapping.""" + path = resources.files("atomref.data").joinpath("registry.json") with path.open("r", encoding="utf-8") as handle: data = json.load(handle) @@ -171,6 +200,8 @@ def _load_registry_json() -> dict[str, object]: def _get_quantities_mapping() -> Mapping[str, object]: + """Return the raw ``quantities`` mapping from ``registry.json``.""" + quantities = _load_registry_json().get("quantities") if not isinstance(quantities, dict): raise DatasetError("invalid registry.json: missing quantities mapping") @@ -178,6 +209,8 @@ def _get_quantities_mapping() -> Mapping[str, object]: def _get_datasets_mapping() -> Mapping[str, object]: + """Return the raw ``datasets`` mapping from ``registry.json``.""" + datasets = _load_registry_json().get("datasets") if not isinstance(datasets, dict): raise DatasetError("invalid registry.json: missing datasets mapping") @@ -185,6 +218,8 @@ def _get_datasets_mapping() -> Mapping[str, object]: def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: + """Return the dataset table for one quantity or raise on unknown input.""" + datasets = _get_datasets_mapping().get(quantity) if not isinstance(datasets, dict): raise DatasetError(f"unknown quantity: {quantity!r}") @@ -192,10 +227,14 @@ def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: def list_quantities() -> tuple[str, ...]: + """List packaged quantity identifiers in registry order.""" + return tuple(_get_quantities_mapping().keys()) def get_quantity_info(quantity: QuantityId) -> QuantityInfo: + """Return quantity-level metadata for a packaged quantity.""" + raw = _get_quantities_mapping().get(quantity) if not isinstance(raw, dict): raise DatasetError(f"unknown quantity: {quantity!r}") @@ -207,15 +246,22 @@ def get_quantity_info(quantity: QuantityId) -> QuantityInfo: raw.get("description") if isinstance(raw.get("description"), str) else None ) return QuantityInfo( - quantity=quantity, domain=domain, units=units, description=description + quantity=quantity, + domain=domain, + units=units, + description=description, ) def _canonicalize_alias_token(value: str) -> str: + """Normalize a dataset id or alias for case-insensitive comparison.""" + return " ".join(value.strip().lower().split()) def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: + """Resolve a dataset id or alias to its canonical packaged set id.""" + by_quantity = _datasets_for_quantity(quantity) if 
set_id in by_quantity: return set_id @@ -239,6 +285,12 @@ def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: def list_dataset_ids( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[str, ...]: + """List packaged dataset identifiers for a quantity. + + When ``usage_role`` is provided, only datasets with a matching normalized + role such as ``"target"`` or ``"support"`` are returned. + """ + dataset_ids = tuple(_datasets_for_quantity(quantity).keys()) if usage_role is None: return dataset_ids @@ -256,6 +308,8 @@ def list_dataset_ids( def list_dataset_infos( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[DatasetInfo, ...]: + """Return packaged dataset metadata objects for a quantity.""" + return tuple( get_dataset_info(DatasetRef(quantity, set_id)) for set_id in list_dataset_ids(quantity, usage_role=usage_role) @@ -263,6 +317,8 @@ def list_dataset_infos( def _coerce_reference(obj: object) -> Reference: + """Coerce a raw registry reference entry into :class:`Reference`.""" + if not isinstance(obj, dict): raise DatasetError("invalid reference entry in registry.json") return Reference( @@ -280,6 +336,8 @@ def _coerce_reference(obj: object) -> Reference: def _coerce_coverage(obj: object) -> CoverageInfo | None: + """Coerce raw coverage metadata into :class:`CoverageInfo`.""" + if not isinstance(obj, dict): return None covered = obj.get("covered_z") @@ -297,6 +355,8 @@ def _coerce_coverage(obj: object) -> CoverageInfo | None: def get_dataset_info(ref: DatasetRef) -> DatasetInfo: + """Return curated metadata for a packaged dataset reference.""" + actual_set_id = _resolve_set_id(ref.quantity, ref.set_id) actual_ref = DatasetRef(quantity=ref.quantity, set_id=actual_set_id) @@ -403,6 +463,8 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: @lru_cache(maxsize=None) def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: + """Load all value columns from one packaged dense-by-Z CSV table.""" + path = resources.files("atomref.data").joinpath(filename) with path.open("r", encoding="utf-8", newline="") as handle: reader = csv.DictReader(handle) @@ -427,6 +489,8 @@ def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: @lru_cache(maxsize=None) def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: + """Load a packaged dataset as an :class:`ElementScalarSet`.""" + info = get_dataset_info(ref) if info.domain != "element": raise DatasetError( @@ -448,12 +512,16 @@ def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: + """Resolve either a packaged reference or a custom set to a loaded set.""" + if isinstance(dataset, ElementScalarSet): return dataset return get_builtin_set(dataset) def _is_placeholder_value(info: DatasetInfo, value: float) -> bool: + """Return ``True`` when ``value`` equals the dataset's placeholder value.""" + if info.placeholder_value is None: return False return abs(value - info.placeholder_value) < 1e-12 diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py index d7f5d5e..14362db 100644 --- a/src/atomref/transfer.py +++ b/src/atomref/transfer.py @@ -1,4 +1,4 @@ -"""Transfer model configuration types.""" +"""Transfer-model configuration types for policy-based lookup.""" from __future__ import annotations @@ -9,6 +9,13 @@ @dataclass(frozen=True, slots=True) class LinearFit: + """Summary statistics for a fitted linear transfer model. 
+ + Parameters are stored in a compact, serializable form so they can be + attached to :class:`atomref.policy.LookupResult` objects and reused in + reporting code. + """ + coefficients: tuple[float, ...] intercept: float n_points: int @@ -18,14 +25,24 @@ class LinearFit: @dataclass(frozen=True, slots=True) class SubstitutionTransfer: + """Use another dataset directly when the base dataset is missing a value.""" + source: DatasetLike @dataclass(frozen=True, slots=True) class LinearTransfer: + """Infer missing target values from one or more predictor datasets. + + In v0.1 the public API stores predictors as a tuple for forward + compatibility, but the runtime implementation intentionally accepts exactly + one predictor dataset. + """ + predictors: tuple[DatasetLike, ...] min_points: int = 2 exclude_placeholders: bool = True TransferModel = SubstitutionTransfer | LinearTransfer +"""Closed union of transfer models supported by the core resolver.""" diff --git a/tests/meta/test_notebooks.py b/tests/meta/test_notebooks.py index f49775f..d420476 100644 --- a/tests/meta/test_notebooks.py +++ b/tests/meta/test_notebooks.py @@ -6,8 +6,10 @@ REPO_ROOT = Path(__file__).resolve().parents[2] -SCRIPT = REPO_ROOT / "tools" / "check_notebooks.py" +CHECK_SCRIPT = REPO_ROOT / "tools" / "check_notebooks.py" +EXPORT_SCRIPT = REPO_ROOT / "tools" / "export_notebooks.py" NOTEBOOKS = REPO_ROOT / "notebooks" +EXPORTED_NOTEBOOKS = REPO_ROOT / "docs" / "notebooks" def test_notebook_files_exist() -> None: @@ -21,4 +23,19 @@ def test_notebook_files_exist() -> None: def test_notebooks_validate_and_execute() -> None: - subprocess.run([sys.executable, str(SCRIPT)], cwd=REPO_ROOT, check=True) + subprocess.run([sys.executable, str(CHECK_SCRIPT)], cwd=REPO_ROOT, check=True) + + +def test_exported_notebook_pages_are_in_sync() -> None: + expected = { + "01-quickstart.md", + "02-policies-and-assessment.md", + "03-custom-sets-and-discovery.md", + } + actual = {path.name for path in EXPORTED_NOTEBOOKS.glob("*.md")} + assert expected.issubset(actual) + subprocess.run( + [sys.executable, str(EXPORT_SCRIPT), "--check"], + cwd=REPO_ROOT, + check=True, + ) diff --git a/tests/meta/test_text_generation_tools.py b/tests/meta/test_text_generation_tools.py new file mode 100644 index 0000000..b6203a7 --- /dev/null +++ b/tests/meta/test_text_generation_tools.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[2] +MODULE_PATH = REPO_ROOT / "tools" / "export_notebooks.py" + +spec = importlib.util.spec_from_file_location("export_notebooks_tool", MODULE_PATH) +assert spec is not None and spec.loader is not None +export_notebooks = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = export_notebooks +spec.loader.exec_module(export_notebooks) + + +def test_export_notebooks_check_ignores_crlf(tmp_path: Path) -> None: + """Notebook export checks should ignore Windows vs Unix newline differences.""" + + output_dir = tmp_path / "docs" + output_dir.mkdir() + + for notebook_name, output_name in export_notebooks.NOTEBOOK_OUTPUTS.items(): + rendered = export_notebooks._export_markdown( + export_notebooks.NOTEBOOKS / notebook_name + ) + (output_dir / output_name).write_text( + rendered.replace("\n", "\r\n"), + encoding="utf-8", + newline="", + ) + + assert export_notebooks.export_notebooks(output_dir, check=True) == 0 diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..8074430 --- /dev/null +++ 
b/tools/README.md @@ -0,0 +1,27 @@ +# tools + +This directory contains small maintenance scripts used during development and +release preparation. + +## Scripts + +- `check_dist.py` — verify that wheel and source-distribution artifacts contain + the key files expected by the project. +- `check_notebooks.py` — validate notebook JSON and execute notebook code cells. +- `check_registry.py` — validate curated registry metadata against packaged CSV + tables. +- `export_notebooks.py` — render the bundled notebooks into Markdown pages under + `docs/notebooks/`. +- `gen_readme.py` — regenerate `README.md` from `docs/index.md`. + +## Typical commands + +```bash +python tools/check_registry.py +python tools/check_notebooks.py +python tools/export_notebooks.py +python tools/gen_readme.py +``` + +The main project README is generated from the documentation home page. To change +`README.md`, edit `docs/index.md` and then run `python tools/gen_readme.py`. diff --git a/tools/check_dist.py b/tools/check_dist.py index b9d80b5..92cef29 100644 --- a/tools/check_dist.py +++ b/tools/check_dist.py @@ -1,3 +1,5 @@ +"""Verify that built distributions contain the project's key files.""" + from __future__ import annotations import argparse @@ -7,26 +9,31 @@ REQUIRED_WHEEL_MEMBERS = { - 'atomref/data/periodic_table.csv', - 'atomref/data/covalent.csv', - 'atomref/data/van_der_waals.csv', - 'atomref/data/registry.json', - 'atomref/py.typed', + "atomref/data/periodic_table.csv", + "atomref/data/covalent.csv", + "atomref/data/van_der_waals.csv", + "atomref/data/registry.json", + "atomref/py.typed", } REQUIRED_SDIST_SUFFIXES = { - 'src/atomref/data/periodic_table.csv', - 'src/atomref/data/covalent.csv', - 'src/atomref/data/van_der_waals.csv', - 'src/atomref/data/registry.json', - 'src/atomref/py.typed', - 'README.md', - 'LICENSE', - 'pyproject.toml', - 'notebooks/01-quickstart.ipynb', - 'notebooks/02-policies-and-assessment.ipynb', - 'notebooks/03-custom-sets-and-discovery.ipynb', - 'tools/check_notebooks.py', + "src/atomref/data/periodic_table.csv", + "src/atomref/data/covalent.csv", + "src/atomref/data/van_der_waals.csv", + "src/atomref/data/registry.json", + "src/atomref/py.typed", + "README.md", + "LICENSE", + "pyproject.toml", + "notebooks/01-quickstart.ipynb", + "notebooks/02-policies-and-assessment.ipynb", + "notebooks/03-custom-sets-and-discovery.ipynb", + "docs/notebooks/01-quickstart.md", + "docs/notebooks/02-policies-and-assessment.md", + "docs/notebooks/03-custom-sets-and-discovery.md", + "tools/check_notebooks.py", + "tools/export_notebooks.py", + "tools/README.md", } @@ -35,15 +42,22 @@ class DistCheckError(RuntimeError): def _assert_members_present( - actual: set[str], required: set[str], *, label: str + actual: set[str], + required: set[str], + *, + label: str, ) -> None: + """Raise when ``required`` contains members not present in ``actual``.""" + missing = sorted(required - actual) if missing: - joined = ', '.join(missing) - raise DistCheckError(f'{label} is missing required members: {joined}') + joined = ", ".join(missing) + raise DistCheckError(f"{label} is missing required members: {joined}") def _members_matching_suffixes(actual: set[str], suffixes: set[str]) -> set[str]: + """Return suffixes that match at least one member name from ``actual``.""" + matched: set[str] = set() for suffix in suffixes: if any(name.endswith(suffix) for name in actual): @@ -52,6 +66,8 @@ def _members_matching_suffixes(actual: set[str], suffixes: set[str]) -> set[str] def check_wheel(path: Path) -> None: + """Validate the 
contents of one built wheel.""" + with zipfile.ZipFile(path) as zf: names = set(zf.namelist()) matched = { @@ -63,24 +79,28 @@ def check_wheel(path: Path) -> None: def check_sdist(path: Path) -> None: - with tarfile.open(path, 'r:gz') as tf: + """Validate the contents of one built source distribution.""" + + with tarfile.open(path, "r:gz") as tf: names = {member.name for member in tf.getmembers()} matched = _members_matching_suffixes(names, REQUIRED_SDIST_SUFFIXES) _assert_members_present(matched, REQUIRED_SDIST_SUFFIXES, label=path.name) def main() -> None: + """Validate wheel and sdist artifacts found in a distribution directory.""" + parser = argparse.ArgumentParser() - parser.add_argument('dist_dir', type=Path, nargs='?', default=Path('dist')) + parser.add_argument("dist_dir", type=Path, nargs="?", default=Path("dist")) args = parser.parse_args() dist_dir = args.dist_dir - wheels = sorted(dist_dir.glob('*.whl')) - sdists = sorted(dist_dir.glob('*.tar.gz')) + wheels = sorted(dist_dir.glob("*.whl")) + sdists = sorted(dist_dir.glob("*.tar.gz")) if not wheels: - raise DistCheckError(f'no wheel files found in {dist_dir}') + raise DistCheckError(f"no wheel files found in {dist_dir}") if not sdists: - raise DistCheckError(f'no source distributions found in {dist_dir}') + raise DistCheckError(f"no source distributions found in {dist_dir}") for wheel in wheels: check_wheel(wheel) @@ -88,5 +108,5 @@ def main() -> None: check_sdist(sdist) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/check_notebooks.py b/tools/check_notebooks.py index 830d742..51d9dfa 100644 --- a/tools/check_notebooks.py +++ b/tools/check_notebooks.py @@ -28,10 +28,14 @@ class NotebookCheckError(RuntimeError): def iter_notebooks() -> tuple[Path, ...]: + """Return the notebooks that are expected to ship with the project.""" + return tuple(NOTEBOOKS / name for name in REQUIRED_NOTEBOOKS) def load_notebook(path: Path) -> dict[str, object]: + """Load one notebook JSON document.""" + data = json.loads(path.read_text(encoding="utf-8")) if not isinstance(data, dict): raise NotebookCheckError(f"{path.name}: expected top-level JSON object") @@ -39,6 +43,8 @@ def load_notebook(path: Path) -> dict[str, object]: def iter_code_cells(data: dict[str, object], *, path: Path) -> tuple[str, ...]: + """Return notebook code-cell sources in order.""" + cells = data.get("cells") if not isinstance(cells, list): raise NotebookCheckError(f"{path.name}: missing notebook cell list") @@ -66,6 +72,8 @@ def iter_code_cells(data: dict[str, object], *, path: Path) -> tuple[str, ...]: def execute_notebook(path: Path) -> None: + """Execute all code cells from one notebook in a shared namespace.""" + if not path.exists(): raise NotebookCheckError(f"missing notebook: {path}") data = load_notebook(path) @@ -84,6 +92,8 @@ def execute_notebook(path: Path) -> None: def main() -> int: + """Validate and execute every required notebook.""" + notebooks = iter_notebooks() for notebook in notebooks: execute_notebook(notebook) diff --git a/tools/export_notebooks.py b/tools/export_notebooks.py new file mode 100644 index 0000000..aa6761d --- /dev/null +++ b/tools/export_notebooks.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""Export bundled notebooks to Markdown pages for the docs.""" + +from __future__ import annotations + +from contextlib import redirect_stdout +import argparse +import io +import json +from pathlib import Path +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC = REPO_ROOT / "src" +if str(SRC) not in 
sys.path: + sys.path.insert(0, str(SRC)) + +NOTEBOOKS = REPO_ROOT / "notebooks" +DEFAULT_OUTPUT_DIR = REPO_ROOT / "docs" / "notebooks" +NOTEBOOK_OUTPUTS = { + "01-quickstart.ipynb": "01-quickstart.md", + "02-policies-and-assessment.ipynb": "02-policies-and-assessment.md", + "03-custom-sets-and-discovery.ipynb": "03-custom-sets-and-discovery.md", +} +HEADER = ( + "\n" + "\n\n" +) + + +class NotebookExportError(RuntimeError): + """Raised when notebook export fails.""" + + +def _load_notebook(path: Path) -> dict[str, object]: + """Load one notebook JSON document.""" + + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise NotebookExportError(f"{path.name}: expected top-level JSON object") + return data + + +def _cell_source(cell: dict[str, object], *, path: Path, index: int) -> str: + """Return normalized source text for one notebook cell.""" + + source = cell.get("source", []) + if isinstance(source, str): + return source + if isinstance(source, list) and all(isinstance(line, str) for line in source): + return "".join(source) + raise NotebookExportError(f"{path.name}: invalid source in cell {index}") + + +def _export_markdown(path: Path) -> str: + """Render one notebook as Markdown, executing code cells for output.""" + + data = _load_notebook(path) + cells = data.get("cells") + if not isinstance(cells, list): + raise NotebookExportError(f"{path.name}: missing notebook cell list") + + namespace = {"__name__": "__main__"} + parts: list[str] = [HEADER] + parts.append( + f"[Open the original notebook on GitHub]" + f"(https://github.com/DeloneCommons/atomref/blob/main/notebooks/{path.name})\n" + ) + + for index, cell in enumerate(cells, start=1): + if not isinstance(cell, dict): + raise NotebookExportError(f"{path.name}: cell {index} is not an object") + source = _cell_source(cell, path=path, index=index) + cell_type = cell.get("cell_type") + if cell_type == "markdown": + text = source.strip() + if text: + parts.append(f"{text}\n") + continue + if cell_type != "code": + continue + code_text = source.rstrip() + parts.append("```python\n") + parts.append(f"{code_text}\n") + parts.append("```\n") + if not code_text.strip(): + continue + + stdout = io.StringIO() + try: + code = compile(code_text, f"{path.name}::cell{index}", "exec") + with redirect_stdout(stdout): + exec(code, namespace, namespace) + except Exception as exc: # noqa: BLE001 + raise NotebookExportError( + f"{path.name}: execution failed in code cell {index}: {exc}" + ) from exc + + output = stdout.getvalue().rstrip() + if output: + parts.append("**Output**\n\n") + parts.append("```text\n") + parts.append(f"{output}\n") + parts.append("```\n") + + return "\n".join(part.rstrip() for part in parts if part).rstrip() + "\n" + + +def export_notebooks(output_dir: Path, *, check: bool = False) -> int: + """Export bundled notebooks or verify that exported pages are in sync.""" + + output_dir.mkdir(parents=True, exist_ok=True) + for notebook_name, output_name in NOTEBOOK_OUTPUTS.items(): + notebook_path = NOTEBOOKS / notebook_name + rendered = _export_markdown(notebook_path) + output_path = output_dir / output_name + if check: + current = output_path.read_text(encoding="utf-8").replace("\r\n", "\n") + if current != rendered: + print( + f"{output_path} is out of sync with {notebook_path.name}", + file=sys.stderr, + ) + return 1 + else: + output_path.write_text(rendered, encoding="utf-8", newline="\n") + return 0 + + +def main() -> int: + """Export notebook Markdown pages or check that they are current.""" 
+ + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument( + "--check", + action="store_true", + help="exit with status 1 when exported pages are out of sync", + ) + args = parser.parse_args() + return export_notebooks(args.output_dir, check=args.check) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/gen_readme.py b/tools/gen_readme.py index cad0335..71b954d 100644 --- a/tools/gen_readme.py +++ b/tools/gen_readme.py @@ -1,20 +1,61 @@ +"""Generate ``README.md`` from the documentation home page.""" + from __future__ import annotations import argparse from pathlib import Path +import sys REPO_ROOT = Path(__file__).resolve().parents[1] -SOURCE = REPO_ROOT / 'docs' / 'index.md' -README = REPO_ROOT / 'README.md' +SOURCE = REPO_ROOT / "docs" / "index.md" +README = REPO_ROOT / "README.md" +FOOTER = """ + +--- + +This README is generated from `docs/index.md`. + +To regenerate it: + +```bash +python tools/gen_readme.py +``` + +Edit the documentation sources instead of editing `README.md` directly. +""" -def main() -> None: +def render_readme() -> str: + """Return the generated README text.""" + + body = SOURCE.read_text(encoding="utf-8").rstrip() + return f"{body}{FOOTER}" + + +def main() -> int: + """Generate or verify the repository README file.""" + parser = argparse.ArgumentParser() - parser.add_argument('--output', type=Path, default=README) + parser.add_argument("--output", type=Path, default=README) + parser.add_argument( + "--check", + action="store_true", + help="exit with status 1 when the target file is out of sync", + ) args = parser.parse_args() - args.output.write_text(SOURCE.read_text(encoding='utf-8'), encoding='utf-8') + + rendered = render_readme() + if args.check: + current = args.output.read_text(encoding="utf-8") + if current != rendered: + print(f"{args.output} is out of sync with docs/index.md", file=sys.stderr) + return 1 + return 0 + + args.output.write_text(rendered, encoding="utf-8") + return 0 -if __name__ == '__main__': - main() +if __name__ == "__main__": + raise SystemExit(main()) From 169ae719e5d9b78610f9406fa72066ce37434a61 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 11:31:59 +0300 Subject: [PATCH 11/15] Adds pre-release check --- CHANGELOG.md | 45 ++++++++----- README.md | 1 + docs/guide/install.md | 7 ++ docs/index.md | 1 + mkdocs.yml | 1 + pyproject.toml | 4 +- src/atomref/__about__.py | 2 +- tests/meta/test_release_tools.py | 22 +++++++ tools/README.md | 3 + tools/check_dist.py | 4 ++ tools/release_check.py | 106 +++++++++++++++++++++++++++++++ 11 files changed, 179 insertions(+), 17 deletions(-) create mode 100644 tests/meta/test_release_tools.py create mode 100644 tools/release_check.py diff --git a/CHANGELOG.md b/CHANGELOG.md index dcfa24a..cac0707 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,32 @@ # Changelog -## 0.1.0a0 - -- Initial scaffold extracted from the `molcryst` chemistry data layer. -- Added packaged element metadata and radii tables. -- Added registry design separating operational quantity from scientific - classification. -- Added radii policies with substitution and linear transfer models. -- Added public packaged-set retrieval helpers: `get_builtin_set()` and - `get_radii_set()`. -- Added runnable notebooks together with generated Markdown notebook pages in - the docs. -- Expanded the docs with dataset guidance, module-level API pages, and a tools - overview. 
-- Added docstrings across the main importable modules, including important - internal helpers used across modules. +## 0.1.0 - 2026-03-15 + +First public release. + +### Added + +- Packaged element metadata and curated radii tables. +- Quantity-aware registry metadata that separates operational lookup quantity + from scientific classification and dataset usage role. +- Provenance-aware radii policies with deterministic resolution order. +- Substitution and linear-transfer support for restoring missing values from + curated support datasets. +- Public helpers for inspecting quantities, dataset metadata, and packaged + built-in sets. +- Runnable notebooks together with generated Markdown notebook pages in the + documentation. +- Validation and maintenance tools for registry checks, notebook export, README + generation, and distribution-artifact inspection. + +### Documentation + +- Expanded dataset guides with citations and selection-oriented descriptions. +- Added module-level API pages and notebook walkthroughs. +- Added developer-facing curation and tooling notes. + +### Packaging + +- Built and validated wheel and source-distribution artifacts. +- Added CI coverage for linting, tests, docs builds, notebook sync, and + distribution checks. diff --git a/README.md b/README.md index 9d5eb79..d9d90bd 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ The repository also ships small maintenance tools. The most important ones are: - `python tools/export_notebooks.py` — turn notebooks into Markdown pages for the docs, - `python tools/gen_readme.py` — regenerate `README.md` from this page. +- `python tools/release_check.py` — run the full release-preparation checklist, including linting, tests, docs, builds, and artifact validation. See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) for a short description of each script. diff --git a/docs/guide/install.md b/docs/guide/install.md index 00a4f22..e7e0697 100644 --- a/docs/guide/install.md +++ b/docs/guide/install.md @@ -21,3 +21,10 @@ Those extras currently cover: - `test` — pytest and test-only compatibility helpers, - `docs` — MkDocs and API documentation tooling, - `dev` — flake8, build, and release metadata checks. + + +For a full local pre-release validation pass after installing those extras, run: + +```bash +python tools/release_check.py +``` diff --git a/docs/index.md b/docs/index.md index c59777e..3bc7495 100644 --- a/docs/index.md +++ b/docs/index.md @@ -116,6 +116,7 @@ The repository also ships small maintenance tools. The most important ones are: - `python tools/export_notebooks.py` — turn notebooks into Markdown pages for the docs, - `python tools/gen_readme.py` — regenerate `README.md` from this page. +- `python tools/release_check.py` — run the full release-preparation checklist, including linting, tests, docs, builds, and artifact validation. See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) for a short description of each script. 
diff --git a/mkdocs.yml b/mkdocs.yml index c3e560c..2658174 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -11,6 +11,7 @@ plugins: - mkdocstrings: handlers: python: + paths: [src] options: show_root_heading: true show_source: false diff --git a/pyproject.toml b/pyproject.toml index 065faab..b712101 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ authors = [ ] keywords = ["chemistry", "materials", "crystallography", "reference data", "atomic radii"] classifiers = [ - "Development Status :: 2 - Pre-Alpha", + "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Chemistry", "Topic :: Software Development :: Libraries", @@ -26,6 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Operating System :: OS Independent", + "Typing :: Typed", ] dependencies = [] @@ -34,6 +35,7 @@ Homepage = "https://delonecommons.github.io/atomref/" Documentation = "https://delonecommons.github.io/atomref/" Repository = "https://github.com/DeloneCommons/atomref" Issues = "https://github.com/DeloneCommons/atomref/issues" +Changelog = "https://github.com/DeloneCommons/atomref/blob/main/CHANGELOG.md" [project.optional-dependencies] test = [ diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py index 44cdb9a..3dc1f76 100644 --- a/src/atomref/__about__.py +++ b/src/atomref/__about__.py @@ -1 +1 @@ -__version__ = '0.1.0a0' +__version__ = "0.1.0" diff --git a/tests/meta/test_release_tools.py b/tests/meta/test_release_tools.py new file mode 100644 index 0000000..7cbff90 --- /dev/null +++ b/tests/meta/test_release_tools.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +# Keeping this as a subprocess test ensures the helper stays importable and +# exposes a stable CLI entry point without running the expensive full release +# workflow inside the unit test suite. +def test_release_check_help() -> None: + result = subprocess.run( + [sys.executable, "tools/release_check.py", "--help"], + cwd=REPO_ROOT, + check=True, + capture_output=True, + text=True, + ) + assert "release-preparation checks" in result.stdout diff --git a/tools/README.md b/tools/README.md index 8074430..943900d 100644 --- a/tools/README.md +++ b/tools/README.md @@ -13,6 +13,8 @@ release preparation. - `export_notebooks.py` — render the bundled notebooks into Markdown pages under `docs/notebooks/`. - `gen_readme.py` — regenerate `README.md` from `docs/index.md`. +- `release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. ## Typical commands @@ -21,6 +23,7 @@ python tools/check_registry.py python tools/check_notebooks.py python tools/export_notebooks.py python tools/gen_readme.py +python tools/release_check.py ``` The main project README is generated from the documentation home page. 
To change diff --git a/tools/check_dist.py b/tools/check_dist.py index 92cef29..df70910 100644 --- a/tools/check_dist.py +++ b/tools/check_dist.py @@ -23,6 +23,8 @@ "src/atomref/data/registry.json", "src/atomref/py.typed", "README.md", + "CHANGELOG.md", + "DEV_PLAN.md", "LICENSE", "pyproject.toml", "notebooks/01-quickstart.ipynb", @@ -33,6 +35,8 @@ "docs/notebooks/03-custom-sets-and-discovery.md", "tools/check_notebooks.py", "tools/export_notebooks.py", + "tools/gen_readme.py", + "tools/release_check.py", "tools/README.md", } diff --git a/tools/release_check.py b/tools/release_check.py new file mode 100644 index 0000000..a357a18 --- /dev/null +++ b/tools/release_check.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""Run the full release-preparation checks for the repository. + +This helper is intended for local release preparation. It runs the same checks +that are exercised separately in CI, then builds source and wheel artifacts, +validates them, and smoke-tests the built wheel in an isolated virtual +environment. +""" + +from __future__ import annotations + +import argparse +from pathlib import Path +import shutil +import subprocess +import sys +import tempfile +import venv + + +REPO_ROOT = Path(__file__).resolve().parents[1] +DIST_DIR = REPO_ROOT / "dist" +BUILD_DIR = REPO_ROOT / "build" + + +def _run(*args: str, env: dict[str, str] | None = None) -> None: + """Run one subprocess command in the repository root.""" + + print("+", " ".join(args)) + subprocess.run(args, cwd=REPO_ROOT, check=True, env=env) + + +def _fresh_build_dirs() -> None: + """Remove build artifacts from previous runs.""" + + shutil.rmtree(DIST_DIR, ignore_errors=True) + shutil.rmtree(BUILD_DIR, ignore_errors=True) + + +def _smoke_test_wheel() -> None: + """Install the built wheel into a temporary virtualenv and import it.""" + + wheels = sorted(DIST_DIR.glob("*.whl")) + if not wheels: + raise RuntimeError("no wheel found in dist/") + wheel = wheels[-1] + + with tempfile.TemporaryDirectory(prefix="atomref-release-check-") as tmp: + env_dir = Path(tmp) / "venv" + builder = venv.EnvBuilder(with_pip=True) + builder.create(env_dir) + bindir = "Scripts" if sys.platform.startswith("win") else "bin" + python = env_dir / bindir / "python" + _run(str(python), "-m", "pip", "install", "--no-deps", str(wheel)) + _run( + str(python), + "-c", + ( + "import atomref as ar; " + "assert ar.get_covalent_radius('C') == 0.76; " + "assert ar.get_vdw_radius('C') == 1.77; " + "assert 'atomic_radius' in ar.list_quantities(); " + "assert 'rahm2016' in ar.list_dataset_ids(" + "'atomic_radius', usage_role='support')" + ), + ) + + +def main() -> int: + """Run lint, tests, docs, build, metadata, and wheel smoke checks.""" + + parser = argparse.ArgumentParser( + description="Run the full release-preparation checks for the repository.", + ) + parser.add_argument( + "--skip-docs", + action="store_true", + help="skip the strict MkDocs build step", + ) + parser.add_argument( + "--skip-smoke-test", + action="store_true", + help="skip the temporary-virtualenv wheel import smoke test", + ) + args = parser.parse_args() + + _run("flake8", "src", "tests", "tools") + _run(sys.executable, "tools/check_registry.py") + _run(sys.executable, "tools/check_notebooks.py") + _run(sys.executable, "tools/export_notebooks.py", "--check") + _run(sys.executable, "tools/gen_readme.py", "--check") + _run(sys.executable, "-m", "pytest", "-q") + if not args.skip_docs: + _run("mkdocs", "build", "--strict") + + _fresh_build_dirs() + _run(sys.executable, "-m", "build") + 
_run(sys.executable, "-m", "twine", "check", "dist/*") + _run(sys.executable, "tools/check_dist.py", "dist") + if not args.skip_smoke_test: + _smoke_test_wheel() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From a9900821f4d696f014787bf943546ed3f9aea851 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 19:01:49 +0300 Subject: [PATCH 12/15] Adds functionality for X-H bonds --- CHANGELOG.md | 59 ++++ README.md | 57 +++- docs/api/index.md | 23 +- docs/api/policy.md | 21 +- docs/api/registry.md | 10 + docs/api/transfer.md | 18 +- docs/api/xh.md | 22 ++ docs/datasets/index.md | 12 +- docs/datasets/xh_bond_length.md | 39 +++ docs/dev/architecture.md | 84 ++++- docs/guide/policies.md | 113 +++++-- docs/guide/quickstart.md | 11 +- docs/index.md | 57 +++- docs/notebooks/01-quickstart.md | 2 +- mkdocs.yml | 2 + src/atomref/__about__.py | 2 +- src/atomref/__init__.py | 22 +- src/atomref/data/registry.json | 64 ++++ src/atomref/data/xh_bond_length.csv | 119 +++++++ src/atomref/policy.py | 463 +++++++++++++++++++++----- src/atomref/radii.py | 81 ++++- src/atomref/registry.py | 113 ++++++- src/atomref/transfer.py | 31 +- src/atomref/xh.py | 168 ++++++++++ tests/meta/test_imports.py | 1 + tests/meta/test_package_data.py | 2 + tests/meta/test_public_api.py | 7 + tests/meta/test_registry_integrity.py | 8 +- tests/policy/test_policy.py | 69 ++++ tests/radii/test_selection.py | 50 +++ tests/registry/test_registry.py | 50 ++- tests/xh/test_xh.py | 67 ++++ tools/check_registry.py | 3 +- 33 files changed, 1645 insertions(+), 205 deletions(-) create mode 100644 docs/api/xh.md create mode 100644 docs/datasets/xh_bond_length.md create mode 100644 src/atomref/data/xh_bond_length.csv create mode 100644 src/atomref/xh.py create mode 100644 tests/policy/test_policy.py create mode 100644 tests/xh/test_xh.py diff --git a/CHANGELOG.md b/CHANGELOG.md index cac0707..fbb2887 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## 0.1.3 - 2026-03-15 + +### Added + +- Support for using generic policies and wrapper policies as transfer sources in + `SubstitutionTransfer` and `LinearTransfer`. +- Public `atomref.xh` module docs and examples for policy-backed predictor + workflows. + +### Changed + +- `LinearTransfer` now treats predictors as **sources** rather than only raw + datasets, while still keeping the current runtime to one predictor at a time. +- Generic policy resolution now supports blocked element keys, which is used by + the X–H helper to prevent invalid `H` parent-element lookups. +- Transfer results now preserve nested-policy provenance through + `resolved_from` and explanatory notes when a policy source is involved. + +## 0.1.2 - 2026-03-15 + +### Added + +- New `xh_bond_length` quantity family. +- Packaged provisional X–H dataset `csd_legacy_xh_cno` with ConQuest/CSD + hydrogen-normalisation targets for `C`, `N`, and `O`. +- New `atomref.xh` convenience layer with `XHPolicy`, `DEFAULT_XH_POLICY`, set + listing helpers, and X–H lookup helpers. + +### Documentation + +- Added X–H dataset and API pages. +- Documented the provisional scope of X–H support in `0.1.x` and the planned + broader follow-up in `0.2.x`. + +## 0.1.1 - 2026-03-15 + +### Added + +- Public generic lookup helpers `lookup_value(...)` and `get_value(...)`. +- Tests for alias normalization, immutable metadata, non-finite-value rejection, + collision detection, and explicit placeholder notes. 
+ +### Changed + +- Registry metadata returned by `get_dataset_info(...)` is now frozen so callers + cannot mutate the cached registry state. +- Dataset-alias resolution now normalizes Unicode and dash variants more + robustly. +- Custom-set construction and policy configuration now reject normalized-key + collisions and non-finite numeric values. +- Radii-specific wrappers now reject negative override and fallback values. +- Base and substitution lookups now emit explicit placeholder notes when the + returned numeric value is a dataset placeholder. +- `LinearTransfer` now validates empty-predictor and invalid-`min_points` + configurations eagerly. +- The docs now explain the distinction between quantity, domain, dataset, and + policy, and clarify that the current runtime supports only the `element` + domain. + ## 0.1.0 - 2026-03-15 First public release. diff --git a/README.md b/README.md index d9d90bd..0d784fc 100644 --- a/README.md +++ b/README.md @@ -14,15 +14,36 @@ It is not meant to be yet another periodic-table encyclopedia. The package is for code that needs stable atomic reference values with explicit provenance, clear fallback behavior, and honest handling of incomplete preferred datasets. -What you get in v0.1: +What you get in the current `0.1.x` line: - stable element metadata, - curated named radii sets, +- provisional X–H bond-length support for hydrogen-normalisation workflows, - dataset provenance and coverage metadata, - deterministic lookup policies, -- substitution and linear transfer from support datasets into target datasets, +- substitution and linear transfer from support datasets or policies into target datasets, - user-defined custom element-indexed scalar sets. +## Core terms + +`atomref` uses a small vocabulary on purpose. + +- **quantity** — the operational property family being requested, such as + `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or + `xh_bond_length`. +- **domain** — the key space used to index that quantity. In the current + runtime, the supported domain is `element`, meaning lookups are keyed by an + element symbol. +- **dataset** — one curated named table inside a quantity, such as + `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`. +- **policy** — the ordered rule set that decides what value to return when the + preferred dataset is incomplete. + +The metadata layer already records `domain` explicitly because the package is +built for later extension, but the current runtime intentionally keeps the +implementation narrow and stable: **v0.1 resolves only element-domain scalar +values**. + ## Why this exists Scientific software often wants a complete lookup table, but the best dataset @@ -31,7 +52,7 @@ Instead of hiding ad hoc defaults inside algorithm code, you choose a target set, describe how missing values may be restored, and keep provenance on what was actually returned. -The default v0.1 behavior is intentionally simple and practical: +The default `0.1.x` behavior is intentionally simple and practical: - **Cordero covalent radii** (`cordero2008`) are the preferred covalent target set, with missing values substituted from the **legacy CSD covalent radii** @@ -39,6 +60,10 @@ The default v0.1 behavior is intentionally simple and practical: - **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target set, with missing values restored from the **Rahm isodensity atomic radii** (`rahm2016`) through a fitted linear transfer. 
+- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a + provisional sparse X–H target set for `C`, `N`, and `O`, with other parent + elements inferred from **Cordero covalent radii** through a fitted linear + policy. ## Quick example @@ -48,6 +73,8 @@ The default v0.1 behavior is intentionally simple and practical: 0.76 >>> ar.get_vdw_radius("O") 1.5 +>>> ar.get_xh_bond_length("N") +1.015 >>> lookup = ar.lookup_vdw_radius("Pm") >>> lookup.value 2.8972265395148358 @@ -58,16 +85,17 @@ The default v0.1 behavior is intentionally simple and practical: ``` `get_*` returns only the number. `lookup_*` returns a `LookupResult` that also -records where the value came from and whether a transfer model was involved. +records where the value came from and whether a transfer model or policy source +was involved. You can inspect the packaged quantity and dataset catalog directly: ```pycon >>> import atomref as ar >>> ar.list_quantities() -('covalent_radius', 'van_der_waals_radius', 'atomic_radius') ->>> ar.get_quantity_info("atomic_radius") -QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') >>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] ['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] ``` @@ -79,14 +107,14 @@ You can also load a packaged set directly: >>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") >>> vdw.get("O") 1.5 ->>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) ->>> raw.get("Pm") -2.83 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 ``` ## Notebook walkthroughs -The repository ships example notebooks for the main v0.1 workflows. In the +The repository ships example notebooks for the main `0.1.x` workflows. In the documentation they are also available as rendered Markdown pages, so users can read them without opening Jupyter first. @@ -100,7 +128,7 @@ read them without opening Jupyter first. `atomref` is designed as a standalone package, but within Delone Commons it is primarily intended to support chemistry-aware packages such as: -- `molcryst`, for covalent-bond detection and contact analysis, +- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows, - future `chemvoro`, for chemistry-aware contact and hydrogen workflows. By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical @@ -115,8 +143,9 @@ The repository also ships small maintenance tools. The most important ones are: - `python tools/check_notebooks.py` — execute notebook code cells, - `python tools/export_notebooks.py` — turn notebooks into Markdown pages for the docs, -- `python tools/gen_readme.py` — regenerate `README.md` from this page. -- `python tools/release_check.py` — run the full release-preparation checklist, including linting, tests, docs, builds, and artifact validation. 
+- `python tools/gen_readme.py` — regenerate `README.md` from this page, +- `python tools/release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) for a short description of each script. diff --git a/docs/api/index.md b/docs/api/index.md index e69e719..f56eb7c 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -3,20 +3,24 @@ The public API is small on purpose. Most users will spend most of their time in the top-level package namespace and -in the radii helpers. The lower-level modules are still documented because they -expose the actual data model behind the package. +in the quantity-specific convenience helpers. The lower-level modules are still +documented because they expose the actual data model behind the package. ## Common tasks -- get a single value: use `get_covalent_radius(...)` or `get_vdw_radius(...)` -- inspect provenance: use `lookup_covalent_radius(...)` or - `lookup_vdw_radius(...)` +- get a single value: use `get_covalent_radius(...)`, `get_vdw_radius(...)`, or + `get_xh_bond_length(...)` +- inspect provenance: use `lookup_covalent_radius(...)`, + `lookup_vdw_radius(...)`, `lookup_xh_bond_length(...)`, or the generic + `lookup_value(...)` - browse packaged datasets: use `list_quantities()`, `get_quantity_info(...)`, - `list_dataset_infos(...)`, or `list_radii_set_infos(...)` -- load a packaged set directly: use `get_builtin_set(...)` or `get_radii_set(...)` + `list_dataset_infos(...)`, `list_radii_set_infos(...)`, or + `list_xh_set_infos(...)` +- load a packaged set directly: use `get_builtin_set(...)`, `get_radii_set(...)`, + or `get_xh_set(...)` - define a custom set: use `ElementScalarSet.from_mapping(...)` -- define transfer-backed lookup behavior: use `RadiiPolicy`, - `SubstitutionTransfer`, and `LinearTransfer` +- define transfer-backed lookup behavior: use `ValuePolicy`, `RadiiPolicy`, + `XHPolicy`, `SubstitutionTransfer`, and `LinearTransfer` ## Module reference @@ -26,3 +30,4 @@ expose the actual data model behind the package. - [Transfer models](transfer.md) - [Generic policy core](policy.md) - [Radii API](radii.md) +- [X–H API](xh.md) diff --git a/docs/api/policy.md b/docs/api/policy.md index 99d51d9..5b68440 100644 --- a/docs/api/policy.md +++ b/docs/api/policy.md @@ -1,9 +1,22 @@ # atomref.policy -This module contains the generic resolver that sits below the radii-specific -API. +This module contains the generic resolver that sits below the radii-specific and +X–H-specific convenience APIs. -It is useful when you want to understand exactly how overrides, base datasets, -transfers, fallbacks, and missing values are ordered and reported. +Use it when you want to work directly with the common value-selection engine: + +- `ValuePolicy` — generic element-domain policy configuration, +- `lookup_value(...)` — resolve one value together with provenance, +- `get_value(...)` — resolve only the numeric value, +- `LookupResult` — the structured result object returned by the resolver. + +A few practical notes: + +- The current runtime supports **element-domain** scalar policies. +- `ValuePolicy` normalizes element-symbol overrides eagerly. +- Transfer sources may be packaged datasets, custom sets, generic policies, or + wrapper policies that expose `as_value_policy()`. +- `LookupResult.is_placeholder` refers to the returned numeric value itself, not + to whether any transfer happened. 
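+
+For orientation, here is a minimal sketch of the generic layer over one packaged
+dataset (the xenon override value is illustrative, not a recommendation):
+
+```python
+import atomref as ar
+
+# a generic element-domain policy over one packaged covalent-radius dataset
+policy = ar.ValuePolicy(
+    base=ar.DatasetRef("covalent_radius", "cordero2008"),
+    overrides={"Xe": 1.40},
+)
+
+result = ar.lookup_value("Xe", policy=policy)
+print(result.value, result.source)       # the override wins: source == "override"
+
+print(ar.get_value("C", policy=policy))  # only the number (0.76 from cordero2008)
+```
+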
::: atomref.policy diff --git a/docs/api/registry.md b/docs/api/registry.md index 4f664e6..9c41653 100644 --- a/docs/api/registry.md +++ b/docs/api/registry.md @@ -6,4 +6,14 @@ If you want to understand how `atomref` classifies datasets, how aliases are resolved, or how built-in CSV tables are turned into typed in-memory objects, this is the key module to read. +The most important registry ideas are: + +- **quantity** — the operational property family, +- **domain** — the key space used to index that quantity, +- **dataset** — one curated named table inside the quantity. + +In the current runtime, the implemented lookup domain is `element`. +The registry still stores `domain` explicitly because the metadata design is +meant to stay reusable as the package grows. + ::: atomref.registry diff --git a/docs/api/transfer.md b/docs/api/transfer.md index eab5672..797626e 100644 --- a/docs/api/transfer.md +++ b/docs/api/transfer.md @@ -1,9 +1,21 @@ # atomref.transfer Transfer models describe how missing target values may be restored from other -datasets. +sources. -In v0.1 the core built-in models are direct substitution and one-predictor -linear transfer. +In the current runtime the built-in models are: + +- direct substitution (`SubstitutionTransfer`), +- one-predictor linear transfer (`LinearTransfer`). + +A transfer source may be: + +- a packaged dataset reference, +- a custom `ElementScalarSet`, +- a generic `ValuePolicy`, +- a wrapper policy that exposes `as_value_policy()`. + +`LinearTransfer` currently accepts exactly one predictor source at runtime, even +though the public API stores predictors as a tuple for forward compatibility. ::: atomref.transfer diff --git a/docs/api/xh.md b/docs/api/xh.md new file mode 100644 index 0000000..cca073e --- /dev/null +++ b/docs/api/xh.md @@ -0,0 +1,22 @@ +# atomref.xh + +This module provides the provisional X–H bond-length helpers introduced in the +`0.1.x` line. + +It is intentionally narrow: + +- one packaged sparse target dataset, `csd_legacy_xh_cno`, +- one wrapper policy, `XHPolicy`, +- convenience helpers for listing packaged X–H sets and resolving X–H values. + +The built-in quantity is keyed by the **parent element `X`** in `X–H` and is +currently aimed at hydrogen-position normalisation or related geometry +workflows. + +In the default policy: + +- `C`, `N`, and `O` use curated ConQuest/CSD defaults, +- other parent elements may be inferred from `cordero2008`, +- fuller X–H literature support is planned for `0.2.x`. + +::: atomref.xh diff --git a/docs/datasets/index.md b/docs/datasets/index.md index 20d4c3e..d699ff0 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -4,6 +4,7 @@ Instead, the package records several layers of classification: - **quantity** — the operational property being requested, +- **domain** — the key space used to index that quantity, - **semantic class** — what the dataset scientifically represents, - **origin class** — how the values were obtained, - **phase context** — what physical context they describe, @@ -22,13 +23,16 @@ The most useful catalog helpers are: - `atomref.get_quantity_info(...)` - `atomref.list_dataset_infos(...)` - `atomref.list_radii_set_infos(...)` +- `atomref.list_xh_set_infos(...)` -If you only need dataset ids, use `list_dataset_ids(...)` or `list_radii_sets(...)`. -If you want the packaged values themselves, use `get_builtin_set(...)` or -`get_radii_set(...)`. +If you only need dataset ids, use `list_dataset_ids(...)`, `list_radii_sets(...)`, +or `list_xh_sets(...)`. 
+If you want the packaged values themselves, use `get_builtin_set(...)`, +`get_radii_set(...)`, or `get_xh_set(...)`. -## Built-in quantity families in v0.1 +## Built-in quantity families in `0.1.x` - [Covalent radius](covalent_radius.md) - [van der Waals radius](van_der_waals_radius.md) - [Atomic radius](atomic_radius.md) +- [X–H bond length](xh_bond_length.md) diff --git a/docs/datasets/xh_bond_length.md b/docs/datasets/xh_bond_length.md new file mode 100644 index 0000000..2bef656 --- /dev/null +++ b/docs/datasets/xh_bond_length.md @@ -0,0 +1,39 @@ +# X–H bond length + +The `xh_bond_length` quantity is a small provisional addition in the `0.1.x` +line. + +Its purpose is not to claim a complete literature survey of X–H bond lengths. +Instead, it provides a stable, provenance-aware starting point for +hydrogen-normalisation workflows and related geometry code. + +## Packaged target dataset + +### CSD legacy X–H neutron-normalisation targets (`csd_legacy_xh_cno`) + +- **What it is:** the fixed `C–H`, `N–H`, and `O–H` target lengths used by + ConQuest for terminal-hydrogen normalisation. +- **Coverage:** only parent elements `C`, `N`, and `O`. +- **Values:** `C–H = 1.089 Å`, `N–H = 1.015 Å`, `O–H = 0.993 Å`. +- **Primary provenance:** the ConQuest user guide section *Hydrogen Atom + Location in Crystal Structure Analyses*. +- **Secondary provenance:** Allen & Bruno (2010), which the ConQuest guide cites + for these defaults. + +## How `atomref` uses it + +The built-in `DEFAULT_XH_POLICY` treats `csd_legacy_xh_cno` as a sparse target +set and restores missing parent elements through a fitted linear transfer from +`cordero2008` covalent radii. + +That means the package draws a sharp line between: + +- **curated dataset values** — currently only `C`, `N`, and `O`, and +- **policy-generated values** — inferred for other parent elements when the + predictor policy can supply a covalent radius. + +## Scope note + +This is intentionally a small addendum rather than full X–H support. +Broader X–H datasets, richer policies, and more complete literature treatment +are planned for `0.2.x`. diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md index 7dd08b4..cbdf743 100644 --- a/docs/dev/architecture.md +++ b/docs/dev/architecture.md @@ -1,7 +1,83 @@ # Architecture -Publicly, v0.1 is radii-first. +Publicly, `atomref` is still radii-first, with a small provisional X–H helper. -Internally, the package is built around element-indexed scalar datasets plus a -small transfer layer. That keeps the public API simple while leaving a clean -path to later quantities such as X-H bond lengths. +Internally, the package is built around four layers: + +1. **elements** — stable element metadata and symbol canonicalization, +2. **registry** — curated quantity and dataset metadata plus packaged data + loading, +3. **policy core** — generic value selection with overrides, transfers, + fallbacks, blocked keys, and provenance, +4. **quantity wrappers** — convenience APIs such as `atomref.radii` and + `atomref.xh`. + +## Core terminology + +A few terms are deliberately separated in the design: + +- **quantity** — the operational property family being requested, +- **domain** — the key space used to index that quantity, +- **dataset** — one curated source table inside the quantity, +- **policy** — the ordered rule set used to select a final value. 
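+
+The same four terms surface directly in the public API. A minimal sketch, with
+outputs omitted:
+
+```python
+import atomref as ar
+
+# quantity: the property family, together with its lookup domain and units
+print(ar.get_quantity_info("van_der_waals_radius"))
+
+# dataset: one curated named table inside that quantity
+print(ar.get_dataset_info(ar.DatasetRef("van_der_waals_radius", "alvarez2013")).ref)
+
+# policy: the ordered rule set that selects the returned value, with provenance
+print(ar.lookup_vdw_radius("Pm").source)
+```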
+ +This separation is what allows the package to say, for example, that +`rahm2016` belongs to the `atomic_radius` quantity but can still act as support +data in a van der Waals policy. + +## Domain support in the current runtime + +The registry schema is domain-aware, but the current resolver intentionally +implements only one domain: + +- `element` + +That means: + +- packaged built-in sets are currently element-indexed scalar tables, +- `ValuePolicy` resolves element symbols, +- transfer fitting is performed over element-wise overlap. + +The metadata keeps `domain` explicit now so later versions can extend the data +model without having to reinterpret existing registry entries. + +## Policy resolution and transfer sources + +The generic resolver works in a fixed order: + +1. blocked keys, +2. overrides, +3. base dataset, +4. transfer models, +5. fallback, +6. missing. + +Transfer sources can be: + +- packaged datasets, +- custom `ElementScalarSet` objects, +- generic `ValuePolicy` objects, +- wrapper policies exposing `as_value_policy()`. + +That last point is important. It means higher-level code can express +"infer values from my chosen covalent-radii policy" instead of being forced to +refer to one hard-coded predictor dataset. + +## Placeholder handling + +Placeholder semantics stay attached to the value that was actually returned. +This means `LookupResult.is_placeholder` can be true for: + +- a base lookup, +- a substitution transfer, +- a nested policy used as a transfer source. + +A linear transfer normally returns a computed value and therefore does not carry +placeholder status itself. + +## Why the design stays small + +The package deliberately avoids a large object graph or a chemistry-specific DSL. +A quantity wrapper is usually only a thin adapter over the generic policy core. +That keeps the internals easy to test and lets other scientific packages reuse +`atomref` without bringing in the rest of the Delone Commons stack. diff --git a/docs/guide/policies.md b/docs/guide/policies.md index 62663ae..fd53047 100644 --- a/docs/guide/policies.md +++ b/docs/guide/policies.md @@ -7,18 +7,39 @@ That may sound simple, but in practice scientific datasets are often incomplete. A policy makes the decision process explicit instead of hiding it in algorithm code. +## Terms used in the policy layer + +A few terms appear repeatedly in the API and docs: + +- **quantity** — the operational property family being requested. +- **domain** — the lookup key space. In the current runtime that means + `element`, so lookups are keyed by element symbol. +- **dataset** — a curated named table inside one quantity. +- **policy** — the ordered rule set used to resolve missing values. + +The quantity and dataset live in the curated registry. The policy is the +selection logic that sits on top of them. + ## Resolution order -In v0.1 every lookup follows the same ordered path: +In `0.1.x` every lookup follows the same ordered path: -1. **Override** -2. **Base dataset** -3. **Transfer models**, in the order you listed them -4. **Fallback** -5. **Missing** +1. **Blocked key** (optional) +2. **Override** +3. **Base dataset** +4. **Transfer models**, in the order you listed them +5. **Fallback** +6. **Missing** Each step has a specific meaning. +### Blocked key + +Some quantity wrappers need to declare that certain domain keys should never be +resolved, even if a transfer model could otherwise invent a number. 
The current +X–H helper uses this for `H`, because `xh_bond_length` is keyed by the parent +atom `X` in `X–H`, not by hydrogen itself. + ### Override An override is a value you provide directly for a specific element. It wins over @@ -37,16 +58,28 @@ default vdW policy starts from the **Alvarez van der Waals radii** A transfer model is used only when the base dataset has no value for the requested element. -Built-in transfer models in v0.1 are: +Built-in transfer models in `0.1.x` are: + +- `SubstitutionTransfer` — take a value directly from another dataset or policy, +- `LinearTransfer` — infer a target-equivalent value from another dataset or + policy through a fitted linear model. -- `SubstitutionTransfer` — take a value directly from another dataset, -- `LinearTransfer` — infer a target-equivalent value from a support dataset - through a fitted linear model. +`LinearTransfer` already accepts a tuple of predictors in the API, but the +current runtime intentionally supports exactly one predictor source. That keeps +the implementation simple now while leaving room for later multi-predictor +linear models. -`LinearTransfer` already accepts a tuple of predictors in the API, but the v0.1 -runtime intentionally supports exactly one predictor dataset. That keeps the -implementation simple now while leaving room for later multi-predictor linear -models. +Transfer sources can now be: + +- a packaged dataset reference (`DatasetRef`), +- a custom `ElementScalarSet`, +- a generic `ValuePolicy`, +- a wrapper policy such as `RadiiPolicy` or `XHPolicy`. + +When a transfer source is itself a policy, `atomref` uses the values selected by +that policy. This lets higher-level workflows express things like “infer X–H +lengths from my chosen covalent-radii policy” instead of hard-coding a specific +support dataset. ### Fallback @@ -60,6 +93,25 @@ If nothing above can produce a value and no fallback was configured, the result is simply missing. In that case `get_*` returns `None`, while `lookup_*` returns a `LookupResult` with `source="missing"` and explanatory notes. +## Placeholder values and `is_placeholder` + +Some support datasets use placeholder numbers to stand in for “unknown but keep +this legacy table dense enough for downstream heuristics”. + +`LookupResult.is_placeholder` answers one narrow question: + +> Is the **returned numeric value itself** marked as a placeholder by the source +> that supplied it? + +It does **not** mean “a transfer happened”. Examples: + +- a base lookup can have `is_placeholder=True` if the base dataset contains a + placeholder value, +- a substitution transfer can also have `is_placeholder=True` if it copied a + placeholder from the transfer source, +- a linear transfer is computed, not copied, so `is_placeholder` is normally + `False`. + ## Target datasets and support datasets `atomref` separates **what a dataset is used for** from **what it scientifically @@ -68,6 +120,7 @@ represents**. That is why the package stores: - the operational **quantity**, +- the lookup **domain**, - the scientific **semantic class**, - the package-level **usage role**. @@ -77,7 +130,9 @@ radii, but they are not the same thing as a condensed-phase structural vdW radius set. In `atomref`, that difference is recorded in the metadata instead of being hidden. 
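+
+The effect is easiest to see in the provenance of a resolved value. A minimal
+sketch against the default X–H policy (the exact number is whatever the fitted
+transfer produces, so only the provenance fields are printed):
+
+```python
+import atomref as ar
+
+# sulfur is outside the curated C/N/O defaults, so the default X-H policy may
+# resolve it through a linear transfer whose predictor is the covalent-radii
+# policy rather than a single hard-coded dataset
+lookup = ar.lookup_xh_bond_length("S")
+
+print(lookup.source)         # e.g. "transfer_linear" when the transfer applies
+print(lookup.resolved_from)  # dataset references behind the predictor policy
+print(lookup.notes)          # the notes record the policy-backed predictor
+```
+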
-## Example +## Examples + +A standard dataset-backed transfer: ```python import atomref as ar @@ -94,9 +149,27 @@ policy = ar.RadiiPolicy( ) ``` -With that policy: +A policy-backed transfer source: + +```python +import atomref as ar + +xh_policy = ar.XHPolicy( + base_set="csd_legacy_xh_cno", + transfers=( + ar.LinearTransfer( + predictors=(ar.DEFAULT_COVALENT_POLICY,), + min_points=3, + ), + ), +) +``` + +With that X–H policy: -- xenon uses the explicit override, -- elements present in `alvarez2013` use the base vdW value, -- missing elements may be restored from `rahm2016`, -- anything still unresolved remains missing unless you also set a fallback. +- `C`, `N`, and `O` use the curated ConQuest defaults, +- missing parent elements may be inferred from the **selected covalent-radii + policy**, not just from one hard-coded support dataset, +- if the predictor policy itself needed a transfer to produce a covalent radius, + the resulting `LookupResult` still records that provenance in `resolved_from` + and `notes`. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md index 3649653..72e6858 100644 --- a/docs/guide/quickstart.md +++ b/docs/guide/quickstart.md @@ -11,6 +11,8 @@ The two most important user-facing ideas in `atomref` are: 0.76 >>> ar.get_vdw_radius("O") 1.5 +>>> ar.get_xh_bond_length("N") +1.015 >>> lookup = ar.lookup_vdw_radius("Pm") >>> lookup.value 2.8972265395148358 @@ -29,9 +31,9 @@ You can inspect the packaged quantity layer directly: ```pycon >>> import atomref as ar >>> ar.list_quantities() -('covalent_radius', 'van_der_waals_radius', 'atomic_radius') ->>> ar.get_quantity_info("atomic_radius") -QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') >>> [info.ref.set_id for info in ar.list_radii_set_infos("van_der_waals", usage_role="target")] ['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] ``` @@ -46,6 +48,9 @@ And you can load a packaged set object directly: >>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) >>> raw.get("Pm") 2.83 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 ``` For longer, runnable examples see: diff --git a/docs/index.md b/docs/index.md index 3bc7495..17c5481 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,15 +14,36 @@ It is not meant to be yet another periodic-table encyclopedia. The package is for code that needs stable atomic reference values with explicit provenance, clear fallback behavior, and honest handling of incomplete preferred datasets. -What you get in v0.1: +What you get in the current `0.1.x` line: - stable element metadata, - curated named radii sets, +- provisional X–H bond-length support for hydrogen-normalisation workflows, - dataset provenance and coverage metadata, - deterministic lookup policies, -- substitution and linear transfer from support datasets into target datasets, +- substitution and linear transfer from support datasets or policies into target datasets, - user-defined custom element-indexed scalar sets. 
+## Core terms + +`atomref` uses a small vocabulary on purpose. + +- **quantity** — the operational property family being requested, such as + `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or + `xh_bond_length`. +- **domain** — the key space used to index that quantity. In the current + runtime, the supported domain is `element`, meaning lookups are keyed by an + element symbol. +- **dataset** — one curated named table inside a quantity, such as + `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`. +- **policy** — the ordered rule set that decides what value to return when the + preferred dataset is incomplete. + +The metadata layer already records `domain` explicitly because the package is +built for later extension, but the current runtime intentionally keeps the +implementation narrow and stable: **v0.1 resolves only element-domain scalar +values**. + ## Why this exists Scientific software often wants a complete lookup table, but the best dataset @@ -31,7 +52,7 @@ Instead of hiding ad hoc defaults inside algorithm code, you choose a target set, describe how missing values may be restored, and keep provenance on what was actually returned. -The default v0.1 behavior is intentionally simple and practical: +The default `0.1.x` behavior is intentionally simple and practical: - **Cordero covalent radii** (`cordero2008`) are the preferred covalent target set, with missing values substituted from the **legacy CSD covalent radii** @@ -39,6 +60,10 @@ The default v0.1 behavior is intentionally simple and practical: - **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target set, with missing values restored from the **Rahm isodensity atomic radii** (`rahm2016`) through a fitted linear transfer. +- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a + provisional sparse X–H target set for `C`, `N`, and `O`, with other parent + elements inferred from **Cordero covalent radii** through a fitted linear + policy. ## Quick example @@ -48,6 +73,8 @@ The default v0.1 behavior is intentionally simple and practical: 0.76 >>> ar.get_vdw_radius("O") 1.5 +>>> ar.get_xh_bond_length("N") +1.015 >>> lookup = ar.lookup_vdw_radius("Pm") >>> lookup.value 2.8972265395148358 @@ -58,16 +85,17 @@ The default v0.1 behavior is intentionally simple and practical: ``` `get_*` returns only the number. `lookup_*` returns a `LookupResult` that also -records where the value came from and whether a transfer model was involved. +records where the value came from and whether a transfer model or policy source +was involved. 
You can inspect the packaged quantity and dataset catalog directly: ```pycon >>> import atomref as ar >>> ar.list_quantities() -('covalent_radius', 'van_der_waals_radius', 'atomic_radius') ->>> ar.get_quantity_info("atomic_radius") -QuantityInfo(quantity='atomic_radius', domain='element', units='angstrom', description='Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data.') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') +>>> ar.get_quantity_info("xh_bond_length") +QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.') >>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")] ['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020'] ``` @@ -79,14 +107,14 @@ You can also load a packaged set directly: >>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013") >>> vdw.get("O") 1.5 ->>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016")) ->>> raw.get("Pm") -2.83 +>>> xh = ar.get_xh_set("csd_legacy_xh_cno") +>>> xh.get("C") +1.089 ``` ## Notebook walkthroughs -The repository ships example notebooks for the main v0.1 workflows. In the +The repository ships example notebooks for the main `0.1.x` workflows. In the documentation they are also available as rendered Markdown pages, so users can read them without opening Jupyter first. @@ -100,7 +128,7 @@ read them without opening Jupyter first. `atomref` is designed as a standalone package, but within Delone Commons it is primarily intended to support chemistry-aware packages such as: -- `molcryst`, for covalent-bond detection and contact analysis, +- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows, - future `chemvoro`, for chemistry-aware contact and hydrogen workflows. By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical @@ -115,8 +143,9 @@ The repository also ships small maintenance tools. The most important ones are: - `python tools/check_notebooks.py` — execute notebook code cells, - `python tools/export_notebooks.py` — turn notebooks into Markdown pages for the docs, -- `python tools/gen_readme.py` — regenerate `README.md` from this page. -- `python tools/release_check.py` — run the full release-preparation checklist, including linting, tests, docs, builds, and artifact validation. +- `python tools/gen_readme.py` — regenerate `README.md` from this page, +- `python tools/release_check.py` — run the full release-preparation checklist, + including linting, tests, docs, builds, and artifact validation. See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md) for a short description of each script. 
diff --git a/docs/notebooks/01-quickstart.md b/docs/notebooks/01-quickstart.md index 3a9f22b..475e218 100644 --- a/docs/notebooks/01-quickstart.md +++ b/docs/notebooks/01-quickstart.md @@ -15,7 +15,7 @@ print(ar.list_quantities()) **Output** ```text Element(z=17, symbol='Cl', name='Chlorine') -('covalent_radius', 'van_der_waals_radius', 'atomic_radius') +('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length') ``` ```python r_c = ar.get_covalent_radius('C') diff --git a/mkdocs.yml b/mkdocs.yml index 2658174..e0952f2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -29,6 +29,7 @@ nav: - Covalent radius: datasets/covalent_radius.md - van der Waals radius: datasets/van_der_waals_radius.md - Atomic radius: datasets/atomic_radius.md + - X–H bond length: datasets/xh_bond_length.md - Notebooks: - Overview: guide/notebooks.md - Quickstart notebook: notebooks/01-quickstart.md @@ -46,3 +47,4 @@ nav: - atomref.transfer: api/transfer.md - atomref.policy: api/policy.md - atomref.radii: api/radii.md + - atomref.xh: api/xh.md diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py index 3dc1f76..ae73625 100644 --- a/src/atomref/__about__.py +++ b/src/atomref/__about__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.1.3" diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py index 6104c9d..fb569b3 100644 --- a/src/atomref/__init__.py +++ b/src/atomref/__init__.py @@ -8,7 +8,7 @@ iter_elements, is_valid_element_symbol, ) -from .policy import LookupResult, ValuePolicy +from .policy import LookupResult, ValuePolicy, get_value, lookup_value from .radii import ( DEFAULT_COVALENT_POLICY, DEFAULT_VDW_POLICY, @@ -25,6 +25,16 @@ lookup_covalent_radius, lookup_vdw_radius, ) +from .xh import ( + DEFAULT_XH_POLICY, + XHPolicy, + get_xh_bond_length, + get_xh_set, + get_xh_set_info, + list_xh_set_infos, + list_xh_sets, + lookup_xh_bond_length, +) from .registry import ( CoverageInfo, DatasetInfo, @@ -65,6 +75,8 @@ "SubstitutionTransfer", "LookupResult", "ValuePolicy", + "lookup_value", + "get_value", "RadiiPolicy", "RadiiElementAssessment", "RadiiPolicyAssessment", @@ -79,4 +91,12 @@ "lookup_vdw_radius", "get_vdw_radius", "assess_radii_policy", + "XHPolicy", + "DEFAULT_XH_POLICY", + "list_xh_sets", + "list_xh_set_infos", + "get_xh_set", + "get_xh_set_info", + "lookup_xh_bond_length", + "get_xh_bond_length", ] diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json index a722e1c..e6e4469 100644 --- a/src/atomref/data/registry.json +++ b/src/atomref/data/registry.json @@ -23,6 +23,11 @@ "domain": "element", "units": "angstrom", "description": "Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data." + }, + "xh_bond_length": { + "domain": "element", + "units": "angstrom", + "description": "Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows." } }, "datasets": { @@ -437,6 +442,65 @@ ], "usage_role": "support" } + }, + "xh_bond_length": { + "csd_legacy_xh_cno": { + "name": "CSD legacy X-H neutron-normalisation targets (C/N/O)", + "description": "Fixed C-H, N-H, and O-H target bond lengths used by ConQuest for hydrogen-position normalisation.", + "semantic_class": "xh_neutron_normalisation", + "origin_class": "compiled_experimental", + "phase_context": "condensed_phase", + "method_summary": "Sparse parent-element target set for hydrogen normalisation. 
ConQuest moves H along the experimentally determined X-H vector to these neutron-derived distances.", + "storage": { + "format": "dense_by_z_csv", + "filename": "xh_bond_length.csv", + "column": "csd_legacy_xh_cno" + }, + "coverage": { + "n_values": 3, + "z_min": 6, + "z_max": 8, + "has_placeholders": false, + "covered_z": [ + 6, + 7, + 8 + ], + "missing_z": [ + 1, + 2, + 3, + 4, + 5 + ] + }, + "placeholder_value": null, + "extraction_source": "ConQuest User Guide and Tutorials, section 'Hydrogen Atom Location in Crystal Structure Analyses'.", + "aliases": [ + "CSD X-H normalisation defaults", + "ConQuest X-H normalisation", + "CSD legacy X-H" + ], + "references": [ + { + "publisher": "Cambridge Crystallographic Data Centre (CCDC)", + "title": "ConQuest User Guide and Tutorials", + "url": "https://www.ccdc.cam.ac.uk/media/Documentation/C82017ED-FAE4-4D93-BA5A-8D841F1E4314/ConQuest-UserGuide_2020_1.pdf", + "note": "Hydrogen Atom Location in Crystal Structure Analyses; ConQuest normalises terminal C-H, N-H, and O-H distances to 1.089 Å, 1.015 Å, and 0.993 Å, respectively." + }, + { + "authors": "F. H. Allen; I. J. Bruno", + "title": "Bond lengths in organic and metal-organic compounds revisited: X-H bond lengths from neutron diffraction data", + "venue": "Acta Cryst. B66 (2010) 380-386" + } + ], + "notes": [ + "Sparse provisional target set for parent elements C, N, and O only.", + "In atomref v0.1.x this dataset seeds transfer-based inference for other parent elements rather than claiming direct curated coverage beyond C/N/O.", + "Fuller X-H dataset and policy support is planned for atomref 0.2.x." + ], + "usage_role": "target" + } } } } diff --git a/src/atomref/data/xh_bond_length.csv b/src/atomref/data/xh_bond_length.csv new file mode 100644 index 0000000..4ae4bca --- /dev/null +++ b/src/atomref/data/xh_bond_length.csv @@ -0,0 +1,119 @@ +z,csd_legacy_xh_cno +1, +2, +3, +4, +5, +6,1.089 +7,1.015 +8,0.993 +9, +10, +11, +12, +13, +14, +15, +16, +17, +18, +19, +20, +21, +22, +23, +24, +25, +26, +27, +28, +29, +30, +31, +32, +33, +34, +35, +36, +37, +38, +39, +40, +41, +42, +43, +44, +45, +46, +47, +48, +49, +50, +51, +52, +53, +54, +55, +56, +57, +58, +59, +60, +61, +62, +63, +64, +65, +66, +67, +68, +69, +70, +71, +72, +73, +74, +75, +76, +77, +78, +79, +80, +81, +82, +83, +84, +85, +86, +87, +88, +89, +90, +91, +92, +93, +94, +95, +96, +97, +98, +99, +100, +101, +102, +103, +104, +105, +106, +107, +108, +109, +110, +111, +112, +113, +114, +115, +116, +117, +118, diff --git a/src/atomref/policy.py b/src/atomref/policy.py index 36741fe..235bbe3 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -6,9 +6,14 @@ from dataclasses import dataclass, field from functools import lru_cache import math +from types import MappingProxyType from typing import Generic, Literal, TypeVar -from .elements import canonicalize_element_symbol, is_valid_element_symbol +from .elements import ( + canonicalize_element_symbol, + is_valid_element_symbol, + iter_elements, +) from .errors import PolicyError from .registry import ( DatasetLike, @@ -18,7 +23,13 @@ get_builtin_set, resolve_dataset_like, ) -from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel +from .transfer import ( + LinearFit, + LinearTransfer, + SubstitutionTransfer, + SupportsValuePolicy, + TransferModel, +) K = TypeVar("K") @@ -31,6 +42,8 @@ "missing", ] +_ACTIVE_POLICY_IDS: list[int] = [] + @dataclass(frozen=True, slots=True) class LookupResult: @@ -58,12 +71,114 @@ def __float__(self) -> float: 
@dataclass(frozen=True, slots=True) class ValuePolicy(Generic[K]): - """Ordered rule set for resolving element-domain scalar values.""" + """Ordered rule set for resolving element-domain scalar values. + + The v0.1 runtime resolves only element-domain policies even though the + metadata layer already records a more general ``domain`` concept. During + construction, element-domain override keys are normalized to canonical + element symbols and validated as finite floats. + """ base: DatasetLike transfers: tuple[TransferModel, ...] = () overrides: Mapping[K, float] = field(default_factory=dict) fallback: float | None = None + blocked: tuple[str, ...] = () + + def __post_init__(self) -> None: + """Validate and normalize policy configuration eagerly.""" + + if self.fallback is not None: + object.__setattr__( + self, + "fallback", + _coerce_policy_float(self.fallback, what="policy fallback"), + ) + + base_set = resolve_dataset_like(self.base) + if base_set.info.domain != "element": + return + + normalized_blocked: list[str] = [] + seen_blocked: set[str] = set() + for key in self.blocked: + if not isinstance(key, str): + raise PolicyError( + "element-domain blocked keys must be element-symbol strings" + ) + sym = _normalize_element_symbol(key) + if sym is None: + raise PolicyError(f"invalid blocked element symbol: {key!r}") + if sym not in seen_blocked: + normalized_blocked.append(sym) + seen_blocked.add(sym) + object.__setattr__(self, "blocked", tuple(normalized_blocked)) + + normalized_overrides: dict[str, float] = {} + seen_original_keys: dict[str, str] = {} + for key, value in self.overrides.items(): + if not isinstance(key, str): + raise PolicyError( + "element-domain policy overrides must be keyed by element symbols" + ) + sym = _normalize_element_symbol(key) + if sym is None: + raise PolicyError(f"invalid override element symbol: {key!r}") + if sym in seen_blocked: + raise PolicyError(f"override key {key!r} is blocked in this policy") + previous = seen_original_keys.get(sym) + if previous is not None and previous != key: + raise PolicyError( + f"override keys {previous!r} and {key!r} both normalize to {sym!r}" + ) + seen_original_keys[sym] = key + normalized_overrides[sym] = _coerce_policy_float( + value, + what=f"override value for {key!r}", + ) + + object.__setattr__( + self, + "overrides", + MappingProxyType(normalized_overrides), + ) + + +@dataclass(frozen=True, slots=True) +class _ResolvedElementSource: + """Internal representation of an element-domain transfer source.""" + + ref: DatasetRef + values_by_z: tuple[float | None, ...] + placeholder_by_z: tuple[bool, ...] + via_policy: bool = False + + +@dataclass(frozen=True, slots=True) +class _TransferSourceValue: + """Internal representation of one value obtained from a transfer source.""" + + value: float + ref: DatasetRef + resolved_from: tuple[DatasetRef, ...] + is_placeholder: bool + via_policy: bool = False + lookup_source: LookupSource | None = None + notes: tuple[str, ...] 
= () + + + +def _coerce_policy_float(value: object, *, what: str) -> float: + """Return a finite float for policy configuration values.""" + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + return out + def _normalize_element_symbol(symbol: str | None) -> str | None: @@ -82,35 +197,139 @@ def _normalize_element_symbol(symbol: str | None) -> str | None: return cand + def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: """Return the target dataset reference implied by a policy base.""" return resolve_dataset_like(policy.base).ref + +def _coerce_nested_policy(source: object) -> ValuePolicy[str] | None: + """Return ``source`` as a generic value policy when possible.""" + + if isinstance(source, ValuePolicy): + return source + if isinstance(source, SupportsValuePolicy): + nested = source.as_value_policy() + if not isinstance(nested, ValuePolicy): + raise PolicyError("policy-like transfer sources must return ValuePolicy") + return nested + return None + + + +def _materialize_transfer_source( + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], +) -> _ResolvedElementSource: + """Materialize any element-domain transfer source into dense by-Z arrays.""" + + nested_policy = _coerce_nested_policy(source) + if nested_policy is None: + dataset = resolve_dataset_like(source) + placeholders = tuple( + False + if value is None + else _is_placeholder_value(dataset.info, float(value)) + for value in dataset.values_by_z + ) + return _ResolvedElementSource( + ref=dataset.ref, + values_by_z=dataset.values_by_z, + placeholder_by_z=placeholders, + via_policy=False, + ) + + target = _resolve_target_ref(nested_policy) + n_z = max(elem.z for elem in iter_elements()) + values: list[float | None] = [None] * (n_z + 1) + placeholders: list[bool] = [False] * (n_z + 1) + for elem in iter_elements(): + lookup = lookup_value(elem.symbol, policy=nested_policy) + values[elem.z] = lookup.value + placeholders[elem.z] = lookup.is_placeholder if lookup.value is not None else False + return _ResolvedElementSource( + ref=target, + values_by_z=tuple(values), + placeholder_by_z=tuple(placeholders), + via_policy=True, + ) + + + +def _lookup_transfer_source_value( + symbol: str, + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], +) -> tuple[_TransferSourceValue | None, str | None]: + """Resolve one element value from a transfer source or nested policy.""" + + nested_policy = _coerce_nested_policy(source) + if nested_policy is None: + source_set = resolve_dataset_like(source) + value = source_set.get(symbol) + if value is None: + return None, f"no value in {source_set.ref.set_id}" + value_f = float(value) + return ( + _TransferSourceValue( + value=value_f, + ref=source_set.ref, + resolved_from=(source_set.ref,), + is_placeholder=_is_placeholder_value(source_set.info, value_f), + via_policy=False, + lookup_source="base", + notes=(), + ), + None, + ) + + lookup = lookup_value(symbol, policy=nested_policy) + if lookup.value is None: + if lookup.notes: + return ( + None, + "policy source returned no value: " + "; ".join(lookup.notes), + ) + return None, "policy source returned no value" + + return ( + _TransferSourceValue( + value=float(lookup.value), + ref=_resolve_target_ref(nested_policy), + resolved_from=lookup.resolved_from, + is_placeholder=lookup.is_placeholder, + via_policy=True, + lookup_source=lookup.source, + 
notes=lookup.notes, + ), + None, + ) + + + def _fit_linear_transfer( base_set: ElementScalarSet, - predictor_set: ElementScalarSet, + predictor_source: _ResolvedElementSource, *, min_points: int, exclude_placeholders: bool, ) -> LinearFit: - """Fit a one-predictor linear transfer model between two datasets.""" + """Fit a one-predictor linear transfer model between two sources.""" xs: list[float] = [] ys: list[float] = [] - n_z = min(len(base_set.values_by_z), len(predictor_set.values_by_z)) + n_z = min(len(base_set.values_by_z), len(predictor_source.values_by_z)) for z in range(1, n_z): y = base_set.values_by_z[z] - x = predictor_set.values_by_z[z] + x = predictor_source.values_by_z[z] if y is None or x is None: continue y_f = float(y) x_f = float(x) if exclude_placeholders and ( _is_placeholder_value(base_set.info, y_f) - or _is_placeholder_value(predictor_set.info, x_f) + or predictor_source.placeholder_by_z[z] ): continue xs.append(x_f) @@ -156,19 +375,20 @@ def _fit_linear_transfer_cached( return _fit_linear_transfer( get_builtin_set(base_ref), - get_builtin_set(predictor_ref), + _materialize_transfer_source(predictor_ref), min_points=min_points, exclude_placeholders=exclude_placeholders, ) + def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None: """Return the fit object for a transfer model when it needs one.""" if not isinstance(transfer, LinearTransfer): return None if len(transfer.predictors) != 1: - raise PolicyError("v0.1 LinearTransfer supports exactly one predictor dataset") + raise PolicyError("v0.1 LinearTransfer supports exactly one predictor source") predictor = transfer.predictors[0] if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef): @@ -180,38 +400,50 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit ) return _fit_linear_transfer( resolve_dataset_like(base), - resolve_dataset_like(predictor), + _materialize_transfer_source(predictor), min_points=transfer.min_points, exclude_placeholders=transfer.exclude_placeholders, ) + def _apply_substitution_transfer( symbol: str, *, target: DatasetRef, transfer: SubstitutionTransfer, ) -> tuple[LookupResult | None, str | None]: - """Try to resolve ``symbol`` by direct substitution from another dataset.""" - - source_set = resolve_dataset_like(transfer.source) - value = source_set.get(symbol) - if value is None: - return None, f"no substitution value in {source_set.ref.set_id}" - value_f = float(value) + """Try to resolve ``symbol`` by direct substitution from another source.""" + + source_value, note = _lookup_transfer_source_value(symbol, transfer.source) + if source_value is None: + return None, note + + notes = [ + "missing in base set; substituted from policy source" + if source_value.via_policy + else "missing in base set; substituted from transfer source" + ] + if source_value.via_policy and source_value.lookup_source not in (None, "base"): + notes.append( + f"policy source resolved the value via {source_value.lookup_source}" + ) + if source_value.is_placeholder: + notes.append("transfer source value is marked as a placeholder") return ( LookupResult( - value=value_f, + value=source_value.value, source="transfer_substitution", target=target, - resolved_from=(source_set.ref,), - is_placeholder=_is_placeholder_value(source_set.info, value_f), - notes=("missing in base set; substituted from transfer source",), + resolved_from=source_value.resolved_from, + is_placeholder=source_value.is_placeholder, + notes=tuple(notes), ), None, ) + def 
_apply_linear_transfer( symbol: str, *, @@ -222,105 +454,154 @@ def _apply_linear_transfer( """Try to resolve ``symbol`` through linear transfer from predictor data.""" if len(transfer.predictors) != 1: - raise PolicyError("v0.1 LinearTransfer supports exactly one predictor dataset") + raise PolicyError("v0.1 LinearTransfer supports exactly one predictor source") - predictor_set = resolve_dataset_like(transfer.predictors[0]) - predictor_value = predictor_set.get(symbol) + predictor_value, note = _lookup_transfer_source_value(symbol, transfer.predictors[0]) if predictor_value is None: - return None, f"no predictor value in {predictor_set.ref.set_id}" - predictor_f = float(predictor_value) + return None, note - if transfer.exclude_placeholders and _is_placeholder_value( - predictor_set.info, - predictor_f, - ): - return None, f"predictor value in {predictor_set.ref.set_id} is a placeholder" + if transfer.exclude_placeholders and predictor_value.is_placeholder: + if predictor_value.via_policy: + return None, "predictor value from policy source is a placeholder" + return None, f"predictor value in {predictor_value.ref.set_id} is a placeholder" fit = _fit_transfer_model(base, transfer) if fit is None: return None, "no fit available for linear transfer" - predicted = fit.coefficients[0] * predictor_f + fit.intercept + predicted = fit.coefficients[0] * predictor_value.value + fit.intercept + + notes = ["missing in base set; inferred via linear transfer"] + if predictor_value.via_policy: + notes.append("predictor value supplied by policy source") + notes.append("linear fit used policy-materialized predictor values") + if predictor_value.lookup_source not in (None, "base"): + notes.append( + f"policy predictor resolved the value via {predictor_value.lookup_source}" + ) + return ( LookupResult( value=float(predicted), source="transfer_linear", target=target, - resolved_from=(predictor_set.ref,), + resolved_from=predictor_value.resolved_from, is_placeholder=False, fit=fit, - notes=("missing in base set; inferred via linear transfer",), + notes=tuple(notes), ), None, ) + def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: """Resolve a value through override, base, transfer, and fallback steps.""" - target = _resolve_target_ref(policy) - base_set = resolve_dataset_like(policy.base) - if base_set.info.domain != "element": - raise PolicyError("v0.1 resolver supports only element-domain datasets") - - sym = _normalize_element_symbol(symbol) - if sym is None: - note = "unknown element" if symbol is not None else "missing element symbol" - return LookupResult(value=None, source="missing", target=target, notes=(note,)) - - if sym in policy.overrides: - return LookupResult( - value=float(policy.overrides[sym]), - source="override", - target=target, - notes=("value supplied by policy override",), - ) - - base_value = base_set.get(sym) - if base_value is not None: - base_f = float(base_value) - return LookupResult( - value=base_f, - source="base", - target=target, - resolved_from=(base_set.ref,), - is_placeholder=_is_placeholder_value(base_set.info, base_f), - notes=(), - ) + policy_id = id(policy) + if policy_id in _ACTIVE_POLICY_IDS: + raise PolicyError("cyclic policy resolution detected") + + _ACTIVE_POLICY_IDS.append(policy_id) + try: + target = _resolve_target_ref(policy) + base_set = resolve_dataset_like(policy.base) + if base_set.info.domain != "element": + raise PolicyError("v0.1 resolver supports only element-domain datasets") + + sym = _normalize_element_symbol(symbol) 
+ if sym is None: + note = "unknown element" if symbol is not None else "missing element symbol" + return LookupResult(value=None, source="missing", target=target, notes=(note,)) + + if sym in policy.blocked: + return LookupResult( + value=None, + source="missing", + target=target, + notes=(f"{sym} is blocked by this policy",), + ) - transfer_notes: list[str] = ["missing in base set"] - for transfer in policy.transfers: - if isinstance(transfer, SubstitutionTransfer): - result, note = _apply_substitution_transfer( - sym, + if sym in policy.overrides: + return LookupResult( + value=float(policy.overrides[sym]), + source="override", target=target, - transfer=transfer, + notes=("value supplied by policy override",), + ) + + base_value = base_set.get(sym) + if base_value is not None: + base_f = float(base_value) + is_placeholder = _is_placeholder_value(base_set.info, base_f) + notes = ( + ("base dataset value is marked as a placeholder",) + if is_placeholder + else () ) - elif isinstance(transfer, LinearTransfer): - result, note = _apply_linear_transfer( - sym, - base=policy.base, + return LookupResult( + value=base_f, + source="base", target=target, - transfer=transfer, + resolved_from=(base_set.ref,), + is_placeholder=is_placeholder, + notes=notes, ) - else: # pragma: no cover - closed union today - raise PolicyError(f"unsupported transfer model: {type(transfer)!r}") - if result is not None: - return result - if note: - transfer_notes.append(note) + transfer_notes: list[str] = ["missing in base set"] + for transfer in policy.transfers: + if isinstance(transfer, SubstitutionTransfer): + result, note = _apply_substitution_transfer( + sym, + target=target, + transfer=transfer, + ) + elif isinstance(transfer, LinearTransfer): + result, note = _apply_linear_transfer( + sym, + base=policy.base, + target=target, + transfer=transfer, + ) + else: # pragma: no cover - closed union today + raise PolicyError(f"unsupported transfer model: {type(transfer)!r}") + + if result is not None: + return result + if note: + transfer_notes.append(note) + + if policy.fallback is not None: + return LookupResult( + value=float(policy.fallback), + source="fallback", + target=target, + notes=tuple(transfer_notes + ["using fallback value"]), + ) - if policy.fallback is not None: return LookupResult( - value=float(policy.fallback), - source="fallback", + value=None, + source="missing", target=target, - notes=tuple(transfer_notes + ["using fallback value"]), + notes=tuple(transfer_notes), ) + finally: + popped = _ACTIVE_POLICY_IDS.pop() + assert popped == policy_id # internal stack discipline - return LookupResult( - value=None, - source="missing", - target=target, - notes=tuple(transfer_notes), - ) + + +def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: + """Public entry point for generic element-domain scalar lookup. + + This is the same resolver used internally by the radii convenience layer. + In v0.1 the runtime supports only element-domain policies. 
+ """ + + return _resolve_value(symbol, policy=policy) + + + +def get_value(symbol: str | None, *, policy: ValuePolicy[str]) -> float | None: + """Return only the resolved scalar value for an element-domain policy.""" + + return lookup_value(symbol, policy=policy).value diff --git a/src/atomref/radii.py b/src/atomref/radii.py index 01f13f4..de7ff36 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -4,11 +4,18 @@ from collections.abc import Iterable, Mapping from dataclasses import dataclass, field +import math from typing import Literal from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol from .errors import PolicyError -from .policy import LookupResult, ValuePolicy, _fit_transfer_model, _resolve_value +from .policy import ( + LookupResult, + ValuePolicy, + _fit_transfer_model, + get_value, + lookup_value, +) from .registry import ( DatasetInfo, DatasetRef, @@ -59,18 +66,27 @@ def as_value_policy(self) -> ValuePolicy[str]: else: base = DatasetRef(quantity, self.base_set) - normalized_overrides: dict[str, float] = {} - for key, value in self.overrides.items(): - sym = _normalize_radii_symbol(key) - if sym is None or not is_valid_element_symbol(sym): - raise PolicyError(f"invalid override element symbol: {key!r}") - normalized_overrides[sym] = float(value) + checked_overrides = { + key: _coerce_non_negative_radii_value( + value, + what=f"radii override value for {key!r}", + ) + for key, value in self.overrides.items() + } + checked_fallback = ( + None + if self.fallback is None + else _coerce_non_negative_radii_value( + self.fallback, + what="radii fallback", + ) + ) return ValuePolicy( base=base, transfers=self.transfers, - overrides=normalized_overrides, - fallback=self.fallback, + overrides=checked_overrides, + fallback=checked_fallback, ) @@ -107,6 +123,26 @@ class RadiiPolicyAssessment: per_element: tuple[RadiiElementAssessment, ...] = () + +def _coerce_non_negative_radii_value(value: object, *, what: str) -> float: + """Validate a radii-like policy number. + + The generic :class:`atomref.policy.ValuePolicy` accepts any finite scalar. + Radii-specific convenience helpers are stricter and reject negative values. 
+ """ + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + if out < 0: + raise PolicyError(f"{what} must be non-negative") + return out + + + def _quantity_for_kind(kind: RadiiKind) -> str: """Translate public radii kind names into registry quantity ids.""" @@ -116,6 +152,7 @@ def _quantity_for_kind(kind: RadiiKind) -> str: raise PolicyError(f"unknown radii kind: {kind!r}") from exc + def _normalize_radii_symbol(symbol: str | None) -> str | None: """Normalize symbols accepted by the radii convenience layer.""" @@ -125,6 +162,7 @@ def _normalize_radii_symbol(symbol: str | None) -> str | None: return cand + def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: """Normalize, validate, deduplicate, and sort assessment element labels.""" @@ -141,6 +179,7 @@ def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: ) + def list_radii_sets( kind: RadiiKind, *, @@ -151,6 +190,7 @@ def list_radii_sets( return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) + def list_radii_set_infos( kind: RadiiKind, *, @@ -161,18 +201,21 @@ def list_radii_set_infos( return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role) + def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: """Return metadata for one packaged radii set.""" return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) + def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet: """Load one packaged radii set as an :class:`ElementScalarSet`.""" return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id)) + def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: """Raise when a policy is used with the wrong public radii helper.""" @@ -180,10 +223,12 @@ def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") + def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: """Shared implementation for radii lookup helpers.""" - return _resolve_value(symbol, policy=policy.as_value_policy()) + return lookup_value(symbol, policy=policy.as_value_policy()) + def lookup_covalent_radius( @@ -198,6 +243,7 @@ def lookup_covalent_radius( return _lookup_radius(symbol, policy=active) + def get_covalent_radius( symbol: str | None, *, @@ -205,7 +251,10 @@ def get_covalent_radius( ) -> float | None: """Return only the selected covalent-radius value, without provenance.""" - return lookup_covalent_radius(symbol, policy=policy).value + active = DEFAULT_COVALENT_POLICY if policy is None else policy + _validate_policy_kind(active, expected="covalent") + return get_value(symbol, policy=active.as_value_policy()) + def lookup_vdw_radius( @@ -220,14 +269,18 @@ def lookup_vdw_radius( return _lookup_radius(symbol, policy=active) + def get_vdw_radius( symbol: str | None, *, policy: RadiiPolicy | None = None, ) -> float | None: - """Return only the selected van der Waals radius, without provenance.""" + """Return only the selected van der Waals-radius value, without provenance.""" + + active = DEFAULT_VDW_POLICY if policy is None else policy + _validate_policy_kind(active, expected="van_der_waals") + return get_value(symbol, policy=active.as_value_policy()) - return lookup_vdw_radius(symbol, policy=policy).value def assess_radii_policy( @@ -254,7 +307,7 @@ 
def assess_radii_policy( per_element: list[RadiiElementAssessment] = [] for symbol in elems: - lookup = _resolve_value(symbol, policy=value_policy) + lookup = lookup_value(symbol, policy=value_policy) if lookup.source == "override": n_override += 1 elif lookup.source == "base": diff --git a/src/atomref/registry.py b/src/atomref/registry.py index 594e98e..b9f2730 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -8,6 +8,9 @@ from functools import lru_cache from importlib import resources import json +import math +from types import MappingProxyType +import unicodedata from .elements import canonicalize_element_symbol, get_element, iter_elements from .errors import DatasetError @@ -121,21 +124,44 @@ def from_mapping( n_z = max(e.z for e in iter_elements()) values_by_z: list[float | None] = [None] * (n_z + 1) + seen_keys: dict[str, str] = {} + + placeholder_f = ( + None + if placeholder_value is None + else _coerce_finite_float( + placeholder_value, + what=f"placeholder value for custom dataset {ref.set_id!r}", + ) + ) for key, value in values.items(): sym = _normalize_element_domain_symbol(key) elem = get_element(sym) if elem is None: raise DatasetError(f"invalid element symbol in custom set: {key!r}") - values_by_z[elem.z] = None if value is None else float(value) + previous = seen_keys.get(sym) + if previous is not None and previous != key: + raise DatasetError( + f"custom-set keys {previous!r} and {key!r} both normalize to {sym!r}" + ) + seen_keys[sym] = key + values_by_z[elem.z] = ( + None + if value is None + else _coerce_finite_float( + value, + what=f"value for element {key!r} in custom dataset {ref.set_id!r}", + ) + ) covered_z = tuple( z for z, value in enumerate(values_by_z) if z > 0 and value is not None ) has_placeholders = False - if placeholder_value is not None: + if placeholder_f is not None: has_placeholders = any( - value is not None and abs(value - placeholder_value) < 1e-12 + value is not None and abs(value - placeholder_f) < 1e-12 for value in values_by_z[1:] ) @@ -149,7 +175,7 @@ def from_mapping( semantic_class=semantic_class, origin_class=origin_class, phase_context=phase_context, - placeholder_value=placeholder_value, + placeholder_value=placeholder_f, aliases=(), references=tuple(references), notes=tuple(notes), @@ -178,6 +204,19 @@ def get(self, symbol: str | None) -> float | None: DatasetLike = DatasetRef | ElementScalarSet +_DASH_TRANSLATION = str.maketrans( + { + "‐": "-", + "‑": "-", + "‒": "-", + "–": "-", + "—": "-", + "―": "-", + "−": "-", + } +) + + def _normalize_element_domain_symbol(symbol: str | None) -> str | None: """Normalize element-domain symbols and fold D/T onto hydrogen.""" @@ -199,6 +238,35 @@ def _load_registry_json() -> dict[str, object]: return data +def _freeze_json_like(value: object) -> object: + """Recursively freeze JSON-like metadata structures. + + Registry metadata is cached globally. Returning raw dicts or lists from that + cache would let callers mutate shared package state through the metadata + objects returned by :func:`get_dataset_info`. 
+ """ + + if isinstance(value, dict): + frozen = {str(key): _freeze_json_like(item) for key, item in value.items()} + return MappingProxyType(frozen) + if isinstance(value, list): + return tuple(_freeze_json_like(item) for item in value) + return value + + +def _coerce_finite_float(value: object, *, what: str) -> float: + """Return ``value`` as a finite float or raise :class:`DatasetError`.""" + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise DatasetError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise DatasetError(f"{what} must be a finite float") + return out + + + def _get_quantities_mapping() -> Mapping[str, object]: """Return the raw ``quantities`` mapping from ``registry.json``.""" @@ -208,6 +276,7 @@ def _get_quantities_mapping() -> Mapping[str, object]: return quantities + def _get_datasets_mapping() -> Mapping[str, object]: """Return the raw ``datasets`` mapping from ``registry.json``.""" @@ -217,6 +286,7 @@ def _get_datasets_mapping() -> Mapping[str, object]: return datasets + def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: """Return the dataset table for one quantity or raise on unknown input.""" @@ -226,12 +296,14 @@ def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: return datasets + def list_quantities() -> tuple[str, ...]: """List packaged quantity identifiers in registry order.""" return tuple(_get_quantities_mapping().keys()) + def get_quantity_info(quantity: QuantityId) -> QuantityInfo: """Return quantity-level metadata for a packaged quantity.""" @@ -253,10 +325,14 @@ def get_quantity_info(quantity: QuantityId) -> QuantityInfo: ) + def _canonicalize_alias_token(value: str) -> str: """Normalize a dataset id or alias for case-insensitive comparison.""" - return " ".join(value.strip().lower().split()) + normalized = unicodedata.normalize("NFKC", value) + normalized = normalized.translate(_DASH_TRANSLATION) + return " ".join(normalized.strip().lower().split()) + def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: @@ -282,6 +358,7 @@ def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: raise DatasetError(f"unknown dataset id for {quantity!r}: {set_id!r}") + def list_dataset_ids( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[str, ...]: @@ -305,6 +382,7 @@ def list_dataset_ids( return tuple(filtered) + def list_dataset_infos( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[DatasetInfo, ...]: @@ -316,6 +394,7 @@ def list_dataset_infos( ) + def _coerce_reference(obj: object) -> Reference: """Coerce a raw registry reference entry into :class:`Reference`.""" @@ -335,6 +414,7 @@ def _coerce_reference(obj: object) -> Reference: ) + def _coerce_coverage(obj: object) -> CoverageInfo | None: """Coerce raw coverage metadata into :class:`CoverageInfo`.""" @@ -354,6 +434,7 @@ def _coerce_coverage(obj: object) -> CoverageInfo | None: ) + def get_dataset_info(ref: DatasetRef) -> DatasetInfo: """Return curated metadata for a packaged dataset reference.""" @@ -401,7 +482,9 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: else () ) storage = ( - raw_entry.get("storage") if isinstance(raw_entry.get("storage"), dict) else None + _freeze_json_like(raw_entry.get("storage")) + if isinstance(raw_entry.get("storage"), dict) + else None ) return DatasetInfo( @@ -444,7 +527,10 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: else None ), placeholder_value=( - float(raw_entry["placeholder_value"]) + _coerce_finite_float( + 
raw_entry["placeholder_value"], + what=f"placeholder value for packaged dataset {actual_ref!r}", + ) if raw_entry.get("placeholder_value") is not None else None ), @@ -456,7 +542,7 @@ def get_dataset_info(ref: DatasetRef) -> DatasetInfo: aliases=aliases, references=references, notes=notes, - storage=storage, + storage=storage if isinstance(storage, Mapping) else None, coverage=_coerce_coverage(raw_entry.get("coverage")), ) @@ -483,7 +569,14 @@ def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]: values[name][z] = None continue raw = raw.strip() - values[name][z] = float(raw) if raw else None + values[name][z] = ( + _coerce_finite_float( + raw, + what=f"value in {filename!r} column {name!r} for Z={z}", + ) + if raw + else None + ) return {name: tuple(vals) for name, vals in values.items()} @@ -511,6 +604,7 @@ def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: return ElementScalarSet(ref=info.ref, info=info, values_by_z=table[column]) + def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: """Resolve either a packaged reference or a custom set to a loaded set.""" @@ -519,6 +613,7 @@ def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: return get_builtin_set(dataset) + def _is_placeholder_value(info: DatasetInfo, value: float) -> bool: """Return ``True`` when ``value`` equals the dataset's placeholder value.""" diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py index 14362db..9e071db 100644 --- a/src/atomref/transfer.py +++ b/src/atomref/transfer.py @@ -3,9 +3,22 @@ from __future__ import annotations from dataclasses import dataclass +from typing import TYPE_CHECKING, Protocol, runtime_checkable +from .errors import PolicyError from .registry import DatasetLike +if TYPE_CHECKING: # pragma: no cover - typing only + from .policy import ValuePolicy + + +@runtime_checkable +class SupportsValuePolicy(Protocol): + """Protocol for wrapper objects that can expose a generic value policy.""" + + def as_value_policy(self) -> "ValuePolicy[str]": + """Return the generic element-domain value policy.""" + @dataclass(frozen=True, slots=True) class LinearFit: @@ -25,24 +38,32 @@ class LinearFit: @dataclass(frozen=True, slots=True) class SubstitutionTransfer: - """Use another dataset directly when the base dataset is missing a value.""" + """Use another dataset or policy directly when the base dataset is missing a value.""" - source: DatasetLike + source: DatasetLike | SupportsValuePolicy | ValuePolicy[str] @dataclass(frozen=True, slots=True) class LinearTransfer: - """Infer missing target values from one or more predictor datasets. + """Infer missing target values from one or more predictor datasets or policies. In v0.1 the public API stores predictors as a tuple for forward compatibility, but the runtime implementation intentionally accepts exactly - one predictor dataset. + one predictor source. """ - predictors: tuple[DatasetLike, ...] + predictors: tuple[DatasetLike | SupportsValuePolicy | ValuePolicy[str], ...] 
min_points: int = 2 exclude_placeholders: bool = True + def __post_init__(self) -> None: + """Validate obvious configuration errors eagerly.""" + + if not self.predictors: + raise PolicyError("LinearTransfer requires at least one predictor") + if self.min_points < 2: + raise PolicyError("LinearTransfer min_points must be at least 2") + TransferModel = SubstitutionTransfer | LinearTransfer """Closed union of transfer models supported by the core resolver.""" diff --git a/src/atomref/xh.py b/src/atomref/xh.py new file mode 100644 index 0000000..c2a87be --- /dev/null +++ b/src/atomref/xh.py @@ -0,0 +1,168 @@ +"""X-H bond-length helpers built on the generic policy core.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +import math + +from .elements import canonicalize_element_symbol, is_valid_element_symbol +from .errors import PolicyError +from .policy import LookupResult, ValuePolicy, get_value, lookup_value +from .registry import DatasetInfo, DatasetRef, ElementScalarSet, get_builtin_set, get_dataset_info, list_dataset_ids, list_dataset_infos +from .transfer import LinearTransfer, TransferModel + +XHSet = ElementScalarSet + +_QUANTITY = "xh_bond_length" + + +@dataclass(frozen=True, slots=True) +class XHPolicy: + """Policy wrapper specialized for parent-element X-H bond lengths. + + The quantity key is fixed to ``"xh_bond_length"`` and uses the parent + element ``X`` as the lookup key. ``H`` itself is not considered a valid + parent element for this quantity. + """ + + base_set: str | XHSet + transfers: tuple[TransferModel, ...] = () + overrides: Mapping[str, float] = field(default_factory=dict) + fallback: float | None = None + + def as_value_policy(self) -> ValuePolicy[str]: + """Convert the X-H policy into the generic scalar-value policy.""" + + if isinstance(self.base_set, ElementScalarSet): + if self.base_set.ref.quantity != _QUANTITY: + raise PolicyError( + f"base_set quantity {self.base_set.ref.quantity!r} is incompatible with X-H lookup" + ) + base = self.base_set + else: + base = DatasetRef(_QUANTITY, self.base_set) + + checked_overrides: dict[str, float] = {} + for key, value in self.overrides.items(): + sym = _normalize_xh_symbol(key) + if sym is None or not is_valid_element_symbol(sym): + raise PolicyError(f"invalid X-H parent element symbol: {key!r}") + if sym == "H": + raise PolicyError("H is not a valid parent element for xh_bond_length") + checked_overrides[key] = _coerce_non_negative_xh_value( + value, + what=f"X-H override value for {key!r}", + ) + + checked_fallback = ( + None + if self.fallback is None + else _coerce_non_negative_xh_value(self.fallback, what="X-H fallback") + ) + + return ValuePolicy( + base=base, + transfers=self.transfers, + overrides=checked_overrides, + fallback=checked_fallback, + blocked=("H",), + ) + + + +def _coerce_non_negative_xh_value(value: object, *, what: str) -> float: + """Validate an X-H-like policy number.""" + + try: + out = float(value) + except (TypeError, ValueError) as exc: + raise PolicyError(f"{what} must be a finite float") from exc + if not math.isfinite(out): + raise PolicyError(f"{what} must be a finite float") + if out < 0: + raise PolicyError(f"{what} must be non-negative") + return out + + + +def _normalize_xh_symbol(symbol: str | None) -> str | None: + """Normalize symbols accepted by the X-H convenience layer.""" + + cand = canonicalize_element_symbol(symbol) + if cand in {"D", "T"}: + cand = "H" + return cand + + + +def list_xh_sets(*, usage_role: 
str | None = None) -> tuple[str, ...]: + """List packaged X-H set ids.""" + + return list_dataset_ids(_QUANTITY, usage_role=usage_role) + + + +def list_xh_set_infos(*, usage_role: str | None = None) -> tuple[DatasetInfo, ...]: + """Return packaged metadata objects for X-H sets.""" + + return list_dataset_infos(_QUANTITY, usage_role=usage_role) + + + +def get_xh_set_info(set_id: str) -> DatasetInfo: + """Return metadata for one packaged X-H set.""" + + return get_dataset_info(DatasetRef(_QUANTITY, set_id)) + + + +def get_xh_set(set_id: str) -> XHSet: + """Load one packaged X-H set as an :class:`ElementScalarSet`.""" + + return get_builtin_set(DatasetRef(_QUANTITY, set_id)) + + + +def lookup_xh_bond_length( + symbol: str | None, + *, + policy: XHPolicy | None = None, +) -> LookupResult: + """Resolve a parent-element X-H bond length with provenance.""" + + active = DEFAULT_XH_POLICY if policy is None else policy + lookup = lookup_value(symbol, policy=active.as_value_policy()) + if lookup.value is None and _normalize_xh_symbol(symbol) == "H": + return LookupResult( + value=None, + source="missing", + target=lookup.target, + notes=("H is not a valid parent element for xh_bond_length",), + ) + return lookup + + + +def get_xh_bond_length( + symbol: str | None, + *, + policy: XHPolicy | None = None, +) -> float | None: + """Return only the selected X-H bond-length value, without provenance.""" + + active = DEFAULT_XH_POLICY if policy is None else policy + return get_value(symbol, policy=active.as_value_policy()) + + +DEFAULT_XH_POLICY = XHPolicy( + base_set="csd_legacy_xh_cno", + transfers=( + LinearTransfer( + predictors=(DatasetRef("covalent_radius", "cordero2008"),), + min_points=3, + exclude_placeholders=True, + ), + ), +) +"""Default X-H policy used by the convenience helpers.""" diff --git a/tests/meta/test_imports.py b/tests/meta/test_imports.py index 374996a..66210e7 100644 --- a/tests/meta/test_imports.py +++ b/tests/meta/test_imports.py @@ -10,6 +10,7 @@ 'atomref.transfer', 'atomref.policy', 'atomref.radii', + 'atomref.xh', ] diff --git a/tests/meta/test_package_data.py b/tests/meta/test_package_data.py index e5c393c..a9a7e61 100644 --- a/tests/meta/test_package_data.py +++ b/tests/meta/test_package_data.py @@ -11,6 +11,7 @@ def test_packaged_data_files_are_available() -> None: 'covalent.csv', 'van_der_waals.csv', 'registry.json', + 'xh_bond_length.csv', ): assert data_root.joinpath(name).is_file(), name @@ -20,6 +21,7 @@ def test_packaged_registry_keeps_atomic_support_classification() -> None: raw = json.loads(data_root.joinpath('registry.json').read_text(encoding='utf-8')) assert 'atomic_radius' in raw['datasets'] + assert 'xh_bond_length' in raw['datasets'] rahm = raw['datasets']['atomic_radius']['rahm2016'] assert rahm['usage_role'] == 'support' assert rahm['semantic_class'] == 'atomic_isodensity' diff --git a/tests/meta/test_public_api.py b/tests/meta/test_public_api.py index 8f191bf..f3583a1 100644 --- a/tests/meta/test_public_api.py +++ b/tests/meta/test_public_api.py @@ -21,6 +21,13 @@ 'lookup_covalent_radius', 'get_vdw_radius', 'lookup_vdw_radius', + 'XHPolicy', + 'DEFAULT_XH_POLICY', + 'get_xh_set', + 'get_xh_bond_length', + 'lookup_xh_bond_length', + 'list_xh_sets', + 'list_xh_set_infos', 'list_quantities', 'list_dataset_ids', 'list_dataset_infos', diff --git a/tests/meta/test_registry_integrity.py b/tests/meta/test_registry_integrity.py index 853df5c..a32b44c 100644 --- a/tests/meta/test_registry_integrity.py +++ b/tests/meta/test_registry_integrity.py @@ -4,22 +4,18 @@ from 
dataclasses import asdict import atomref as ar -from atomref.registry import get_builtin_set +from atomref.registry import _canonicalize_alias_token, get_builtin_set _ALLOWED_USAGE_ROLES = {"target", "support"} -def _canonical_token(value: str) -> str: - return " ".join(value.strip().lower().split()) - - def test_dataset_aliases_are_unique_within_each_quantity() -> None: for quantity in ar.list_quantities(): seen: dict[str, str] = {} for set_id in ar.list_dataset_ids(quantity): info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)) for token in (set_id, *info.aliases): - key = _canonical_token(token) + key = _canonicalize_alias_token(token) previous = seen.get(key) assert previous in (None, set_id) seen[key] = set_id diff --git a/tests/policy/test_policy.py b/tests/policy/test_policy.py new file mode 100644 index 0000000..3b38717 --- /dev/null +++ b/tests/policy/test_policy.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import pytest + +import atomref as ar +from atomref.errors import PolicyError + + +def test_lookup_value_is_public_generic_entry_point() -> None: + policy = ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + overrides={'d': 0.5}, + ) + lookup = ar.lookup_value('H', policy=policy) + assert lookup.source == 'override' + assert lookup.value == pytest.approx(0.5) + + +def test_get_value_returns_only_scalar() -> None: + policy = ar.ValuePolicy(base=ar.DatasetRef('covalent_radius', 'cordero2008')) + assert ar.get_value('C', policy=policy) == pytest.approx(0.76) + + +def test_value_policy_rejects_normalized_override_collisions() -> None: + with pytest.raises(PolicyError): + ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + overrides={'H': 0.31, 'D': 0.4}, + ) + + +def test_value_policy_rejects_non_finite_fallback() -> None: + with pytest.raises(PolicyError): + ar.ValuePolicy( + base=ar.DatasetRef('covalent_radius', 'cordero2008'), + fallback=float('nan'), + ) + + +def test_substitution_transfer_accepts_policy_source() -> None: + custom = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo_user_cov'), + values={'C': 0.77}, + name='Demo covalent set', + units='angstrom', + ) + policy = ar.ValuePolicy( + base=custom, + transfers=(ar.SubstitutionTransfer(source=ar.DEFAULT_COVALENT_POLICY),), + ) + lookup = ar.lookup_value('Bk', policy=policy) + assert lookup.source == 'transfer_substitution' + assert lookup.value == pytest.approx(1.54) + assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),) + assert any('policy source' in note for note in lookup.notes) + + +def test_linear_transfer_accepts_policy_predictor() -> None: + predictor_policy = ar.ValuePolicy(base=ar.DatasetRef('atomic_radius', 'rahm2016')) + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='alvarez2013', + transfers=(ar.LinearTransfer(predictors=(predictor_policy,),),), + ) + lookup = ar.lookup_vdw_radius('Pm', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.value == pytest.approx(ar.lookup_vdw_radius('Pm').value) + assert lookup.fit is not None + assert any('policy source' in note for note in lookup.notes) diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py index e84a4f0..8977363 100644 --- a/tests/radii/test_selection.py +++ b/tests/radii/test_selection.py @@ -97,3 +97,53 @@ def test_linear_transfer_rejects_multiple_predictors_in_v0_1() -> None: ) with pytest.raises(PolicyError): ar.lookup_vdw_radius("Pm", policy=policy) + + +def 
test_base_placeholder_note_is_explicit() -> None: + policy = ar.RadiiPolicy(kind='covalent', base_set='csd_legacy_cov') + lookup = ar.lookup_covalent_radius('Es', policy=policy) + assert lookup.source == 'base' + assert lookup.is_placeholder is True + assert any('placeholder' in note for note in lookup.notes) + + +def test_substitution_placeholder_note_is_explicit() -> None: + lookup = ar.lookup_covalent_radius('Es') + assert lookup.source == 'transfer_substitution' + assert lookup.is_placeholder is True + assert any('placeholder' in note for note in lookup.notes) + + +def test_radii_policy_rejects_normalized_override_collisions() -> None: + policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + overrides={'H': 0.31, 'D': 0.4}, + ) + with pytest.raises(PolicyError): + ar.lookup_covalent_radius('H', policy=policy) + + +def test_radii_policy_rejects_non_finite_override() -> None: + policy = ar.RadiiPolicy( + kind='covalent', + base_set='cordero2008', + overrides={'C': float('nan')}, + ) + with pytest.raises(PolicyError): + ar.lookup_covalent_radius('C', policy=policy) + + +def test_radii_policy_rejects_negative_fallback() -> None: + policy = ar.RadiiPolicy( + kind='van_der_waals', + base_set='bondi1964', + fallback=-1.0, + ) + with pytest.raises(PolicyError): + ar.lookup_vdw_radius('Be', policy=policy) + + +def test_linear_transfer_validates_empty_predictors() -> None: + with pytest.raises(PolicyError): + ar.LinearTransfer(predictors=()) diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py index 48afbae..d497d9f 100644 --- a/tests/registry/test_registry.py +++ b/tests/registry/test_registry.py @@ -1,8 +1,12 @@ from __future__ import annotations from importlib import resources +from types import MappingProxyType + +import pytest import atomref as ar +from atomref.errors import DatasetError from atomref.registry import get_builtin_set @@ -34,7 +38,12 @@ def test_builtin_set_loading_works() -> None: def test_list_quantities_and_quantity_info() -> None: quantities = ar.list_quantities() - assert quantities == ('covalent_radius', 'van_der_waals_radius', 'atomic_radius') + assert quantities == ( + 'covalent_radius', + 'van_der_waals_radius', + 'atomic_radius', + 'xh_bond_length', + ) info = ar.get_quantity_info('atomic_radius') assert info.quantity == 'atomic_radius' @@ -93,3 +102,42 @@ def test_public_radii_set_helper_returns_packaged_radii_set() -> None: assert ds.info.ref.quantity == 'van_der_waals_radius' assert ds.info.ref.set_id == 'alvarez2013' assert ds.get('O') == 1.5 + + +def test_dataset_info_storage_is_frozen() -> None: + info = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert isinstance(info.storage, MappingProxyType) + assert info.storage['column'] == 'cordero2008' + with pytest.raises(TypeError): + info.storage['column'] = 'broken' + + fresh = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008')) + assert fresh.storage is not None + assert fresh.storage['column'] == 'cordero2008' + + +def test_dataset_alias_resolution_normalizes_dash_variants() -> None: + info = ar.get_dataset_info( + ar.DatasetRef('covalent_radius', 'Cordero-Alvarez covalent radii') + ) + assert info.ref.set_id == 'cordero2008' + + +def test_custom_set_rejects_normalized_key_collisions() -> None: + with pytest.raises(DatasetError): + ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo'), + values={'H': 0.31, 'D': 0.5}, + name='Demo', + units='angstrom', + ) + + +def 
test_custom_set_rejects_non_finite_values() -> None: + with pytest.raises(DatasetError): + ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo'), + values={'C': float('nan')}, + name='Demo', + units='angstrom', + ) diff --git a/tests/xh/test_xh.py b/tests/xh/test_xh.py new file mode 100644 index 0000000..3cffe15 --- /dev/null +++ b/tests/xh/test_xh.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import pytest + +import atomref as ar +from atomref.errors import PolicyError + + +def test_get_xh_bond_length_returns_curated_cno_values() -> None: + assert ar.get_xh_bond_length('C') == pytest.approx(1.089) + assert ar.get_xh_bond_length('N') == pytest.approx(1.015) + assert ar.get_xh_bond_length('O') == pytest.approx(0.993) + + +def test_lookup_xh_bond_length_infers_other_elements_from_cordero() -> None: + lookup = ar.lookup_xh_bond_length('S') + assert lookup.source == 'transfer_linear' + assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'cordero2008'),) + assert lookup.fit is not None + assert lookup.fit.n_points == 3 + assert lookup.value == pytest.approx(1.3587333333333333) + + +def test_lookup_xh_bond_length_rejects_h_as_parent_element() -> None: + lookup = ar.lookup_xh_bond_length('H') + assert lookup.value is None + assert lookup.source == 'missing' + assert any('not a valid parent element' in note for note in lookup.notes) + + +def test_list_xh_sets_and_metadata() -> None: + assert ar.list_xh_sets() == ('csd_legacy_xh_cno',) + info = ar.get_xh_set_info('csd_legacy_xh_cno') + assert info.ref.quantity == 'xh_bond_length' + assert info.usage_role == 'target' + assert info.coverage is not None + assert info.coverage.n_values == 3 + + +def test_xh_policy_rejects_h_override_key() -> None: + policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', overrides={'H': 1.0}) + with pytest.raises(PolicyError): + policy.as_value_policy() + + +def test_xh_policy_rejects_negative_fallback() -> None: + policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', fallback=-1.0) + with pytest.raises(PolicyError): + policy.as_value_policy() + + +def test_xh_policy_accepts_wrapper_policy_predictor() -> None: + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(ar.DEFAULT_COVALENT_POLICY,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('Bk', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.value == pytest.approx(1.8291333333333335) + assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),) + assert any('policy source' in note for note in lookup.notes) diff --git a/tools/check_registry.py b/tools/check_registry.py index 02b1e14..3af6025 100644 --- a/tools/check_registry.py +++ b/tools/check_registry.py @@ -28,7 +28,8 @@ def _get_builtin_set(ref): def _canonical_token(value: str) -> str: - return " ".join(value.strip().lower().split()) + registry = import_module("atomref.registry") + return registry._canonicalize_alias_token(value) def _iter_dataset_refs() -> Iterable[object]: From d593e5a5c479034a502c226e71ef45461a9335a1 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 19:55:46 +0300 Subject: [PATCH 13/15] Fixes formatting --- src/atomref/policy.py | 32 +++++++++++++++----------------- src/atomref/radii.py | 15 --------------- src/atomref/registry.py | 17 ++--------------- src/atomref/transfer.py | 5 ++++- src/atomref/xh.py | 22 ++++++++++++---------- 5 files changed, 33 insertions(+), 58 deletions(-) 
diff --git a/src/atomref/policy.py b/src/atomref/policy.py index 235bbe3..a8c8616 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -167,7 +167,6 @@ class _TransferSourceValue: notes: tuple[str, ...] = () - def _coerce_policy_float(value: object, *, what: str) -> float: """Return a finite float for policy configuration values.""" @@ -180,7 +179,6 @@ def _coerce_policy_float(value: object, *, what: str) -> float: return out - def _normalize_element_symbol(symbol: str | None) -> str | None: """Normalize user input to a packaged element symbol. @@ -197,14 +195,12 @@ def _normalize_element_symbol(symbol: str | None) -> str | None: return cand - def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: """Return the target dataset reference implied by a policy base.""" return resolve_dataset_like(policy.base).ref - def _coerce_nested_policy(source: object) -> ValuePolicy[str] | None: """Return ``source`` as a generic value policy when possible.""" @@ -218,7 +214,6 @@ def _coerce_nested_policy(source: object) -> ValuePolicy[str] | None: return None - def _materialize_transfer_source( source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], ) -> _ResolvedElementSource: @@ -247,7 +242,9 @@ def _materialize_transfer_source( for elem in iter_elements(): lookup = lookup_value(elem.symbol, policy=nested_policy) values[elem.z] = lookup.value - placeholders[elem.z] = lookup.is_placeholder if lookup.value is not None else False + placeholders[elem.z] = ( + lookup.is_placeholder if lookup.value is not None else False + ) return _ResolvedElementSource( ref=target, values_by_z=tuple(values), @@ -256,7 +253,6 @@ def _materialize_transfer_source( ) - def _lookup_transfer_source_value( symbol: str, source: DatasetLike | SupportsValuePolicy | ValuePolicy[str], @@ -306,7 +302,6 @@ def _lookup_transfer_source_value( ) - def _fit_linear_transfer( base_set: ElementScalarSet, predictor_source: _ResolvedElementSource, @@ -381,7 +376,6 @@ def _fit_linear_transfer_cached( ) - def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None: """Return the fit object for a transfer model when it needs one.""" @@ -406,7 +400,6 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit ) - def _apply_substitution_transfer( symbol: str, *, @@ -443,7 +436,6 @@ def _apply_substitution_transfer( ) - def _apply_linear_transfer( symbol: str, *, @@ -456,7 +448,10 @@ def _apply_linear_transfer( if len(transfer.predictors) != 1: raise PolicyError("v0.1 LinearTransfer supports exactly one predictor source") - predictor_value, note = _lookup_transfer_source_value(symbol, transfer.predictors[0]) + predictor_value, note = _lookup_transfer_source_value( + symbol, + transfer.predictors[0], + ) if predictor_value is None: return None, note @@ -476,7 +471,8 @@ def _apply_linear_transfer( notes.append("linear fit used policy-materialized predictor values") if predictor_value.lookup_source not in (None, "base"): notes.append( - f"policy predictor resolved the value via {predictor_value.lookup_source}" + "policy predictor resolved the value via " + f"{predictor_value.lookup_source}" ) return ( @@ -493,7 +489,6 @@ def _apply_linear_transfer( ) - def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: """Resolve a value through override, base, transfer, and fallback steps.""" @@ -511,7 +506,12 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes sym = _normalize_element_symbol(symbol) if sym is None: note = 
"unknown element" if symbol is not None else "missing element symbol" - return LookupResult(value=None, source="missing", target=target, notes=(note,)) + return LookupResult( + value=None, + source="missing", + target=target, + notes=(note,), + ) if sym in policy.blocked: return LookupResult( @@ -589,7 +589,6 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes assert popped == policy_id # internal stack discipline - def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: """Public entry point for generic element-domain scalar lookup. @@ -600,7 +599,6 @@ def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResul return _resolve_value(symbol, policy=policy) - def get_value(symbol: str | None, *, policy: ValuePolicy[str]) -> float | None: """Return only the resolved scalar value for an element-domain policy.""" diff --git a/src/atomref/radii.py b/src/atomref/radii.py index de7ff36..449de58 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -123,7 +123,6 @@ class RadiiPolicyAssessment: per_element: tuple[RadiiElementAssessment, ...] = () - def _coerce_non_negative_radii_value(value: object, *, what: str) -> float: """Validate a radii-like policy number. @@ -142,7 +141,6 @@ def _coerce_non_negative_radii_value(value: object, *, what: str) -> float: return out - def _quantity_for_kind(kind: RadiiKind) -> str: """Translate public radii kind names into registry quantity ids.""" @@ -152,7 +150,6 @@ def _quantity_for_kind(kind: RadiiKind) -> str: raise PolicyError(f"unknown radii kind: {kind!r}") from exc - def _normalize_radii_symbol(symbol: str | None) -> str | None: """Normalize symbols accepted by the radii convenience layer.""" @@ -162,7 +159,6 @@ def _normalize_radii_symbol(symbol: str | None) -> str | None: return cand - def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: """Normalize, validate, deduplicate, and sort assessment element labels.""" @@ -179,7 +175,6 @@ def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]: ) - def list_radii_sets( kind: RadiiKind, *, @@ -190,7 +185,6 @@ def list_radii_sets( return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role) - def list_radii_set_infos( kind: RadiiKind, *, @@ -201,21 +195,18 @@ def list_radii_set_infos( return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role) - def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo: """Return metadata for one packaged radii set.""" return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id)) - def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet: """Load one packaged radii set as an :class:`ElementScalarSet`.""" return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id)) - def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: """Raise when a policy is used with the wrong public radii helper.""" @@ -223,14 +214,12 @@ def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}") - def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: """Shared implementation for radii lookup helpers.""" return lookup_value(symbol, policy=policy.as_value_policy()) - def lookup_covalent_radius( symbol: str | None, *, @@ -243,7 +232,6 @@ def lookup_covalent_radius( return _lookup_radius(symbol, policy=active) - def get_covalent_radius( symbol: str | None, *, @@ -256,7 
+244,6 @@ def get_covalent_radius( return get_value(symbol, policy=active.as_value_policy()) - def lookup_vdw_radius( symbol: str | None, *, @@ -269,7 +256,6 @@ def lookup_vdw_radius( return _lookup_radius(symbol, policy=active) - def get_vdw_radius( symbol: str | None, *, @@ -282,7 +268,6 @@ def get_vdw_radius( return get_value(symbol, policy=active.as_value_policy()) - def assess_radii_policy( elements: Iterable[str], *, diff --git a/src/atomref/registry.py b/src/atomref/registry.py index b9f2730..479ff97 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -143,7 +143,8 @@ def from_mapping( previous = seen_keys.get(sym) if previous is not None and previous != key: raise DatasetError( - f"custom-set keys {previous!r} and {key!r} both normalize to {sym!r}" + "custom-set keys " + f"{previous!r} and {key!r} both normalize to {sym!r}" ) seen_keys[sym] = key values_by_z[elem.z] = ( @@ -266,7 +267,6 @@ def _coerce_finite_float(value: object, *, what: str) -> float: return out - def _get_quantities_mapping() -> Mapping[str, object]: """Return the raw ``quantities`` mapping from ``registry.json``.""" @@ -276,7 +276,6 @@ def _get_quantities_mapping() -> Mapping[str, object]: return quantities - def _get_datasets_mapping() -> Mapping[str, object]: """Return the raw ``datasets`` mapping from ``registry.json``.""" @@ -286,7 +285,6 @@ def _get_datasets_mapping() -> Mapping[str, object]: return datasets - def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: """Return the dataset table for one quantity or raise on unknown input.""" @@ -296,14 +294,12 @@ def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]: return datasets - def list_quantities() -> tuple[str, ...]: """List packaged quantity identifiers in registry order.""" return tuple(_get_quantities_mapping().keys()) - def get_quantity_info(quantity: QuantityId) -> QuantityInfo: """Return quantity-level metadata for a packaged quantity.""" @@ -325,7 +321,6 @@ def get_quantity_info(quantity: QuantityId) -> QuantityInfo: ) - def _canonicalize_alias_token(value: str) -> str: """Normalize a dataset id or alias for case-insensitive comparison.""" @@ -334,7 +329,6 @@ def _canonicalize_alias_token(value: str) -> str: return " ".join(normalized.strip().lower().split()) - def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: """Resolve a dataset id or alias to its canonical packaged set id.""" @@ -358,7 +352,6 @@ def _resolve_set_id(quantity: QuantityId, set_id: str) -> str: raise DatasetError(f"unknown dataset id for {quantity!r}: {set_id!r}") - def list_dataset_ids( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[str, ...]: @@ -382,7 +375,6 @@ def list_dataset_ids( return tuple(filtered) - def list_dataset_infos( quantity: QuantityId, *, usage_role: str | None = None ) -> tuple[DatasetInfo, ...]: @@ -394,7 +386,6 @@ def list_dataset_infos( ) - def _coerce_reference(obj: object) -> Reference: """Coerce a raw registry reference entry into :class:`Reference`.""" @@ -414,7 +405,6 @@ def _coerce_reference(obj: object) -> Reference: ) - def _coerce_coverage(obj: object) -> CoverageInfo | None: """Coerce raw coverage metadata into :class:`CoverageInfo`.""" @@ -434,7 +424,6 @@ def _coerce_coverage(obj: object) -> CoverageInfo | None: ) - def get_dataset_info(ref: DatasetRef) -> DatasetInfo: """Return curated metadata for a packaged dataset reference.""" @@ -604,7 +593,6 @@ def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: return ElementScalarSet(ref=info.ref, info=info, 
values_by_z=table[column]) - def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: """Resolve either a packaged reference or a custom set to a loaded set.""" @@ -613,7 +601,6 @@ def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet: return get_builtin_set(dataset) - def _is_placeholder_value(info: DatasetInfo, value: float) -> bool: """Return ``True`` when ``value`` equals the dataset's placeholder value.""" diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py index 9e071db..909d136 100644 --- a/src/atomref/transfer.py +++ b/src/atomref/transfer.py @@ -38,7 +38,10 @@ class LinearFit: @dataclass(frozen=True, slots=True) class SubstitutionTransfer: - """Use another dataset or policy directly when the base dataset is missing a value.""" + """Use another dataset or policy directly when the base dataset is missing. + + The selected value is copied from the source rather than inferred. + """ source: DatasetLike | SupportsValuePolicy | ValuePolicy[str] diff --git a/src/atomref/xh.py b/src/atomref/xh.py index c2a87be..e445f11 100644 --- a/src/atomref/xh.py +++ b/src/atomref/xh.py @@ -9,7 +9,15 @@ from .elements import canonicalize_element_symbol, is_valid_element_symbol from .errors import PolicyError from .policy import LookupResult, ValuePolicy, get_value, lookup_value -from .registry import DatasetInfo, DatasetRef, ElementScalarSet, get_builtin_set, get_dataset_info, list_dataset_ids, list_dataset_infos +from .registry import ( + DatasetInfo, + DatasetRef, + ElementScalarSet, + get_builtin_set, + get_dataset_info, + list_dataset_ids, + list_dataset_infos, +) from .transfer import LinearTransfer, TransferModel XHSet = ElementScalarSet @@ -37,7 +45,9 @@ def as_value_policy(self) -> ValuePolicy[str]: if isinstance(self.base_set, ElementScalarSet): if self.base_set.ref.quantity != _QUANTITY: raise PolicyError( - f"base_set quantity {self.base_set.ref.quantity!r} is incompatible with X-H lookup" + "base_set quantity " + f"{self.base_set.ref.quantity!r} is incompatible " + "with X-H lookup" ) base = self.base_set else: @@ -70,7 +80,6 @@ def as_value_policy(self) -> ValuePolicy[str]: ) - def _coerce_non_negative_xh_value(value: object, *, what: str) -> float: """Validate an X-H-like policy number.""" @@ -85,7 +94,6 @@ def _coerce_non_negative_xh_value(value: object, *, what: str) -> float: return out - def _normalize_xh_symbol(symbol: str | None) -> str | None: """Normalize symbols accepted by the X-H convenience layer.""" @@ -95,35 +103,30 @@ def _normalize_xh_symbol(symbol: str | None) -> str | None: return cand - def list_xh_sets(*, usage_role: str | None = None) -> tuple[str, ...]: """List packaged X-H set ids.""" return list_dataset_ids(_QUANTITY, usage_role=usage_role) - def list_xh_set_infos(*, usage_role: str | None = None) -> tuple[DatasetInfo, ...]: """Return packaged metadata objects for X-H sets.""" return list_dataset_infos(_QUANTITY, usage_role=usage_role) - def get_xh_set_info(set_id: str) -> DatasetInfo: """Return metadata for one packaged X-H set.""" return get_dataset_info(DatasetRef(_QUANTITY, set_id)) - def get_xh_set(set_id: str) -> XHSet: """Load one packaged X-H set as an :class:`ElementScalarSet`.""" return get_builtin_set(DatasetRef(_QUANTITY, set_id)) - def lookup_xh_bond_length( symbol: str | None, *, @@ -143,7 +146,6 @@ def lookup_xh_bond_length( return lookup - def get_xh_bond_length( symbol: str | None, *, From 1370a025a98c248ca806c786006e2a13019aea10 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 21:24:29 +0300 
Subject: [PATCH 14/15] Improves transfer policies --- CHANGELOG.md | 82 +++++++ README.md | 15 +- docs/api/policy.md | 6 +- docs/api/transfer.md | 18 ++ docs/api/xh.md | 2 + docs/dev/architecture.md | 28 ++- docs/guide/policies.md | 53 +++++- docs/index.md | 15 +- .../notebooks/03-custom-sets-and-discovery.md | 6 +- src/atomref/__about__.py | 2 +- src/atomref/policy.py | 229 ++++++++++++++++-- src/atomref/radii.py | 12 +- src/atomref/transfer.py | 98 +++++++- src/atomref/xh.py | 11 +- tests/policy/test_policy.py | 181 +++++++++++++- 15 files changed, 711 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fbb2887..8650d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,87 @@ # Changelog
+## 0.1.4 - 2026-03-15
+
+### Added
+
+- `LookupResult.transfer_depth`, which records how many transfer steps were
+  involved in the returned numeric value.
+- Source/depth controls for nested linear-transfer workflows via
+  `LinearTransfer.fit_sources`, `LinearTransfer.fit_max_depth`,
+  `LinearTransfer.prediction_sources`, and `LinearTransfer.prediction_max_depth`.
+- Regression tests covering generic-policy cycles, wrapper-policy cycles,
+  conservative nested-fit defaults, and explicit opt-in for deeper nested
+  linear workflows.
+- Expanded transfer and policy docs explaining nested-policy safeguards,
+  `transfer_depth`, and cycle detection, with guidance on when chained
+  correlations are scientifically reasonable and how to opt in deliberately
+  when broader fit training is desired.
+ +### Changed + +- `LinearTransfer` now distinguishes between values that may participate in + fitting (`fit_sources`, `fit_max_depth`) and values that may be used for the + final element-specific predictor lookup (`prediction_sources`, + `prediction_max_depth`). +- The default linear-transfer behavior is now conservative for fitting + (direct predictor values only) while still allowing one nested completion + step during final prediction. +- Policy-resolution cycle detection now tracks wrapper-policy identities as + well as generic `ValuePolicy` objects and is stored in a context-local stack + instead of a process-global mutable list. +- Quantity wrappers continue to use the generic policy core, but now route + through wrapper-aware lookup helpers so cycle checks remain effective for + `RadiiPolicy` and `XHPolicy`. + ## 0.1.3 - 2026-03-15 ### Added diff --git a/README.md b/README.md index 0d784fc..52a3e8c 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ What you get in the current `0.1.x` line: - dataset provenance and coverage metadata, - deterministic lookup policies, - substitution and linear transfer from support datasets or policies into target datasets, +- guarded nested policy-backed transfers with explicit transfer depth, + conservative fit/prediction controls, and cycle detection, - user-defined custom element-indexed scalar sets. ## Core terms @@ -65,6 +67,13 @@ The default `0.1.x` behavior is intentionally simple and practical: elements inferred from **Cordero covalent radii** through a fitted linear policy. +Nested policy predictors are supported too. In `0.1.4`, `LinearTransfer` +separates **fit-time** use of nested predictor values from +**prediction-time** use. By default, the fit may use only direct nested +values, while the final requested element may still use one additional +nested completion step. That is a useful compromise for workflows such as +provisional X–H inference from a chosen covalent-radii policy. + ## Quick example ```pycon @@ -80,13 +89,15 @@ The default `0.1.x` behavior is intentionally simple and practical: 2.8972265395148358 >>> lookup.source 'transfer_linear' +>>> lookup.transfer_depth +1 >>> lookup.resolved_from (DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) ``` `get_*` returns only the number. `lookup_*` returns a `LookupResult` that also -records where the value came from and whether a transfer model or policy source -was involved. +records where the value came from, whether a transfer model or policy source was +involved, and how many transfer steps were needed (`transfer_depth`). You can inspect the packaged quantity and dataset catalog directly: diff --git a/docs/api/policy.md b/docs/api/policy.md index 5b68440..29b4142 100644 --- a/docs/api/policy.md +++ b/docs/api/policy.md @@ -3,7 +3,7 @@ This module contains the generic resolver that sits below the radii-specific and X–H-specific convenience APIs. -Use it when you want to work directly with the common value-selection engine: +Use it when you want to work directly with the shared value-selection engine: - `ValuePolicy` — generic element-domain policy configuration, - `lookup_value(...)` — resolve one value together with provenance, @@ -18,5 +18,9 @@ A few practical notes: wrapper policies that expose `as_value_policy()`. - `LookupResult.is_placeholder` refers to the returned numeric value itself, not to whether any transfer happened. +- `LookupResult.transfer_depth` counts how many transfer steps were involved in + the returned numeric value. 
+- Nested lookup is cycle-checked across both generic `ValuePolicy` objects and + wrapper policies such as `RadiiPolicy` and `XHPolicy`. ::: atomref.policy diff --git a/docs/api/transfer.md b/docs/api/transfer.md index 797626e..17e07ad 100644 --- a/docs/api/transfer.md +++ b/docs/api/transfer.md @@ -18,4 +18,22 @@ A transfer source may be: `LinearTransfer` currently accepts exactly one predictor source at runtime, even though the public API stores predictors as a tuple for forward compatibility. +For policy-backed linear predictors, `LinearTransfer` separates two questions: + +- which nested predictor values may be used to **fit** the linear model + (`fit_sources`, `fit_max_depth`), and +- which nested predictor values may be used to **predict** the final requested + element (`prediction_sources`, `prediction_max_depth`). + +The defaults are intentionally conservative: + +- fit only on nested predictor values that came directly from `base` or + `override`, +- but allow one additional nested transfer step when evaluating the predictor + for the requested element. + +That default is meant for workflows such as a sparse X–H target set correlated +against a partial covalent-radii policy that is itself completed from a broader +support set. + ::: atomref.transfer diff --git a/docs/api/xh.md b/docs/api/xh.md index cca073e..cbc1465 100644 --- a/docs/api/xh.md +++ b/docs/api/xh.md @@ -17,6 +17,8 @@ In the default policy: - `C`, `N`, and `O` use curated ConQuest/CSD defaults, - other parent elements may be inferred from `cordero2008`, +- policy-backed predictors are supported as well, with conservative nested-fit + defaults and one additional nested prediction step allowed by default, - fuller X–H literature support is planned for `0.2.x`. ::: atomref.xh diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md index cbdf743..680b755 100644 --- a/docs/dev/architecture.md +++ b/docs/dev/architecture.md @@ -63,6 +63,31 @@ That last point is important. It means higher-level code can express "infer values from my chosen covalent-radii policy" instead of being forced to refer to one hard-coded predictor dataset. +## Nested-policy safeguards and cycle detection + +Policy-backed transfer sources are materialized with more than just raw numeric +values. The resolver also tracks, per element: + +- whether the value came from `base`, `override`, substitution, linear transfer, + or fallback, +- the nested transfer depth that was required to produce it, +- placeholder status. + +`LinearTransfer` uses that information twice: + +- once when fitting the linear relation (`fit_sources` / `fit_max_depth`), +- again when deciding whether the predictor value for the requested element is + admissible (`prediction_sources` / `prediction_max_depth`). + +The default policy is intentionally conservative: fit only on direct nested +predictor values, but allow one additional nested completion step when +predicting the final requested element. This keeps the common two-stage use case +possible without silently training on arbitrarily long inference chains. + +Cycle detection is handled with a context-local activation stack. Both generic +`ValuePolicy` objects and wrapper policies are tracked, so recursion through a +freshly materialized wrapper policy is still detected reliably and safely. + ## Placeholder handling Placeholder semantics stay attached to the value that was actually returned. @@ -73,7 +98,8 @@ This means `LookupResult.is_placeholder` can be true for: - a nested policy used as a transfer source. 
A linear transfer normally returns a computed value and therefore does not carry -placeholder status itself. +placeholder status itself. Instead, its provenance is carried by +`resolved_from`, explanatory notes, and `transfer_depth`. ## Why the design stays small diff --git a/docs/guide/policies.md b/docs/guide/policies.md index fd53047..912563b 100644 --- a/docs/guide/policies.md +++ b/docs/guide/policies.md @@ -69,7 +69,7 @@ current runtime intentionally supports exactly one predictor source. That keeps the implementation simple now while leaving room for later multi-predictor linear models. -Transfer sources can now be: +Transfer sources can be: - a packaged dataset reference (`DatasetRef`), - a custom `ElementScalarSet`, @@ -81,6 +81,35 @@ that policy. This lets higher-level workflows express things like “infer X–H lengths from my chosen covalent-radii policy” instead of hard-coding a specific support dataset. +#### Nested policy safeguards for `LinearTransfer` + +When a predictor source is itself a policy, two different questions matter: + +1. Which nested predictor values are trustworthy enough to train the linear fit? +2. Which nested predictor value is acceptable for the final requested element? + +`atomref` keeps those two decisions separate. By default: + +- `fit_sources=("base", "override")` and `fit_max_depth=0`, +- `prediction_sources=("base", "override", "transfer_substitution", "transfer_linear")` + and `prediction_max_depth=1`. + +That means the fitted relationship is trained only on direct predictor values by +default, while one additional nested completion step is still allowed at +prediction time. + +This is a good default for workflows such as: + +- sparse target X–H data from `csd_legacy_xh_cno`, +- a partial covalent-radii predictor policy with direct `s,p` values, +- one inner transfer from a broader support set such as `cordero2008` to make + the predictor usable for `d` or `f` elements. + +In that setup, the outer X–H fit still uses direct predictor anchors, while the +final requested element may use one nested predictor transfer. If you really do +want fit training to use nested predictor values as well, you can opt in +explicitly by widening `fit_sources` and/or increasing `fit_max_depth`. + ### Fallback A fallback is a constant last-resort value. It is useful when an algorithm must @@ -112,6 +141,24 @@ It does **not** mean “a transfer happened”. Examples: - a linear transfer is computed, not copied, so `is_placeholder` is normally `False`. +## Transfer depth and cycle detection + +`LookupResult.transfer_depth` counts how many transfer steps were needed to +produce the returned value: + +- direct base and override values have depth `0`, +- one substitution or linear restoration has depth `1`, +- nested transfer chains increase the depth further. + +This makes nested-policy behavior inspectable without trying to infer it from +notes alone. + +Because policies may now depend on other policies, the resolver also performs +cycle detection. A cyclic reference such as policy A depending on policy B while +policy B depends back on policy A raises `PolicyError` instead of recurring +indefinitely. The same protection applies when recursion goes through wrapper +policies such as `RadiiPolicy` or `XHPolicy`. 
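+
+For example, with the packaged default policies (a quick sketch; a direct
+base hit reports depth `0`, while a single linear transfer, as for `Og`
+under the default van der Waals policy, reports depth `1`):
+
+```pycon
+>>> import atomref as ar
+>>> ar.lookup_covalent_radius("C").transfer_depth
+0
+>>> lookup = ar.lookup_vdw_radius("Og")
+>>> lookup.source
+'transfer_linear'
+>>> lookup.transfer_depth
+1
+```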
+ ## Target datasets and support datasets `atomref` separates **what a dataset is used for** from **what it scientifically @@ -171,5 +218,5 @@ With that X–H policy: - missing parent elements may be inferred from the **selected covalent-radii policy**, not just from one hard-coded support dataset, - if the predictor policy itself needed a transfer to produce a covalent radius, - the resulting `LookupResult` still records that provenance in `resolved_from` - and `notes`. + the resulting `LookupResult` still records that provenance in `resolved_from`, + `notes`, and `transfer_depth`. diff --git a/docs/index.md b/docs/index.md index 17c5481..71babb9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -22,6 +22,8 @@ What you get in the current `0.1.x` line: - dataset provenance and coverage metadata, - deterministic lookup policies, - substitution and linear transfer from support datasets or policies into target datasets, +- guarded nested policy-backed transfers with explicit transfer depth, + conservative fit/prediction controls, and cycle detection, - user-defined custom element-indexed scalar sets. ## Core terms @@ -65,6 +67,13 @@ The default `0.1.x` behavior is intentionally simple and practical: elements inferred from **Cordero covalent radii** through a fitted linear policy. +Nested policy predictors are supported too. In `0.1.4`, `LinearTransfer` +separates **fit-time** use of nested predictor values from +**prediction-time** use. By default, the fit may use only direct nested +values, while the final requested element may still use one additional +nested completion step. That is a useful compromise for workflows such as +provisional X–H inference from a chosen covalent-radii policy. + ## Quick example ```pycon @@ -80,13 +89,15 @@ The default `0.1.x` behavior is intentionally simple and practical: 2.8972265395148358 >>> lookup.source 'transfer_linear' +>>> lookup.transfer_depth +1 >>> lookup.resolved_from (DatasetRef(quantity='atomic_radius', set_id='rahm2016'),) ``` `get_*` returns only the number. `lookup_*` returns a `LookupResult` that also -records where the value came from and whether a transfer model or policy source -was involved. +records where the value came from, whether a transfer model or policy source was +involved, and how many transfer steps were needed (`transfer_depth`). 
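+
+The fit-time versus prediction-time split described above can also be spelled
+out explicitly when building a policy. A sketch (the keyword values shown
+mirror the documented `0.1.4` defaults):
+
+```python
+import atomref as ar
+
+# Infer missing X-H parent elements from the default covalent-radii policy
+# instead of from one fixed support dataset.
+policy = ar.XHPolicy(
+    base_set="csd_legacy_xh_cno",
+    transfers=(
+        ar.LinearTransfer(
+            predictors=(ar.DEFAULT_COVALENT_POLICY,),
+            min_points=3,
+            exclude_placeholders=True,
+            # Nested-predictor safeguards; these are the documented defaults.
+            fit_sources=("base", "override"),
+            fit_max_depth=0,
+            prediction_sources=(
+                "base",
+                "override",
+                "transfer_substitution",
+                "transfer_linear",
+            ),
+            prediction_max_depth=1,
+        ),
+    ),
+)
+
+lookup = ar.lookup_xh_bond_length("S", policy=policy)
+print(lookup.source, lookup.transfer_depth)
+```
+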
You can inspect the packaged quantity and dataset catalog directly: diff --git a/docs/notebooks/03-custom-sets-and-discovery.md b/docs/notebooks/03-custom-sets-and-discovery.md index 51dc5e2..47138bf 100644 --- a/docs/notebooks/03-custom-sets-and-discovery.md +++ b/docs/notebooks/03-custom-sets-and-discovery.md @@ -33,9 +33,9 @@ for symbol in ("C", "O", "N"): ``` **Output** ```text -C LookupResult(value=0.77, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=()) -O LookupResult(value=0.67, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=()) -N LookupResult(value=0.71, source='transfer_substitution', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='cordero2008'),), is_placeholder=False, fit=None, notes=('missing in base set; substituted from transfer source',)) +C LookupResult(value=0.77, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0) +O LookupResult(value=0.67, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0) +N LookupResult(value=0.71, source='transfer_substitution', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='cordero2008'),), is_placeholder=False, fit=None, notes=('missing in base set; substituted from transfer source',), transfer_depth=1) ``` ```python for info in ar.list_radii_set_infos("van_der_waals", usage_role="target"): diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py index ae73625..bbab024 100644 --- a/src/atomref/__about__.py +++ b/src/atomref/__about__.py @@ -1 +1 @@ -__version__ = "0.1.3" +__version__ = "0.1.4" diff --git a/src/atomref/policy.py b/src/atomref/policy.py index a8c8616..a2f922f 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -3,6 +3,7 @@ from __future__ import annotations from collections.abc import Mapping +import contextvars from dataclasses import dataclass, field from functools import lru_cache import math @@ -42,7 +43,10 @@ "missing", ] -_ACTIVE_POLICY_IDS: list[int] = [] +PolicyToken = tuple[str, int] +_ACTIVE_POLICY_TOKENS: contextvars.ContextVar[tuple[PolicyToken, ...]] = ( + contextvars.ContextVar("atomref_active_policy_tokens", default=()) +) @dataclass(frozen=True, slots=True) @@ -51,6 +55,8 @@ class LookupResult: ``value`` carries the final scalar value when one could be produced, while ``source`` and the remaining metadata explain how that value was obtained. + ``transfer_depth`` counts how many transfer steps were involved in producing + the returned value. Direct base and override values therefore have depth 0. """ value: float | None @@ -60,6 +66,7 @@ class LookupResult: is_placeholder: bool = False fit: LinearFit | None = None notes: tuple[str, ...] 
= () + transfer_depth: int = 0 def __float__(self) -> float: """Coerce the resolved value to ``float`` or raise if it is missing.""" @@ -119,7 +126,8 @@ def __post_init__(self) -> None: for key, value in self.overrides.items(): if not isinstance(key, str): raise PolicyError( - "element-domain policy overrides must be keyed by element symbols" + "element-domain policy overrides must be keyed by element " + "symbols" ) sym = _normalize_element_symbol(key) if sym is None: @@ -129,7 +137,8 @@ def __post_init__(self) -> None: previous = seen_original_keys.get(sym) if previous is not None and previous != key: raise PolicyError( - f"override keys {previous!r} and {key!r} both normalize to {sym!r}" + f"override keys {previous!r} and {key!r} both normalize to " + f"{sym!r}" ) seen_original_keys[sym] = key normalized_overrides[sym] = _coerce_policy_float( @@ -151,6 +160,8 @@ class _ResolvedElementSource: ref: DatasetRef values_by_z: tuple[float | None, ...] placeholder_by_z: tuple[bool, ...] + lookup_source_by_z: tuple[LookupSource | None, ...] + transfer_depth_by_z: tuple[int | None, ...] via_policy: bool = False @@ -165,6 +176,7 @@ class _TransferSourceValue: via_policy: bool = False lookup_source: LookupSource | None = None notes: tuple[str, ...] = () + transfer_depth: int = 0 def _coerce_policy_float(value: object, *, what: str) -> float: @@ -201,17 +213,50 @@ def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef: return resolve_dataset_like(policy.base).ref -def _coerce_nested_policy(source: object) -> ValuePolicy[str] | None: - """Return ``source`` as a generic value policy when possible.""" +def _policy_resolution_tokens( + policy: ValuePolicy[object], + *, + owner: object | None = None, +) -> tuple[PolicyToken, ...]: + """Return all tokens that should be considered active for one resolution. + + We always track the concrete :class:`ValuePolicy` object identity. When a + wrapper object such as :class:`atomref.radii.RadiiPolicy` or + :class:`atomref.xh.XHPolicy` is the logical source, we also track the + wrapper identity so recursion through freshly materialized generic policies + is still detected. 
+ """ + + tokens: list[PolicyToken] = [("policy", id(policy))] + if owner is not None: + tokens.append((f"owner:{type(owner).__qualname__}", id(owner))) + return tuple(tokens) + + +def _lookup_value_with_owner( + symbol: str | None, + *, + policy: ValuePolicy[str], + owner: object | None, +) -> LookupResult: + """Internal lookup helper that carries wrapper identity for cycle checks.""" + + return _resolve_value(symbol, policy=policy, resolution_owner=owner) + + +def _coerce_nested_policy( + source: object, +) -> tuple[ValuePolicy[str] | None, object | None]: + """Return ``source`` as a generic value policy and its logical owner.""" if isinstance(source, ValuePolicy): - return source + return source, None if isinstance(source, SupportsValuePolicy): nested = source.as_value_policy() if not isinstance(nested, ValuePolicy): raise PolicyError("policy-like transfer sources must return ValuePolicy") - return nested - return None + return nested, source + return None, None def _materialize_transfer_source( @@ -219,7 +264,7 @@ def _materialize_transfer_source( ) -> _ResolvedElementSource: """Materialize any element-domain transfer source into dense by-Z arrays.""" - nested_policy = _coerce_nested_policy(source) + nested_policy, nested_owner = _coerce_nested_policy(source) if nested_policy is None: dataset = resolve_dataset_like(source) placeholders = tuple( @@ -228,10 +273,18 @@ def _materialize_transfer_source( else _is_placeholder_value(dataset.info, float(value)) for value in dataset.values_by_z ) + lookup_sources = tuple( + "base" if value is not None else None for value in dataset.values_by_z + ) + transfer_depths = tuple( + 0 if value is not None else None for value in dataset.values_by_z + ) return _ResolvedElementSource( ref=dataset.ref, values_by_z=dataset.values_by_z, placeholder_by_z=placeholders, + lookup_source_by_z=lookup_sources, + transfer_depth_by_z=transfer_depths, via_policy=False, ) @@ -239,16 +292,25 @@ def _materialize_transfer_source( n_z = max(elem.z for elem in iter_elements()) values: list[float | None] = [None] * (n_z + 1) placeholders: list[bool] = [False] * (n_z + 1) + lookup_sources: list[LookupSource | None] = [None] * (n_z + 1) + transfer_depths: list[int | None] = [None] * (n_z + 1) for elem in iter_elements(): - lookup = lookup_value(elem.symbol, policy=nested_policy) - values[elem.z] = lookup.value - placeholders[elem.z] = ( - lookup.is_placeholder if lookup.value is not None else False + lookup = _lookup_value_with_owner( + elem.symbol, + policy=nested_policy, + owner=nested_owner, ) + values[elem.z] = lookup.value + if lookup.value is not None: + placeholders[elem.z] = lookup.is_placeholder + lookup_sources[elem.z] = lookup.source + transfer_depths[elem.z] = lookup.transfer_depth return _ResolvedElementSource( ref=target, values_by_z=tuple(values), placeholder_by_z=tuple(placeholders), + lookup_source_by_z=tuple(lookup_sources), + transfer_depth_by_z=tuple(transfer_depths), via_policy=True, ) @@ -259,7 +321,7 @@ def _lookup_transfer_source_value( ) -> tuple[_TransferSourceValue | None, str | None]: """Resolve one element value from a transfer source or nested policy.""" - nested_policy = _coerce_nested_policy(source) + nested_policy, nested_owner = _coerce_nested_policy(source) if nested_policy is None: source_set = resolve_dataset_like(source) value = source_set.get(symbol) @@ -275,11 +337,16 @@ def _lookup_transfer_source_value( via_policy=False, lookup_source="base", notes=(), + transfer_depth=0, ), None, ) - lookup = lookup_value(symbol, 
policy=nested_policy) + lookup = _lookup_value_with_owner( + symbol, + policy=nested_policy, + owner=nested_owner, + ) if lookup.value is None: if lookup.notes: return ( @@ -297,22 +364,64 @@ def _lookup_transfer_source_value( via_policy=True, lookup_source=lookup.source, notes=lookup.notes, + transfer_depth=lookup.transfer_depth, ), None, ) +def _transfer_source_is_allowed( + lookup_source: LookupSource | None, + transfer_depth: int | None, + *, + allowed_sources: tuple[str, ...], + max_depth: int, +) -> bool: + """Return whether a nested predictor value may participate downstream.""" + + if lookup_source is None or transfer_depth is None: + return False + return lookup_source in allowed_sources and transfer_depth <= max_depth + + +def _explain_rejected_transfer_source( + *, + source_role: str, + lookup_source: LookupSource | None, + transfer_depth: int | None, + allowed_sources: tuple[str, ...], + max_depth: int, +) -> str: + """Return a human-readable explanation for a rejected nested source.""" + + if lookup_source is None or transfer_depth is None: + return f"{source_role} policy source did not return a usable value" + if lookup_source not in allowed_sources: + allowed = ", ".join(allowed_sources) + return ( + f"{source_role} policy source resolved via {lookup_source}, which is " + f"excluded by {source_role}_sources=({allowed})" + ) + return ( + f"{source_role} policy source transfer depth {transfer_depth} exceeds " + f"allowed maximum {max_depth} ({source_role}_max_depth)" + ) + + def _fit_linear_transfer( base_set: ElementScalarSet, predictor_source: _ResolvedElementSource, *, min_points: int, exclude_placeholders: bool, + fit_sources: tuple[str, ...], + fit_max_depth: int, ) -> LinearFit: """Fit a one-predictor linear transfer model between two sources.""" xs: list[float] = [] ys: list[float] = [] + filtered_by_fit_restrictions = 0 n_z = min(len(base_set.values_by_z), len(predictor_source.values_by_z)) for z in range(1, n_z): @@ -320,6 +429,14 @@ def _fit_linear_transfer( x = predictor_source.values_by_z[z] if y is None or x is None: continue + if not _transfer_source_is_allowed( + predictor_source.lookup_source_by_z[z], + predictor_source.transfer_depth_by_z[z], + allowed_sources=fit_sources, + max_depth=fit_max_depth, + ): + filtered_by_fit_restrictions += 1 + continue y_f = float(y) x_f = float(x) if exclude_placeholders and ( @@ -332,6 +449,11 @@ def _fit_linear_transfer( n = len(xs) if n < min_points: + if predictor_source.via_policy and filtered_by_fit_restrictions > 0: + raise PolicyError( + "not enough overlapping elements to fit linear transfer after " + "applying fit source constraints (fit-source restrictions)" + ) raise PolicyError("not enough overlapping elements to fit linear transfer") x_mean = sum(xs) / n @@ -365,6 +487,8 @@ def _fit_linear_transfer_cached( predictor_ref: DatasetRef, min_points: int, exclude_placeholders: bool, + fit_sources: tuple[str, ...], + fit_max_depth: int, ) -> LinearFit: """Cache fits between two packaged datasets for repeated reuse.""" @@ -373,6 +497,8 @@ def _fit_linear_transfer_cached( _materialize_transfer_source(predictor_ref), min_points=min_points, exclude_placeholders=exclude_placeholders, + fit_sources=fit_sources, + fit_max_depth=fit_max_depth, ) @@ -391,12 +517,16 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit predictor, transfer.min_points, transfer.exclude_placeholders, + transfer.fit_sources, + transfer.fit_max_depth, ) return _fit_linear_transfer( resolve_dataset_like(base), 
_materialize_transfer_source(predictor), min_points=transfer.min_points, exclude_placeholders=transfer.exclude_placeholders, + fit_sources=transfer.fit_sources, + fit_max_depth=transfer.fit_max_depth, ) @@ -431,6 +561,7 @@ def _apply_substitution_transfer( resolved_from=source_value.resolved_from, is_placeholder=source_value.is_placeholder, notes=tuple(notes), + transfer_depth=source_value.transfer_depth + 1, ), None, ) @@ -455,6 +586,23 @@ def _apply_linear_transfer( if predictor_value is None: return None, note + if not _transfer_source_is_allowed( + predictor_value.lookup_source, + predictor_value.transfer_depth, + allowed_sources=transfer.prediction_sources, + max_depth=transfer.prediction_max_depth, + ): + return ( + None, + _explain_rejected_transfer_source( + source_role="prediction", + lookup_source=predictor_value.lookup_source, + transfer_depth=predictor_value.transfer_depth, + allowed_sources=transfer.prediction_sources, + max_depth=transfer.prediction_max_depth, + ), + ) + if transfer.exclude_placeholders and predictor_value.is_placeholder: if predictor_value.via_policy: return None, "predictor value from policy source is a placeholder" @@ -468,7 +616,10 @@ def _apply_linear_transfer( notes = ["missing in base set; inferred via linear transfer"] if predictor_value.via_policy: notes.append("predictor value supplied by policy source") - notes.append("linear fit used policy-materialized predictor values") + notes.append( + "linear fit applied fit-source and transfer-depth limits to " + "policy-materialized predictor values" + ) if predictor_value.lookup_source not in (None, "base"): notes.append( "policy predictor resolved the value via " @@ -484,19 +635,26 @@ def _apply_linear_transfer( is_placeholder=False, fit=fit, notes=tuple(notes), + transfer_depth=predictor_value.transfer_depth + 1, ), None, ) -def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: +def _resolve_value( + symbol: str | None, + *, + policy: ValuePolicy[str], + resolution_owner: object | None = None, +) -> LookupResult: """Resolve a value through override, base, transfer, and fallback steps.""" - policy_id = id(policy) - if policy_id in _ACTIVE_POLICY_IDS: + active_tokens = _ACTIVE_POLICY_TOKENS.get() + resolution_tokens = _policy_resolution_tokens(policy, owner=resolution_owner) + if any(token in active_tokens for token in resolution_tokens): raise PolicyError("cyclic policy resolution detected") - _ACTIVE_POLICY_IDS.append(policy_id) + stack_token = _ACTIVE_POLICY_TOKENS.set(active_tokens + resolution_tokens) try: target = _resolve_target_ref(policy) base_set = resolve_dataset_like(policy.base) @@ -527,6 +685,7 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes source="override", target=target, notes=("value supplied by policy override",), + transfer_depth=0, ) base_value = base_set.get(sym) @@ -545,6 +704,7 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes resolved_from=(base_set.ref,), is_placeholder=is_placeholder, notes=notes, + transfer_depth=0, ) transfer_notes: list[str] = ["missing in base set"] @@ -576,6 +736,7 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes source="fallback", target=target, notes=tuple(transfer_notes + ["using fallback value"]), + transfer_depth=0, ) return LookupResult( @@ -585,8 +746,30 @@ def _resolve_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupRes notes=tuple(transfer_notes), ) finally: - popped = _ACTIVE_POLICY_IDS.pop() - 
assert popped == policy_id # internal stack discipline + _ACTIVE_POLICY_TOKENS.reset(stack_token) + + +def _lookup_value_from_policy_source( + symbol: str | None, + *, + source: ValuePolicy[str] | SupportsValuePolicy, +) -> LookupResult: + """Resolve a value from either a generic policy or a wrapper policy.""" + + if isinstance(source, ValuePolicy): + return _lookup_value_with_owner(symbol, policy=source, owner=None) + policy = source.as_value_policy() + return _lookup_value_with_owner(symbol, policy=policy, owner=source) + + +def _get_value_from_policy_source( + symbol: str | None, + *, + source: ValuePolicy[str] | SupportsValuePolicy, +) -> float | None: + """Return only the scalar selected by a generic or wrapper policy.""" + + return _lookup_value_from_policy_source(symbol, source=source).value def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult: @@ -596,7 +779,7 @@ def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResul In v0.1 the runtime supports only element-domain policies. """ - return _resolve_value(symbol, policy=policy) + return _lookup_value_with_owner(symbol, policy=policy, owner=None) def get_value(symbol: str | None, *, policy: ValuePolicy[str]) -> float | None: diff --git a/src/atomref/radii.py b/src/atomref/radii.py index 449de58..b33877f 100644 --- a/src/atomref/radii.py +++ b/src/atomref/radii.py @@ -13,8 +13,8 @@ LookupResult, ValuePolicy, _fit_transfer_model, - get_value, - lookup_value, + _get_value_from_policy_source, + _lookup_value_from_policy_source, ) from .registry import ( DatasetInfo, @@ -217,7 +217,7 @@ def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None: def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult: """Shared implementation for radii lookup helpers.""" - return lookup_value(symbol, policy=policy.as_value_policy()) + return _lookup_value_from_policy_source(symbol, source=policy) def lookup_covalent_radius( @@ -241,7 +241,7 @@ def get_covalent_radius( active = DEFAULT_COVALENT_POLICY if policy is None else policy _validate_policy_kind(active, expected="covalent") - return get_value(symbol, policy=active.as_value_policy()) + return _get_value_from_policy_source(symbol, source=active) def lookup_vdw_radius( @@ -265,7 +265,7 @@ def get_vdw_radius( active = DEFAULT_VDW_POLICY if policy is None else policy _validate_policy_kind(active, expected="van_der_waals") - return get_value(symbol, policy=active.as_value_policy()) + return _get_value_from_policy_source(symbol, source=active) def assess_radii_policy( @@ -292,7 +292,7 @@ def assess_radii_policy( per_element: list[RadiiElementAssessment] = [] for symbol in elems: - lookup = lookup_value(symbol, policy=value_policy) + lookup = _lookup_value_from_policy_source(symbol, source=policy) if lookup.source == "override": n_override += 1 elif lookup.source == "base": diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py index 909d136..54eb724 100644 --- a/src/atomref/transfer.py +++ b/src/atomref/transfer.py @@ -3,7 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable from .errors import PolicyError from .registry import DatasetLike @@ -12,6 +12,37 @@ from .policy import ValuePolicy +TransferValueSource = Literal[ + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", +] +"""Source labels that may be admitted 
into nested linear-transfer workflows.""" + +_ALLOWED_TRANSFER_VALUE_SOURCES = frozenset( + { + "override", + "base", + "transfer_substitution", + "transfer_linear", + "fallback", + } +) + +_DEFAULT_LINEAR_FIT_SOURCES: tuple[TransferValueSource, ...] = ( + "base", + "override", +) +_DEFAULT_LINEAR_PREDICTION_SOURCES: tuple[TransferValueSource, ...] = ( + "base", + "override", + "transfer_substitution", + "transfer_linear", +) + + @runtime_checkable class SupportsValuePolicy(Protocol): """Protocol for wrapper objects that can expose a generic value policy.""" @@ -53,11 +84,27 @@ class LinearTransfer: In v0.1 the public API stores predictors as a tuple for forward compatibility, but the runtime implementation intentionally accepts exactly one predictor source. + + For nested policy predictors, two safeguards apply: + + - ``fit_sources`` / ``fit_max_depth`` control which predictor values may be + used when fitting the linear model itself; + - ``prediction_sources`` / ``prediction_max_depth`` control which nested + predictor values may be used for the final requested element. + + The defaults are intentionally conservative for fitting and permissive only + enough to allow one additional completion step at prediction time. """ predictors: tuple[DatasetLike | SupportsValuePolicy | ValuePolicy[str], ...] min_points: int = 2 exclude_placeholders: bool = True + fit_sources: tuple[TransferValueSource, ...] = _DEFAULT_LINEAR_FIT_SOURCES + prediction_sources: tuple[TransferValueSource, ...] = ( + _DEFAULT_LINEAR_PREDICTION_SOURCES + ) + fit_max_depth: int = 0 + prediction_max_depth: int = 1 def __post_init__(self) -> None: """Validate obvious configuration errors eagerly.""" @@ -67,6 +114,55 @@ def __post_init__(self) -> None: if self.min_points < 2: raise PolicyError("LinearTransfer min_points must be at least 2") + object.__setattr__( + self, + "fit_sources", + _normalize_transfer_value_sources( + self.fit_sources, + field_name="fit_sources", + ), + ) + object.__setattr__( + self, + "prediction_sources", + _normalize_transfer_value_sources( + self.prediction_sources, + field_name="prediction_sources", + ), + ) + + if self.fit_max_depth < 0: + raise PolicyError("LinearTransfer fit_max_depth must be non-negative") + if self.prediction_max_depth < 0: + raise PolicyError( + "LinearTransfer prediction_max_depth must be non-negative" + ) + TransferModel = SubstitutionTransfer | LinearTransfer """Closed union of transfer models supported by the core resolver.""" + + +def _normalize_transfer_value_sources( + sources: tuple[str, ...], + *, + field_name: str, +) -> tuple[TransferValueSource, ...]: + """Validate and deduplicate source-label controls for linear transfers.""" + + if not sources: + raise PolicyError(f"LinearTransfer {field_name} may not be empty") + + normalized: list[TransferValueSource] = [] + seen: set[str] = set() + for source in sources: + if source not in _ALLOWED_TRANSFER_VALUE_SOURCES: + allowed = ", ".join(sorted(_ALLOWED_TRANSFER_VALUE_SOURCES)) + raise PolicyError( + f"LinearTransfer {field_name} contains unsupported source " + f"{source!r}; allowed values are: {allowed}" + ) + if source not in seen: + normalized.append(source) + seen.add(source) + return tuple(normalized) diff --git a/src/atomref/xh.py b/src/atomref/xh.py index e445f11..5018d99 100644 --- a/src/atomref/xh.py +++ b/src/atomref/xh.py @@ -8,7 +8,12 @@ from .elements import canonicalize_element_symbol, is_valid_element_symbol from .errors import PolicyError -from .policy import LookupResult, ValuePolicy, get_value, 
lookup_value +from .policy import ( + LookupResult, + ValuePolicy, + _get_value_from_policy_source, + _lookup_value_from_policy_source, +) from .registry import ( DatasetInfo, DatasetRef, @@ -135,7 +140,7 @@ def lookup_xh_bond_length( """Resolve a parent-element X-H bond length with provenance.""" active = DEFAULT_XH_POLICY if policy is None else policy - lookup = lookup_value(symbol, policy=active.as_value_policy()) + lookup = _lookup_value_from_policy_source(symbol, source=active) if lookup.value is None and _normalize_xh_symbol(symbol) == "H": return LookupResult( value=None, @@ -154,7 +159,7 @@ def get_xh_bond_length( """Return only the selected X-H bond-length value, without provenance.""" active = DEFAULT_XH_POLICY if policy is None else policy - return get_value(symbol, policy=active.as_value_policy()) + return _get_value_from_policy_source(symbol, source=active) DEFAULT_XH_POLICY = XHPolicy( diff --git a/tests/policy/test_policy.py b/tests/policy/test_policy.py index 3b38717..618829a 100644 --- a/tests/policy/test_policy.py +++ b/tests/policy/test_policy.py @@ -1,11 +1,64 @@ from __future__ import annotations +from dataclasses import dataclass + import pytest import atomref as ar from atomref.errors import PolicyError +def _make_custom_set( + quantity: str, + set_id: str, + values: dict[str, float | None], +) -> ar.ElementScalarSet: + return ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef(quantity, set_id), + values=values, + name=set_id, + units='angstrom', + ) + + +def _make_partial_covalent_policy(*, include_o: bool) -> ar.RadiiPolicy: + values = { + 'C': 0.76, + 'N': 0.71, + } + if include_o: + values['O'] = 0.66 + custom = ar.ElementScalarSet.from_mapping( + ref=ar.DatasetRef('covalent_radius', 'demo_partial_cov'), + values=values, + name='Demo partial covalent set', + units='angstrom', + ) + return ar.RadiiPolicy( + kind='covalent', + base_set=custom, + transfers=( + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + min_points=2, + exclude_placeholders=True, + ), + ), + ) + + +@dataclass +class _DemoPolicyWrapper: + base: ar.ElementScalarSet + source: object | None = None + + def as_value_policy(self) -> ar.ValuePolicy[str]: + transfers = () + if self.source is not None: + transfers = (ar.SubstitutionTransfer(source=self.source),) + return ar.ValuePolicy(base=self.base, transfers=transfers) + + def test_lookup_value_is_public_generic_entry_point() -> None: policy = ar.ValuePolicy( base=ar.DatasetRef('covalent_radius', 'cordero2008'), @@ -14,6 +67,7 @@ def test_lookup_value_is_public_generic_entry_point() -> None: lookup = ar.lookup_value('H', policy=policy) assert lookup.source == 'override' assert lookup.value == pytest.approx(0.5) + assert lookup.transfer_depth == 0 def test_get_value_returns_only_scalar() -> None: @@ -51,7 +105,10 @@ def test_substitution_transfer_accepts_policy_source() -> None: lookup = ar.lookup_value('Bk', policy=policy) assert lookup.source == 'transfer_substitution' assert lookup.value == pytest.approx(1.54) - assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),) + assert lookup.transfer_depth == 2 + assert lookup.resolved_from == ( + ar.DatasetRef('covalent_radius', 'csd_legacy_cov'), + ) assert any('policy source' in note for note in lookup.notes) @@ -65,5 +122,127 @@ def test_linear_transfer_accepts_policy_predictor() -> None: lookup = ar.lookup_vdw_radius('Pm', policy=policy) assert lookup.source == 'transfer_linear' assert lookup.value == 
pytest.approx(ar.lookup_vdw_radius('Pm').value) + assert lookup.transfer_depth == 1 assert lookup.fit is not None assert any('policy source' in note for note in lookup.notes) + + +def test_linear_transfer_defaults_allow_direct_fit_and_one_nested_prediction() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=True) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.transfer_depth == 2 + assert lookup.fit is not None + assert lookup.fit.n_points == 3 + assert lookup.value == pytest.approx(ar.lookup_xh_bond_length('S').value) + + +def test_linear_transfer_fit_restrictions_block_inference_on_inference_by_default( +) -> None: + predictor_policy = _make_partial_covalent_policy(include_o=False) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + ), + ), + ) + with pytest.raises(PolicyError, match='fit-source restrictions'): + ar.lookup_xh_bond_length('S', policy=policy) + + +def test_linear_transfer_fit_restrictions_can_be_relaxed_explicitly() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=False) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + fit_sources=('base', 'override', 'transfer_linear'), + fit_max_depth=1, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.source == 'transfer_linear' + assert lookup.fit is not None + assert lookup.fit.n_points == 3 + + +def test_linear_transfer_prediction_depth_can_be_tightened() -> None: + predictor_policy = _make_partial_covalent_policy(include_o=True) + policy = ar.XHPolicy( + base_set='csd_legacy_xh_cno', + transfers=( + ar.LinearTransfer( + predictors=(predictor_policy,), + min_points=3, + exclude_placeholders=True, + prediction_max_depth=0, + ), + ), + ) + lookup = ar.lookup_xh_bond_length('S', policy=policy) + assert lookup.value is None + assert lookup.source == 'missing' + assert any('prediction_max_depth' in note for note in lookup.notes) + + +def test_linear_transfer_rejects_invalid_nested_source_configuration() -> None: + with pytest.raises(PolicyError, match='fit_max_depth'): + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + fit_max_depth=-1, + ) + with pytest.raises(PolicyError, match='allowed values'): + ar.LinearTransfer( + predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),), + prediction_sources=('missing',), # type: ignore[arg-type] + ) + + +def test_lookup_value_detects_generic_policy_cycles() -> None: + empty_1 = _make_custom_set('covalent_radius', 'cycle_empty_1', {}) + empty_2 = _make_custom_set('covalent_radius', 'cycle_empty_2', {}) + policy_1 = ar.ValuePolicy(base=empty_1) + policy_2 = ar.ValuePolicy( + base=empty_2, + transfers=(ar.SubstitutionTransfer(source=policy_1),), + ) + object.__setattr__( + policy_1, + 'transfers', + (ar.SubstitutionTransfer(source=policy_2),), + ) + + with pytest.raises(PolicyError, match='cyclic policy resolution detected'): + ar.lookup_value('C', policy=policy_1) + + +def test_wrapper_policy_cycles_are_detected() -> None: + empty = _make_custom_set('covalent_radius', 'demo_empty_cov', {}) + 
wrapper_a = _DemoPolicyWrapper(base=empty) + wrapper_b = _DemoPolicyWrapper(base=empty, source=wrapper_a) + wrapper_a.source = wrapper_b + + policy = ar.ValuePolicy( + base=empty, + transfers=(ar.SubstitutionTransfer(source=wrapper_a),), + ) + with pytest.raises(PolicyError, match='cyclic policy resolution detected'): + ar.lookup_value('C', policy=policy) From 53ae9f7754fcbf9a689e24f3ce2633f544f3f8f5 Mon Sep 17 00:00:00 2001 From: Ivan Chernyshov Date: Sun, 15 Mar 2026 22:31:21 +0300 Subject: [PATCH 15/15] Cleanups docs --- CHANGELOG.md | 56 +++------------------------ DEV_PLAN.md | 21 ++++++---- README.md | 17 ++++---- docs/api/elements.md | 4 +- docs/api/radii.md | 2 +- docs/api/xh.md | 4 +- docs/datasets/atomic_radius.md | 2 +- docs/datasets/covalent_radius.md | 6 +-- docs/datasets/index.md | 2 +- docs/datasets/van_der_waals_radius.md | 6 +-- docs/datasets/xh_bond_length.md | 4 +- docs/dev/dev_plan.md | 21 ++++++---- docs/guide/custom_sets.md | 4 +- docs/guide/notebooks.md | 2 +- docs/guide/policies.md | 4 +- docs/index.md | 17 ++++---- docs/notebooks/01-quickstart.md | 2 +- notebooks/01-quickstart.ipynb | 6 +-- src/atomref/policy.py | 16 +++++--- src/atomref/registry.py | 2 +- src/atomref/transfer.py | 6 +-- 21 files changed, 84 insertions(+), 120 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8650d50..18d2c3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,35 +4,13 @@ ### Added -- explicit nested-policy safeguards for `LinearTransfer`: +- `LookupResult.transfer_depth`, which records how many transfer steps were + involved in the returned numeric value. +- Explicit nested-policy safeguards for `LinearTransfer` via: - `fit_sources` - `fit_max_depth` - `prediction_sources` - `prediction_max_depth` -- `LookupResult.transfer_depth` to record how many transfer steps produced the - returned value. - -### Changed - -- linear-transfer fitting now distinguishes direct predictor values from nested - policy-derived predictor values. -- the default nested linear-transfer behavior is now conservative for fitting - and allows at most one additional completion step for the final predictor - value. -- cycle detection now uses context-local resolution tokens and correctly catches - recursion through wrapper policies such as `RadiiPolicy` and `XHPolicy`. -- docs were expanded to explain nested-policy predictors, transfer depth, and - cycle detection. - -## 0.1.4 - 2026-03-15 - -### Added - -- `LookupResult.transfer_depth`, which records how many transfer steps were - involved in the returned numeric value. -- Source/depth controls for nested linear-transfer workflows via - `LinearTransfer.fit_sources`, `LinearTransfer.fit_max_depth`, - `LinearTransfer.prediction_sources`, and `LinearTransfer.prediction_max_depth`. - Regression tests covering generic-policy cycles, wrapper-policy cycles, conservative nested-fit defaults, and explicit opt-in for deeper nested linear workflows. @@ -42,6 +20,8 @@ - Nested policy-backed linear transfers are now guarded in two phases: conservative defaults are used for fit training, while one additional nested completion step remains allowed at prediction time. +- Linear-transfer fitting now distinguishes direct predictor values from nested + policy-derived predictor values. - Cycle detection now tracks both generic policies and wrapper policies using a context-local activation stack, so recursion through freshly materialized wrapper policies is detected reliably and safely. 
@@ -56,32 +36,6 @@ - Added guidance on when chained correlations are scientifically reasonable and how to opt in deliberately when broader fit training is desired. -## 0.1.4 - 2026-03-15 - -### Added - -- `LookupResult.transfer_depth` is now used consistently across nested - substitution and linear-transfer workflows so callers can tell how many - transfer steps contributed to a returned value. -- New tests covering nested-policy fit controls, prediction-depth limits, and - cycle detection for both generic and wrapper policies. - -### Changed - -- `LinearTransfer` now distinguishes between values that may participate in - fitting (`fit_sources`, `fit_max_depth`) and values that may be used for the - final element-specific predictor lookup (`prediction_sources`, - `prediction_max_depth`). -- The default linear-transfer behavior is now conservative for fitting - (direct predictor values only) while still allowing one nested completion - step during final prediction. -- Policy-resolution cycle detection now tracks wrapper-policy identities as - well as generic `ValuePolicy` objects and is stored in a context-local stack - instead of a process-global mutable list. -- Quantity wrappers continue to use the generic policy core, but now route - through wrapper-aware lookup helpers so cycle checks remain effective for - `RadiiPolicy` and `XHPolicy`. - ## 0.1.3 - 2026-03-15 ### Added diff --git a/DEV_PLAN.md b/DEV_PLAN.md index 7252862..94cdaac 100644 --- a/DEV_PLAN.md +++ b/DEV_PLAN.md @@ -1,21 +1,26 @@ # Development plan -## v0.1 +## Current status (implemented in the `0.1.x` line) -- element metadata -- covalent and van der Waals radii sets -- explicit provenance -- radii policies +- stable element metadata +- curated covalent, van der Waals, and atomic-radius support datasets +- explicit provenance and coverage metadata +- generic value-policy core plus radii and X–H convenience wrappers - substitution and linear transfer - custom element-indexed scalar sets +- policy-backed transfer sources +- nested-policy safeguards, transfer-depth tracking, and cycle detection +- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and + `DEFAULT_XH_POLICY` -## v0.2 +## Planned for `0.2.x` -- X-H bond-length datasets +- broader X–H datasets and policies - experimental plus computational support sets +- pairwise helper logic such as reference sums and normalization schemes - restoration of incomplete experimental data from broader-support predictors -## v0.3 +## Longer-term design ideas - radial atomic reference functions - simple proto-density support based on spherically averaged atomic data diff --git a/README.md b/README.md index 52a3e8c..869aace 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ It is not meant to be yet another periodic-table encyclopedia. The package is for code that needs stable atomic reference values with explicit provenance, clear fallback behavior, and honest handling of incomplete preferred datasets. -What you get in the current `0.1.x` line: +What you get in the current release line: - stable element metadata, - curated named radii sets, @@ -43,8 +43,8 @@ What you get in the current `0.1.x` line: The metadata layer already records `domain` explicitly because the package is built for later extension, but the current runtime intentionally keeps the -implementation narrow and stable: **v0.1 resolves only element-domain scalar -values**. +implementation narrow and stable: **the current runtime resolves only +element-domain scalar values**. 
## Why this exists @@ -54,7 +54,7 @@ Instead of hiding ad hoc defaults inside algorithm code, you choose a target set, describe how missing values may be restored, and keep provenance on what was actually returned. -The default `0.1.x` behavior is intentionally simple and practical: +The built-in default behavior is intentionally simple and practical: - **Cordero covalent radii** (`cordero2008`) are the preferred covalent target set, with missing values substituted from the **legacy CSD covalent radii** @@ -65,11 +65,10 @@ The default `0.1.x` behavior is intentionally simple and practical: - **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a provisional sparse X–H target set for `C`, `N`, and `O`, with other parent elements inferred from **Cordero covalent radii** through a fitted linear - policy. + transfer. -Nested policy predictors are supported too. In `0.1.4`, `LinearTransfer` -separates **fit-time** use of nested predictor values from -**prediction-time** use. By default, the fit may use only direct nested +Nested policy predictors are supported too. `LinearTransfer` separates +**fit-time** use of nested predictor values from **prediction-time** use. By default, the fit may use only direct nested values, while the final requested element may still use one additional nested completion step. That is a useful compromise for workflows such as provisional X–H inference from a chosen covalent-radii policy. @@ -125,7 +124,7 @@ You can also load a packaged set directly: ## Notebook walkthroughs -The repository ships example notebooks for the main `0.1.x` workflows. In the +The repository ships example notebooks for the main workflows. In the documentation they are also available as rendered Markdown pages, so users can read them without opening Jupyter first. diff --git a/docs/api/elements.md b/docs/api/elements.md index c4275a0..2f066c7 100644 --- a/docs/api/elements.md +++ b/docs/api/elements.md @@ -1,7 +1,7 @@ # atomref.elements -Element identity is intentionally minimal in v0.1: atomic number, symbol, and -name. The module also contains the canonicalization helpers used throughout the +Element identity is intentionally minimal in the current implementation: +atomic number, symbol, and name. The module also contains the canonicalization helpers used throughout the package. ::: atomref.elements diff --git a/docs/api/radii.md b/docs/api/radii.md index 05617a4..ff5e214 100644 --- a/docs/api/radii.md +++ b/docs/api/radii.md @@ -1,6 +1,6 @@ # atomref.radii -This is the main user-facing module in v0.1. +This is the main user-facing module for radii workflows. It provides radii policies, packaged radii-set discovery, lookup helpers, and policy-assessment reports. diff --git a/docs/api/xh.md b/docs/api/xh.md index cbc1465..f96db27 100644 --- a/docs/api/xh.md +++ b/docs/api/xh.md @@ -1,7 +1,7 @@ # atomref.xh -This module provides the provisional X–H bond-length helpers introduced in the -`0.1.x` line. +This module provides the provisional X–H bond-length helpers available in the +current release line. It is intentionally narrow: diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md index 1704980..2852b3e 100644 --- a/docs/datasets/atomic_radius.md +++ b/docs/datasets/atomic_radius.md @@ -1,6 +1,6 @@ # Atomic radius -The `atomic_radius` quantity exists in v0.1 to hold support datasets that are +The `atomic_radius` quantity exists to hold support datasets that are scientifically useful but should not be presented as direct condensed-phase vdW radii. 
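+
+In practice an atomic-radius set is consumed as a *support predictor* rather
+than returned directly. A minimal sketch, assuming the packaged defaults (an
+`alvarez2013` van der Waals target restored through a linear transfer fitted
+against an atomic-radius support set such as `rahm2016`):
+
+```python
+import atomref as ar
+
+# Pm is assumed to be missing from the packaged vdW target set, so the value
+# is inferred through the fitted linear transfer from the support data.
+result = ar.lookup_vdw_radius("Pm")
+result.source         # 'transfer_linear' when the target set had no value
+result.resolved_from  # points at the support set the fit was built from
+```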
diff --git a/docs/datasets/covalent_radius.md b/docs/datasets/covalent_radius.md index d2e2251..5e022fd 100644 --- a/docs/datasets/covalent_radius.md +++ b/docs/datasets/covalent_radius.md @@ -1,12 +1,12 @@ # Covalent radius -The covalent-radius quantity in v0.1 is aimed at bond-detection and related -geometry workflows. It currently ships one preferred target dataset and one +The covalent-radius quantity is aimed at bond-detection and related geometry +workflows. It currently ships one preferred target dataset and one legacy support dataset. ## Cordero covalent radii (`cordero2008`) -This is the main covalent-radius target set in `atomref` v0.1. +This is the main covalent-radius target set in the current release line. - **What it is:** a broad covalent-radius compilation based mainly on crystallographic bond distances. diff --git a/docs/datasets/index.md b/docs/datasets/index.md index d699ff0..d3b2951 100644 --- a/docs/datasets/index.md +++ b/docs/datasets/index.md @@ -30,7 +30,7 @@ or `list_xh_sets(...)`. If you want the packaged values themselves, use `get_builtin_set(...)`, `get_radii_set(...)`, or `get_xh_set(...)`. -## Built-in quantity families in `0.1.x` +## Built-in quantity families - [Covalent radius](covalent_radius.md) - [van der Waals radius](van_der_waals_radius.md) diff --git a/docs/datasets/van_der_waals_radius.md b/docs/datasets/van_der_waals_radius.md index c678639..3013d57 100644 --- a/docs/datasets/van_der_waals_radius.md +++ b/docs/datasets/van_der_waals_radius.md @@ -1,7 +1,7 @@ # van der Waals radius -The van der Waals quantity in v0.1 intentionally includes several target sets -with different scientific backgrounds. This lets users choose between a classic +The van der Waals quantity intentionally includes several target sets with +different scientific backgrounds. This lets users choose between a classic historical compilation, structural contact-derived sets, and compatibility-only legacy tables. @@ -27,7 +27,7 @@ contacts. ## Alvarez van der Waals radii (`alvarez2013`) -This is the main van der Waals target set in `atomref` v0.1. +This is the main van der Waals target set in the current release line. - **What it is:** a broad structural vdW set derived from statistical analysis of many interatomic distances in the Cambridge Structural Database. diff --git a/docs/datasets/xh_bond_length.md b/docs/datasets/xh_bond_length.md index 2bef656..28364c5 100644 --- a/docs/datasets/xh_bond_length.md +++ b/docs/datasets/xh_bond_length.md @@ -1,7 +1,7 @@ # X–H bond length -The `xh_bond_length` quantity is a small provisional addition in the `0.1.x` -line. +The `xh_bond_length` quantity is a small provisional addition in the current +release line. Its purpose is not to claim a complete literature survey of X–H bond lengths. 
Instead, it provides a stable, provenance-aware starting point for diff --git a/docs/dev/dev_plan.md b/docs/dev/dev_plan.md index 7252862..94cdaac 100644 --- a/docs/dev/dev_plan.md +++ b/docs/dev/dev_plan.md @@ -1,21 +1,26 @@ # Development plan -## v0.1 +## Current status (implemented in the `0.1.x` line) -- element metadata -- covalent and van der Waals radii sets -- explicit provenance -- radii policies +- stable element metadata +- curated covalent, van der Waals, and atomic-radius support datasets +- explicit provenance and coverage metadata +- generic value-policy core plus radii and X–H convenience wrappers - substitution and linear transfer - custom element-indexed scalar sets +- policy-backed transfer sources +- nested-policy safeguards, transfer-depth tracking, and cycle detection +- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and + `DEFAULT_XH_POLICY` -## v0.2 +## Planned for `0.2.x` -- X-H bond-length datasets +- broader X–H datasets and policies - experimental plus computational support sets +- pairwise helper logic such as reference sums and normalization schemes - restoration of incomplete experimental data from broader-support predictors -## v0.3 +## Longer-term design ideas - radial atomic reference functions - simple proto-density support based on spherically averaged atomic data diff --git a/docs/guide/custom_sets.md b/docs/guide/custom_sets.md index ed4d664..71306bb 100644 --- a/docs/guide/custom_sets.md +++ b/docs/guide/custom_sets.md @@ -26,6 +26,6 @@ This is useful when you want to: - combine a user dataset with built-in support data through substitution or linear transfer. -In v0.1 custom sets are element-domain scalar datasets, which keeps the data -model small and stable. Later versions may add more specialized domains, but +In the current implementation custom sets are element-domain scalar datasets, +which keeps the data model small and stable. Later versions may add more specialized domains, but custom element-wise sets are already enough for many geometry workflows. diff --git a/docs/guide/notebooks.md b/docs/guide/notebooks.md index cdd1721..2ad0045 100644 --- a/docs/guide/notebooks.md +++ b/docs/guide/notebooks.md @@ -1,6 +1,6 @@ # Notebook gallery -`atomref` ships example Jupyter notebooks that cover the main v0.1 workflows. +`atomref` ships example Jupyter notebooks that cover the main workflows. Each notebook is available in two forms: - the original `.ipynb` file in the repository, diff --git a/docs/guide/policies.md b/docs/guide/policies.md index 912563b..b9e3b7a 100644 --- a/docs/guide/policies.md +++ b/docs/guide/policies.md @@ -22,7 +22,7 @@ selection logic that sits on top of them. ## Resolution order -In `0.1.x` every lookup follows the same ordered path: +In the current implementation every lookup follows the same ordered path: 1. **Blocked key** (optional) 2. **Override** @@ -58,7 +58,7 @@ default vdW policy starts from the **Alvarez van der Waals radii** A transfer model is used only when the base dataset has no value for the requested element. -Built-in transfer models in `0.1.x` are: +Built-in transfer models are: - `SubstitutionTransfer` — take a value directly from another dataset or policy, - `LinearTransfer` — infer a target-equivalent value from another dataset or diff --git a/docs/index.md b/docs/index.md index 71babb9..198fa6a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,7 +14,7 @@ It is not meant to be yet another periodic-table encyclopedia. 
The package is for code that needs stable atomic reference values with explicit provenance, clear fallback behavior, and honest handling of incomplete preferred datasets. -What you get in the current `0.1.x` line: +What you get in the current release line: - stable element metadata, - curated named radii sets, @@ -43,8 +43,8 @@ What you get in the current `0.1.x` line: The metadata layer already records `domain` explicitly because the package is built for later extension, but the current runtime intentionally keeps the -implementation narrow and stable: **v0.1 resolves only element-domain scalar -values**. +implementation narrow and stable: **the current runtime resolves only +element-domain scalar values**. ## Why this exists @@ -54,7 +54,7 @@ Instead of hiding ad hoc defaults inside algorithm code, you choose a target set, describe how missing values may be restored, and keep provenance on what was actually returned. -The default `0.1.x` behavior is intentionally simple and practical: +The built-in default behavior is intentionally simple and practical: - **Cordero covalent radii** (`cordero2008`) are the preferred covalent target set, with missing values substituted from the **legacy CSD covalent radii** @@ -65,11 +65,10 @@ The default `0.1.x` behavior is intentionally simple and practical: - **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a provisional sparse X–H target set for `C`, `N`, and `O`, with other parent elements inferred from **Cordero covalent radii** through a fitted linear - policy. + transfer. -Nested policy predictors are supported too. In `0.1.4`, `LinearTransfer` -separates **fit-time** use of nested predictor values from -**prediction-time** use. By default, the fit may use only direct nested +Nested policy predictors are supported too. `LinearTransfer` separates +**fit-time** use of nested predictor values from **prediction-time** use. By default, the fit may use only direct nested values, while the final requested element may still use one additional nested completion step. That is a useful compromise for workflows such as provisional X–H inference from a chosen covalent-radii policy. @@ -125,7 +124,7 @@ You can also load a packaged set directly: ## Notebook walkthroughs -The repository ships example notebooks for the main `0.1.x` workflows. In the +The repository ships example notebooks for the main workflows. In the documentation they are also available as rendered Markdown pages, so users can read them without opening Jupyter first. diff --git a/docs/notebooks/01-quickstart.md b/docs/notebooks/01-quickstart.md index 475e218..12e8813 100644 --- a/docs/notebooks/01-quickstart.md +++ b/docs/notebooks/01-quickstart.md @@ -3,7 +3,7 @@ [Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb) # atomref quickstart -This notebook covers the main public API in v0.1: element helpers, direct +This notebook covers the main public API: element helpers, direct `get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset discovery. 
```python diff --git a/notebooks/01-quickstart.ipynb b/notebooks/01-quickstart.ipynb index 6d6d16f..47b58d1 100644 --- a/notebooks/01-quickstart.ipynb +++ b/notebooks/01-quickstart.ipynb @@ -4,11 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# atomref quickstart\n", - "\n", - "This notebook covers the main public API in v0.1: element helpers, direct\n", - "`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset\n", - "discovery.\n" + "# atomref quickstart\n\nThis notebook covers the main public API: element helpers, direct\n`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset\ndiscovery.\n" ] }, { diff --git a/src/atomref/policy.py b/src/atomref/policy.py index a2f922f..79cc9f3 100644 --- a/src/atomref/policy.py +++ b/src/atomref/policy.py @@ -80,7 +80,7 @@ def __float__(self) -> float: class ValuePolicy(Generic[K]): """Ordered rule set for resolving element-domain scalar values. - The v0.1 runtime resolves only element-domain policies even though the + The current runtime resolves only element-domain policies even though the metadata layer already records a more general ``domain`` concept. During construction, element-domain override keys are normalized to canonical element symbols and validated as finite floats. @@ -508,7 +508,9 @@ def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit if not isinstance(transfer, LinearTransfer): return None if len(transfer.predictors) != 1: - raise PolicyError("v0.1 LinearTransfer supports exactly one predictor source") + raise PolicyError( + "LinearTransfer currently supports exactly one predictor source" + ) predictor = transfer.predictors[0] if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef): @@ -577,7 +579,9 @@ def _apply_linear_transfer( """Try to resolve ``symbol`` through linear transfer from predictor data.""" if len(transfer.predictors) != 1: - raise PolicyError("v0.1 LinearTransfer supports exactly one predictor source") + raise PolicyError( + "LinearTransfer currently supports exactly one predictor source" + ) predictor_value, note = _lookup_transfer_source_value( symbol, @@ -659,7 +663,9 @@ def _resolve_value( target = _resolve_target_ref(policy) base_set = resolve_dataset_like(policy.base) if base_set.info.domain != "element": - raise PolicyError("v0.1 resolver supports only element-domain datasets") + raise PolicyError( + "the resolver currently supports only element-domain datasets" + ) sym = _normalize_element_symbol(symbol) if sym is None: @@ -776,7 +782,7 @@ def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResul """Public entry point for generic element-domain scalar lookup. This is the same resolver used internally by the radii convenience layer. - In v0.1 the runtime supports only element-domain policies. + In the current implementation the runtime supports only element-domain policies. 
""" return _lookup_value_with_owner(symbol, policy=policy, owner=None) diff --git a/src/atomref/registry.py b/src/atomref/registry.py index 479ff97..b17b941 100644 --- a/src/atomref/registry.py +++ b/src/atomref/registry.py @@ -576,7 +576,7 @@ def get_builtin_set(ref: DatasetRef) -> ElementScalarSet: info = get_dataset_info(ref) if info.domain != "element": raise DatasetError( - f"only element-domain datasets are supported in v0.1: {info.ref!r}" + f"only element-domain datasets are currently supported: {info.ref!r}" ) if not isinstance(info.storage, Mapping): raise DatasetError(f"missing storage metadata for dataset: {info.ref!r}") diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py index 54eb724..9adb0ce 100644 --- a/src/atomref/transfer.py +++ b/src/atomref/transfer.py @@ -81,9 +81,9 @@ class SubstitutionTransfer: class LinearTransfer: """Infer missing target values from one or more predictor datasets or policies. - In v0.1 the public API stores predictors as a tuple for forward - compatibility, but the runtime implementation intentionally accepts exactly - one predictor source. + In the current implementation the public API stores predictors as a tuple + for forward compatibility, but the runtime intentionally accepts exactly one + predictor source. For nested policy predictors, two safeguards apply: